anolis-glibc/glibc-2.28-Add-macro-defination-of-lasx-lsx-and-fcc-.patch

From d97d963796b092b9c0bd4712f992a08dd20bf5ed Mon Sep 17 00:00:00 2001
From: caiyinyu <caiyinyu@loongson.cn>
Date: Tue, 11 Jul 2023 15:40:15 +0800
Subject: [PATCH 11/14] glibc-2.28: Add macro defination of lasx lsx and fcc
registers.
Change-Id: Ic723521775a0133e25bf1d568c588f930ec5ff49
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
sysdeps/loongarch/dl-trampoline.h | 64 +--
.../loongarch/lp64/multiarch/memchr-lasx.S | 74 +--
sysdeps/loongarch/lp64/multiarch/memchr-lsx.S | 48 +-
.../loongarch/lp64/multiarch/memcmp-lasx.S | 138 +++---
sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S | 194 ++++----
.../loongarch/lp64/multiarch/memmove-lasx.S | 160 +++----
.../loongarch/lp64/multiarch/memmove-lsx.S | 424 +++++++++---------
.../loongarch/lp64/multiarch/memrchr-lasx.S | 74 +--
.../loongarch/lp64/multiarch/memrchr-lsx.S | 48 +-
.../loongarch/lp64/multiarch/memset-lasx.S | 64 +--
sysdeps/loongarch/lp64/multiarch/memset-lsx.S | 62 +--
.../loongarch/lp64/multiarch/rawmemchr-lasx.S | 30 +-
.../loongarch/lp64/multiarch/rawmemchr-lsx.S | 30 +-
sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S | 114 ++---
.../loongarch/lp64/multiarch/strchr-lasx.S | 52 +--
sysdeps/loongarch/lp64/multiarch/strchr-lsx.S | 30 +-
sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S | 114 ++---
sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S | 112 ++---
.../loongarch/lp64/multiarch/strlen-lasx.S | 24 +-
sysdeps/loongarch/lp64/multiarch/strlen-lsx.S | 30 +-
.../loongarch/lp64/multiarch/strncmp-lsx.S | 144 +++---
.../loongarch/lp64/multiarch/strnlen-lasx.S | 46 +-
.../loongarch/lp64/multiarch/strnlen-lsx.S | 30 +-
.../loongarch/lp64/multiarch/strrchr-lasx.S | 88 ++--
.../loongarch/lp64/multiarch/strrchr-lsx.S | 56 +--
sysdeps/loongarch/lp64/s_cosf.S | 4 +-
sysdeps/loongarch/lp64/s_sinf.S | 4 +-
sysdeps/loongarch/sys/regdef.h | 74 +++
28 files changed, 1203 insertions(+), 1129 deletions(-)
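
The hunks below drop the "$" prefix from LASX (xr), LSX (vr) and condition-flag (fcc) register operands, and switch $f0/$f1/... to their ABI aliases fa0/fa1/... (the unchanged FREG_S/FREG_L fa6/fa7 lines in the first hunk show those aliases were already available). The bare vr/xr/fcc names only assemble because of the 74-line addition to sysdeps/loongarch/sys/regdef.h listed in the diffstat but not included in this excerpt. A minimal sketch of what those definitions presumably look like, inferred from the substitutions in the hunks themselves; the real header enumerates all 32 vr and xr registers and all 8 fcc registers:

    #ifdef __ASSEMBLER__
    /* LSX 128-bit vector registers: bare vrN names for $vrN.  */
    # define vr0   $vr0
    # define vr1   $vr1
    /* ... through vr31 ... */
    /* LASX 256-bit vector registers: bare xrN names for $xrN.  */
    # define xr0   $xr0
    # define xr1   $xr1
    /* ... through xr31 ... */
    /* FP condition-flag registers: bare fccN names for $fccN.  */
    # define fcc0  $fcc0
    /* ... through fcc7 ... */
    #endif
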
diff --git a/sysdeps/loongarch/dl-trampoline.h b/sysdeps/loongarch/dl-trampoline.h
index fb15983f..96f41f1d 100644
--- a/sysdeps/loongarch/dl-trampoline.h
+++ b/sysdeps/loongarch/dl-trampoline.h
@@ -61,23 +61,23 @@ ENTRY (_dl_runtime_resolve, 3)
FREG_S fa6, sp, 10*SZREG + 6*SZFREG
FREG_S fa7, sp, 10*SZREG + 7*SZFREG
#ifdef USE_LASX
- xvst $xr0, sp, 10*SZREG + 0*256
- xvst $xr1, sp, 10*SZREG + 1*256
- xvst $xr2, sp, 10*SZREG + 2*256
- xvst $xr3, sp, 10*SZREG + 3*256
- xvst $xr4, sp, 10*SZREG + 4*256
- xvst $xr5, sp, 10*SZREG + 5*256
- xvst $xr6, sp, 10*SZREG + 6*256
- xvst $xr7, sp, 10*SZREG + 7*256
+ xvst xr0, sp, 10*SZREG + 0*256
+ xvst xr1, sp, 10*SZREG + 1*256
+ xvst xr2, sp, 10*SZREG + 2*256
+ xvst xr3, sp, 10*SZREG + 3*256
+ xvst xr4, sp, 10*SZREG + 4*256
+ xvst xr5, sp, 10*SZREG + 5*256
+ xvst xr6, sp, 10*SZREG + 6*256
+ xvst xr7, sp, 10*SZREG + 7*256
#elif defined USE_LSX
- vst $vr0, sp, 10*SZREG + 0*128
- vst $vr1, sp, 10*SZREG + 1*128
- vst $vr2, sp, 10*SZREG + 2*128
- vst $vr3, sp, 10*SZREG + 3*128
- vst $vr4, sp, 10*SZREG + 4*128
- vst $vr5, sp, 10*SZREG + 5*128
- vst $vr6, sp, 10*SZREG + 6*128
- vst $vr7, sp, 10*SZREG + 7*128
+ vst vr0, sp, 10*SZREG + 0*128
+ vst vr1, sp, 10*SZREG + 1*128
+ vst vr2, sp, 10*SZREG + 2*128
+ vst vr3, sp, 10*SZREG + 3*128
+ vst vr4, sp, 10*SZREG + 4*128
+ vst vr5, sp, 10*SZREG + 5*128
+ vst vr6, sp, 10*SZREG + 6*128
+ vst vr7, sp, 10*SZREG + 7*128
#endif
#endif
@@ -119,23 +119,23 @@ ENTRY (_dl_runtime_resolve, 3)
FREG_L fa6, sp, 10*SZREG + 6*SZFREG
FREG_L fa7, sp, 10*SZREG + 7*SZFREG
#ifdef USE_LASX
- xvld $xr0, sp, 10*SZREG + 0*256
- xvld $xr1, sp, 10*SZREG + 1*256
- xvld $xr2, sp, 10*SZREG + 2*256
- xvld $xr3, sp, 10*SZREG + 3*256
- xvld $xr4, sp, 10*SZREG + 4*256
- xvld $xr5, sp, 10*SZREG + 5*256
- xvld $xr6, sp, 10*SZREG + 6*256
- xvld $xr7, sp, 10*SZREG + 7*256
+ xvld xr0, sp, 10*SZREG + 0*256
+ xvld xr1, sp, 10*SZREG + 1*256
+ xvld xr2, sp, 10*SZREG + 2*256
+ xvld xr3, sp, 10*SZREG + 3*256
+ xvld xr4, sp, 10*SZREG + 4*256
+ xvld xr5, sp, 10*SZREG + 5*256
+ xvld xr6, sp, 10*SZREG + 6*256
+ xvld xr7, sp, 10*SZREG + 7*256
#elif defined USE_LSX
- vld $vr0, sp, 10*SZREG + 0*128
- vld $vr1, sp, 10*SZREG + 1*128
- vld $vr2, sp, 10*SZREG + 2*128
- vld $vr3, sp, 10*SZREG + 3*128
- vld $vr4, sp, 10*SZREG + 4*128
- vld $vr5, sp, 10*SZREG + 5*128
- vld $vr6, sp, 10*SZREG + 6*128
- vld $vr7, sp, 10*SZREG + 7*128
+ vld vr0, sp, 10*SZREG + 0*128
+ vld vr1, sp, 10*SZREG + 1*128
+ vld vr2, sp, 10*SZREG + 2*128
+ vld vr3, sp, 10*SZREG + 3*128
+ vld vr4, sp, 10*SZREG + 4*128
+ vld vr5, sp, 10*SZREG + 5*128
+ vld vr6, sp, 10*SZREG + 6*128
+ vld vr7, sp, 10*SZREG + 7*128
#endif
#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S
index 387a35fe..425fcede 100644
--- a/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S
+++ b/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S
@@ -17,28 +17,28 @@ LEAF(MEMCHR, 6)
andi t0, a0, 0x3f
bstrins.d a0, zero, 5, 0
- xvld $xr0, a0, 0
- xvld $xr1, a0, 32
+ xvld xr0, a0, 0
+ xvld xr1, a0, 32
li.d t1, -1
li.d t2, 64
- xvreplgr2vr.b $xr2, a1
+ xvreplgr2vr.b xr2, a1
sll.d t3, t1, t0
sub.d t2, t2, t0
- xvseq.b $xr0, $xr0, $xr2
+ xvseq.b xr0, xr0, xr2
- xvseq.b $xr1, $xr1, $xr2
- xvmsknz.b $xr0, $xr0
- xvmsknz.b $xr1, $xr1
- xvpickve.w $xr3, $xr0, 4
+ xvseq.b xr1, xr1, xr2
+ xvmsknz.b xr0, xr0
+ xvmsknz.b xr1, xr1
+ xvpickve.w xr3, xr0, 4
- xvpickve.w $xr4, $xr1, 4
- vilvl.h $vr0, $vr3, $vr0
- vilvl.h $vr1, $vr4, $vr1
- vilvl.w $vr0, $vr1, $vr0
+ xvpickve.w xr4, xr1, 4
+ vilvl.h vr0, vr3, vr0
+ vilvl.h vr1, vr4, vr1
+ vilvl.w vr0, vr1, vr0
- movfr2gr.d t0, $f0
+ movfr2gr.d t0, fa0
and t0, t0, t3
bgeu t2, a2, L(end)
bnez t0, L(found)
@@ -46,28 +46,28 @@ LEAF(MEMCHR, 6)
addi.d a4, a3, -1
bstrins.d a4, zero, 5, 0
L(loop):
- xvld $xr0, a0, 64
- xvld $xr1, a0, 96
+ xvld xr0, a0, 64
+ xvld xr1, a0, 96
addi.d a0, a0, 64
- xvseq.b $xr0, $xr0, $xr2
- xvseq.b $xr1, $xr1, $xr2
+ xvseq.b xr0, xr0, xr2
+ xvseq.b xr1, xr1, xr2
beq a0, a4, L(out)
- xvmax.bu $xr3, $xr0, $xr1
- xvseteqz.v $fcc0, $xr3
- bcnez $fcc0, L(loop)
- xvmsknz.b $xr0, $xr0
+ xvmax.bu xr3, xr0, xr1
+ xvseteqz.v fcc0, xr3
+ bcnez fcc0, L(loop)
+ xvmsknz.b xr0, xr0
- xvmsknz.b $xr1, $xr1
- xvpickve.w $xr3, $xr0, 4
- xvpickve.w $xr4, $xr1, 4
- vilvl.h $vr0, $vr3, $vr0
+ xvmsknz.b xr1, xr1
+ xvpickve.w xr3, xr0, 4
+ xvpickve.w xr4, xr1, 4
+ vilvl.h vr0, vr3, vr0
- vilvl.h $vr1, $vr4, $vr1
- vilvl.w $vr0, $vr1, $vr0
- movfr2gr.d t0, $f0
+ vilvl.h vr1, vr4, vr1
+ vilvl.w vr0, vr1, vr0
+ movfr2gr.d t0, fa0
L(found):
ctz.d t1, t0
@@ -79,15 +79,15 @@ L(ret0):
L(out):
- xvmsknz.b $xr0, $xr0
- xvmsknz.b $xr1, $xr1
- xvpickve.w $xr3, $xr0, 4
- xvpickve.w $xr4, $xr1, 4
-
- vilvl.h $vr0, $vr3, $vr0
- vilvl.h $vr1, $vr4, $vr1
- vilvl.w $vr0, $vr1, $vr0
- movfr2gr.d t0, $f0
+ xvmsknz.b xr0, xr0
+ xvmsknz.b xr1, xr1
+ xvpickve.w xr3, xr0, 4
+ xvpickve.w xr4, xr1, 4
+
+ vilvl.h vr0, vr3, vr0
+ vilvl.h vr1, vr4, vr1
+ vilvl.w vr0, vr1, vr0
+ movfr2gr.d t0, fa0
L(end):
sub.d t2, zero, a3
diff --git a/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S
index c6952657..08a630d3 100644
--- a/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S
+++ b/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S
@@ -17,23 +17,23 @@ LEAF(MEMCHR, 6)
andi t0, a0, 0x1f
bstrins.d a0, zero, 4, 0
- vld $vr0, a0, 0
- vld $vr1, a0, 16
+ vld vr0, a0, 0
+ vld vr1, a0, 16
li.d t1, -1
li.d t2, 32
- vreplgr2vr.b $vr2, a1
+ vreplgr2vr.b vr2, a1
sll.d t3, t1, t0
sub.d t2, t2, t0
- vseq.b $vr0, $vr0, $vr2
+ vseq.b vr0, vr0, vr2
- vseq.b $vr1, $vr1, $vr2
- vmsknz.b $vr0, $vr0
- vmsknz.b $vr1, $vr1
- vilvl.h $vr0, $vr1, $vr0
+ vseq.b vr1, vr1, vr2
+ vmsknz.b vr0, vr0
+ vmsknz.b vr1, vr1
+ vilvl.h vr0, vr1, vr0
- movfr2gr.s t0, $f0
+ movfr2gr.s t0, fa0
and t0, t0, t3
bgeu t2, a2, L(end)
bnez t0, L(found)
@@ -41,23 +41,23 @@ LEAF(MEMCHR, 6)
addi.d a4, a3, -1
bstrins.d a4, zero, 4, 0
L(loop):
- vld $vr0, a0, 32
- vld $vr1, a0, 48
+ vld vr0, a0, 32
+ vld vr1, a0, 48
addi.d a0, a0, 32
- vseq.b $vr0, $vr0, $vr2
- vseq.b $vr1, $vr1, $vr2
+ vseq.b vr0, vr0, vr2
+ vseq.b vr1, vr1, vr2
beq a0, a4, L(out)
- vmax.bu $vr3, $vr0, $vr1
- vseteqz.v $fcc0, $vr3
- bcnez $fcc0, L(loop)
- vmsknz.b $vr0, $vr0
+ vmax.bu vr3, vr0, vr1
+ vseteqz.v fcc0, vr3
+ bcnez fcc0, L(loop)
+ vmsknz.b vr0, vr0
- vmsknz.b $vr1, $vr1
- vilvl.h $vr0, $vr1, $vr0
- movfr2gr.s t0, $f0
+ vmsknz.b vr1, vr1
+ vilvl.h vr0, vr1, vr0
+ movfr2gr.s t0, fa0
L(found):
ctz.w t0, t0
@@ -68,10 +68,10 @@ L(ret0):
jr ra
L(out):
- vmsknz.b $vr0, $vr0
- vmsknz.b $vr1, $vr1
- vilvl.h $vr0, $vr1, $vr0
- movfr2gr.s t0, $f0
+ vmsknz.b vr0, vr0
+ vmsknz.b vr1, vr1
+ vilvl.h vr0, vr1, vr0
+ movfr2gr.s t0, fa0
L(end):
sub.d t2, zero, a3
diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S b/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S
index 9151d38d..2c192954 100644
--- a/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S
+++ b/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S
@@ -20,39 +20,39 @@ LEAF(MEMCMP, 6)
li.d t1, 160
bgeu a2, t1, L(make_aligned) # a2 >= 160
L(loop32):
- xvld $xr0, a0, 0
- xvld $xr1, a1, 0
+ xvld xr0, a0, 0
+ xvld xr1, a1, 0
addi.d a0, a0, 32
addi.d a1, a1, 32
addi.d a2, a2, -32
- xvseq.b $xr2, $xr0, $xr1
+ xvseq.b xr2, xr0, xr1
- xvsetanyeqz.b $fcc0, $xr2
- bcnez $fcc0, L(end)
+ xvsetanyeqz.b fcc0, xr2
+ bcnez fcc0, L(end)
L(last_bytes):
bltu t2, a2, L(loop32)
- xvld $xr0, a3, -32
+ xvld xr0, a3, -32
- xvld $xr1, a4, -32
- xvseq.b $xr2, $xr0, $xr1
+ xvld xr1, a4, -32
+ xvseq.b xr2, xr0, xr1
L(end):
- xvmsknz.b $xr2, $xr2
- xvpermi.q $xr4, $xr0, 1
+ xvmsknz.b xr2, xr2
+ xvpermi.q xr4, xr0, 1
- xvpickve.w $xr3, $xr2, 4
- xvpermi.q $xr5, $xr1, 1
- vilvl.h $vr2, $vr3, $vr2
- movfr2gr.s t0, $f2
+ xvpickve.w xr3, xr2, 4
+ xvpermi.q xr5, xr1, 1
+ vilvl.h vr2, vr3, vr2
+ movfr2gr.s t0, fa2
cto.w t0, t0
- vreplgr2vr.b $vr2, t0
- vshuf.b $vr0, $vr4, $vr0, $vr2
- vshuf.b $vr1, $vr5, $vr1, $vr2
+ vreplgr2vr.b vr2, t0
+ vshuf.b vr0, vr4, vr0, vr2
+ vshuf.b vr1, vr5, vr1, vr2
- vpickve2gr.bu t0, $vr0, 0
- vpickve2gr.bu t1, $vr1, 0
+ vpickve2gr.bu t0, vr0, 0
+ vpickve2gr.bu t1, vr1, 0
sub.d a0, t0, t1
jr ra
@@ -60,59 +60,59 @@ L(end):
L(less32):
srli.d t0, a2, 4
beqz t0, L(less16)
- vld $vr0, a0, 0
- vld $vr1, a1, 0
+ vld vr0, a0, 0
+ vld vr1, a1, 0
- vld $vr2, a3, -16
- vld $vr3, a4, -16
+ vld vr2, a3, -16
+ vld vr3, a4, -16
L(short_ret):
- vseq.b $vr4, $vr0, $vr1
- vseq.b $vr5, $vr2, $vr3
+ vseq.b vr4, vr0, vr1
+ vseq.b vr5, vr2, vr3
- vmsknz.b $vr4, $vr4
- vmsknz.b $vr5, $vr5
- vilvl.h $vr4, $vr5, $vr4
- movfr2gr.s t0, $f4
+ vmsknz.b vr4, vr4
+ vmsknz.b vr5, vr5
+ vilvl.h vr4, vr5, vr4
+ movfr2gr.s t0, fa4
cto.w t0, t0
- vreplgr2vr.b $vr4, t0
- vshuf.b $vr0, $vr2, $vr0, $vr4
- vshuf.b $vr1, $vr3, $vr1, $vr4
+ vreplgr2vr.b vr4, t0
+ vshuf.b vr0, vr2, vr0, vr4
+ vshuf.b vr1, vr3, vr1, vr4
- vpickve2gr.bu t0, $vr0, 0
- vpickve2gr.bu t1, $vr1, 0
+ vpickve2gr.bu t0, vr0, 0
+ vpickve2gr.bu t1, vr1, 0
sub.d a0, t0, t1
jr ra
L(less16):
srli.d t0, a2, 3
beqz t0, L(less8)
- vldrepl.d $vr0, a0, 0
- vldrepl.d $vr1, a1, 0
+ vldrepl.d vr0, a0, 0
+ vldrepl.d vr1, a1, 0
- vldrepl.d $vr2, a3, -8
- vldrepl.d $vr3, a4, -8
+ vldrepl.d vr2, a3, -8
+ vldrepl.d vr3, a4, -8
b L(short_ret)
L(less8):
srli.d t0, a2, 2
beqz t0, L(less4)
- vldrepl.w $vr0, a0, 0
- vldrepl.w $vr1, a1, 0
- vldrepl.w $vr2, a3, -4
+ vldrepl.w vr0, a0, 0
+ vldrepl.w vr1, a1, 0
+ vldrepl.w vr2, a3, -4
- vldrepl.w $vr3, a4, -4
+ vldrepl.w vr3, a4, -4
b L(short_ret)
L(less4):
srli.d t0, a2, 1
beqz t0, L(less2)
- vldrepl.h $vr0, a0, 0
- vldrepl.h $vr1, a1, 0
- vldrepl.h $vr2, a3, -2
- vldrepl.h $vr3, a4, -2
+ vldrepl.h vr0, a0, 0
+ vldrepl.h vr1, a1, 0
+ vldrepl.h vr2, a3, -2
+ vldrepl.h vr3, a4, -2
b L(short_ret)
L(less2):
@@ -132,12 +132,12 @@ L(ret0):
nop
/* make src1 aligned, and adjust scr2 and length. */
L(make_aligned):
- xvld $xr0, a0, 0
+ xvld xr0, a0, 0
- xvld $xr1, a1, 0
- xvseq.b $xr2, $xr0, $xr1
- xvsetanyeqz.b $fcc0, $xr2
- bcnez $fcc0, L(end)
+ xvld xr1, a1, 0
+ xvseq.b xr2, xr0, xr1
+ xvsetanyeqz.b fcc0, xr2
+ bcnez fcc0, L(end)
andi t0, a0, 0x1f
sub.d t0, t2, t0
@@ -151,17 +151,17 @@ L(make_aligned):
L(loop_align):
- xvld $xr0, a0, 0
- xvld $xr1, a1, 0
- xvld $xr2, a0, 32
- xvld $xr3, a1, 32
+ xvld xr0, a0, 0
+ xvld xr1, a1, 0
+ xvld xr2, a0, 32
+ xvld xr3, a1, 32
- xvseq.b $xr0, $xr0, $xr1
- xvseq.b $xr1, $xr2, $xr3
- xvmin.bu $xr2, $xr1, $xr0
- xvsetanyeqz.b $fcc0, $xr2
+ xvseq.b xr0, xr0, xr1
+ xvseq.b xr1, xr2, xr3
+ xvmin.bu xr2, xr1, xr0
+ xvsetanyeqz.b fcc0, xr2
- bcnez $fcc0, L(pair_end)
+ bcnez fcc0, L(pair_end)
addi.d a0, a0, 64
addi.d a1, a1, 64
bne a0, a5, L(loop_align)
@@ -173,15 +173,15 @@ L(loop_align):
L(pair_end):
- xvmsknz.b $xr0, $xr0
- xvmsknz.b $xr1, $xr1
- xvpickve.w $xr2, $xr0, 4
- xvpickve.w $xr3, $xr1, 4
-
- vilvl.h $vr0, $vr2, $vr0
- vilvl.h $vr1, $vr3, $vr1
- vilvl.w $vr0, $vr1, $vr0
- movfr2gr.d t0, $f0
+ xvmsknz.b xr0, xr0
+ xvmsknz.b xr1, xr1
+ xvpickve.w xr2, xr0, 4
+ xvpickve.w xr3, xr1, 4
+
+ vilvl.h vr0, vr2, vr0
+ vilvl.h vr1, vr3, vr1
+ vilvl.w vr0, vr1, vr0
+ movfr2gr.d t0, fa0
cto.d t0, t0
ldx.bu t1, a0, t0
diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S
index 8535aa22..b407275f 100644
--- a/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S
+++ b/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S
@@ -21,28 +21,28 @@ ENTRY_NO_ALIGN(MEMCMP)
pcaddi t0, -7
andi a3, a0, 0xf
- vld $vr5, t0, 0
+ vld vr5, t0, 0
andi a4, a1, 0xf
bne a3, a4, L(unaligned)
bstrins.d a0, zero, 3, 0
xor a1, a1, a4
- vld $vr0, a0, 0
- vld $vr1, a1, 0
+ vld vr0, a0, 0
+ vld vr1, a1, 0
li.d t0, 16
- vreplgr2vr.b $vr3, a3
+ vreplgr2vr.b vr3, a3
sub.d t1, t0, a3
- vadd.b $vr3, $vr3, $vr5
+ vadd.b vr3, vr3, vr5
- vshuf.b $vr0, $vr3, $vr0, $vr3
- vshuf.b $vr1, $vr3, $vr1, $vr3
- vseq.b $vr4, $vr0, $vr1
+ vshuf.b vr0, vr3, vr0, vr3
+ vshuf.b vr1, vr3, vr1, vr3
+ vseq.b vr4, vr0, vr1
bgeu t1, a2, L(al_end)
- vsetanyeqz.b $fcc0, $vr4
- bcnez $fcc0, L(al_found)
+ vsetanyeqz.b fcc0, vr4
+ bcnez fcc0, L(al_found)
sub.d a2, a2, t1
andi t1, a2, 31
@@ -53,70 +53,70 @@ ENTRY_NO_ALIGN(MEMCMP)
L(al_loop):
- vld $vr0, a0, 16
- vld $vr1, a1, 16
- vld $vr2, a0, 32
- vld $vr3, a1, 32
+ vld vr0, a0, 16
+ vld vr1, a1, 16
+ vld vr2, a0, 32
+ vld vr3, a1, 32
addi.d a0, a0, 32
addi.d a1, a1, 32
- vseq.b $vr4, $vr0, $vr1
- vseq.b $vr6, $vr2, $vr3
+ vseq.b vr4, vr0, vr1
+ vseq.b vr6, vr2, vr3
- vand.v $vr6, $vr4, $vr6
- vsetanyeqz.b $fcc0, $vr6
- bcnez $fcc0, L(al_pair_end)
+ vand.v vr6, vr4, vr6
+ vsetanyeqz.b fcc0, vr6
+ bcnez fcc0, L(al_pair_end)
bne a0, a4, L(al_loop)
L(al_less_32bytes):
bgeu t0, a2, L(al_less_16bytes)
- vld $vr0, a0, 16
- vld $vr1, a1, 16
- vld $vr2, a0, 32
+ vld vr0, a0, 16
+ vld vr1, a1, 16
+ vld vr2, a0, 32
- vld $vr3, a1, 32
+ vld vr3, a1, 32
addi.d a2, a2, -16
- vreplgr2vr.b $vr6, a2
- vslt.b $vr5, $vr5, $vr6
+ vreplgr2vr.b vr6, a2
+ vslt.b vr5, vr5, vr6
- vseq.b $vr4, $vr0, $vr1
- vseq.b $vr6, $vr2, $vr3
- vorn.v $vr6, $vr6, $vr5
+ vseq.b vr4, vr0, vr1
+ vseq.b vr6, vr2, vr3
+ vorn.v vr6, vr6, vr5
L(al_pair_end):
- vsetanyeqz.b $fcc0, $vr4
+ vsetanyeqz.b fcc0, vr4
- bcnez $fcc0, L(al_found)
- vnori.b $vr4, $vr6, 0
- vfrstpi.b $vr4, $vr4, 0
- vshuf.b $vr0, $vr2, $vr2, $vr4
+ bcnez fcc0, L(al_found)
+ vnori.b vr4, vr6, 0
+ vfrstpi.b vr4, vr4, 0
+ vshuf.b vr0, vr2, vr2, vr4
- vshuf.b $vr1, $vr3, $vr3, $vr4
- vpickve2gr.bu t0, $vr0, 0
- vpickve2gr.bu t1, $vr1, 0
+ vshuf.b vr1, vr3, vr3, vr4
+ vpickve2gr.bu t0, vr0, 0
+ vpickve2gr.bu t1, vr1, 0
sub.d a0, t0, t1
jr ra
L(al_less_16bytes):
beqz a2, L(out)
- vld $vr0, a0, 16
- vld $vr1, a1, 16
+ vld vr0, a0, 16
+ vld vr1, a1, 16
- vseq.b $vr4, $vr0, $vr1
+ vseq.b vr4, vr0, vr1
L(al_end):
- vreplgr2vr.b $vr6, a2
- vslt.b $vr5, $vr5, $vr6
- vorn.v $vr4, $vr4, $vr5
+ vreplgr2vr.b vr6, a2
+ vslt.b vr5, vr5, vr6
+ vorn.v vr4, vr4, vr5
L(al_found):
- vnori.b $vr4, $vr4, 0
- vfrstpi.b $vr4, $vr4, 0
- vshuf.b $vr0, $vr0, $vr0, $vr4
- vshuf.b $vr1, $vr1, $vr1, $vr4
+ vnori.b vr4, vr4, 0
+ vfrstpi.b vr4, vr4, 0
+ vshuf.b vr0, vr0, vr0, vr4
+ vshuf.b vr1, vr1, vr1, vr4
- vpickve2gr.bu t0, $vr0, 0
- vpickve2gr.bu t1, $vr1, 0
+ vpickve2gr.bu t0, vr0, 0
+ vpickve2gr.bu t1, vr1, 0
sub.d a0, t0, t1
jr ra
@@ -133,28 +133,28 @@ L(unaligned):
bstrins.d a0, zero, 3, 0
xor a1, a1, a4
- vld $vr4, a0, 0
- vld $vr1, a1, 0
+ vld vr4, a0, 0
+ vld vr1, a1, 0
li.d t0, 16
- vreplgr2vr.b $vr2, a4
+ vreplgr2vr.b vr2, a4
sub.d a6, a4, a3 # a6 hold the diff
sub.d t1, t0, a4
sub.d t2, t0, a6
- vadd.b $vr2, $vr2, $vr5 # [4, 5, 6, ...]
- vreplgr2vr.b $vr6, t2
- vadd.b $vr6, $vr6, $vr5 # [14, 15, 16, ... ]
- vshuf.b $vr0, $vr4, $vr4, $vr6 # make data be in the same position
+ vadd.b vr2, vr2, vr5 # [4, 5, 6, ...]
+ vreplgr2vr.b vr6, t2
+ vadd.b vr6, vr6, vr5 # [14, 15, 16, ... ]
+ vshuf.b vr0, vr4, vr4, vr6 # make data be in the same position
- vshuf.b $vr1, $vr2, $vr1, $vr2
- vshuf.b $vr0, $vr2, $vr0, $vr2
- vseq.b $vr7, $vr0, $vr1
+ vshuf.b vr1, vr2, vr1, vr2
+ vshuf.b vr0, vr2, vr0, vr2
+ vseq.b vr7, vr0, vr1
bgeu t1, a2, L(un_end)
- vsetanyeqz.b $fcc0, $vr7
- bcnez $fcc0, L(un_found)
+ vsetanyeqz.b fcc0, vr7
+ bcnez fcc0, L(un_found)
sub.d a2, a2, t1
andi t1, a2, 31
@@ -165,63 +165,63 @@ L(unaligned):
L(un_loop):
- vld $vr2, a0, 16
- vld $vr1, a1, 16
- vld $vr3, a1, 32
+ vld vr2, a0, 16
+ vld vr1, a1, 16
+ vld vr3, a1, 32
addi.d a1, a1, 32
addi.d a0, a0, 32
- vshuf.b $vr0, $vr2, $vr4, $vr6
- vld $vr4, a0, 0
- vseq.b $vr7, $vr0, $vr1
+ vshuf.b vr0, vr2, vr4, vr6
+ vld vr4, a0, 0
+ vseq.b vr7, vr0, vr1
- vshuf.b $vr2, $vr4, $vr2, $vr6
- vseq.b $vr8, $vr2, $vr3
- vand.v $vr8, $vr7, $vr8
- vsetanyeqz.b $fcc0, $vr8
+ vshuf.b vr2, vr4, vr2, vr6
+ vseq.b vr8, vr2, vr3
+ vand.v vr8, vr7, vr8
+ vsetanyeqz.b fcc0, vr8
- bcnez $fcc0, L(un_pair_end)
+ bcnez fcc0, L(un_pair_end)
bne a1, a4, L(un_loop)
L(un_less_32bytes):
bltu a2, t0, L(un_less_16bytes)
- vld $vr2, a0, 16
+ vld vr2, a0, 16
- vld $vr1, a1, 16
+ vld vr1, a1, 16
addi.d a0, a0, 16
addi.d a1, a1, 16
addi.d a2, a2, -16
- vshuf.b $vr0, $vr2, $vr4, $vr6
- vor.v $vr4, $vr2, $vr2
- vseq.b $vr7, $vr0, $vr1
- vsetanyeqz.b $fcc0, $vr7
+ vshuf.b vr0, vr2, vr4, vr6
+ vor.v vr4, vr2, vr2
+ vseq.b vr7, vr0, vr1
+ vsetanyeqz.b fcc0, vr7
- bcnez $fcc0, L(un_found)
+ bcnez fcc0, L(un_found)
L(un_less_16bytes):
beqz a2, L(out)
- vld $vr1, a1, 16
+ vld vr1, a1, 16
bgeu a6, a2, 1f
- vld $vr2, a0, 16
+ vld vr2, a0, 16
1:
- vshuf.b $vr0, $vr2, $vr4, $vr6
- vseq.b $vr7, $vr0, $vr1
+ vshuf.b vr0, vr2, vr4, vr6
+ vseq.b vr7, vr0, vr1
L(un_end):
- vreplgr2vr.b $vr3, a2
+ vreplgr2vr.b vr3, a2
- vslt.b $vr3, $vr5, $vr3
- vorn.v $vr7, $vr7, $vr3
+ vslt.b vr3, vr5, vr3
+ vorn.v vr7, vr7, vr3
L(un_found):
- vnori.b $vr7, $vr7, 0
- vfrstpi.b $vr7, $vr7, 0
+ vnori.b vr7, vr7, 0
+ vfrstpi.b vr7, vr7, 0
- vshuf.b $vr0, $vr0, $vr0, $vr7
- vshuf.b $vr1, $vr1, $vr1, $vr7
+ vshuf.b vr0, vr0, vr0, vr7
+ vshuf.b vr1, vr1, vr1, vr7
L(calc_result):
- vpickve2gr.bu t0, $vr0, 0
- vpickve2gr.bu t1, $vr1, 0
+ vpickve2gr.bu t0, vr0, 0
+ vpickve2gr.bu t1, vr1, 0
sub.d t2, t0, t1
sub.d t3, t1, t0
@@ -231,14 +231,14 @@ L(calc_result):
or a0, t0, t1
jr ra
L(un_pair_end):
- vsetanyeqz.b $fcc0, $vr7
- bcnez $fcc0, L(un_found)
+ vsetanyeqz.b fcc0, vr7
+ bcnez fcc0, L(un_found)
- vnori.b $vr7, $vr8, 0
- vfrstpi.b $vr7, $vr7, 0
- vshuf.b $vr0, $vr2, $vr2, $vr7
- vshuf.b $vr1, $vr3, $vr3, $vr7
+ vnori.b vr7, vr8, 0
+ vfrstpi.b vr7, vr7, 0
+ vshuf.b vr0, vr2, vr2, vr7
+ vshuf.b vr1, vr3, vr3, vr7
b L(calc_result)
L(out):
diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S b/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S
index e8b2c441..c317592f 100644
--- a/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S
+++ b/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S
@@ -26,22 +26,22 @@ LEAF(MEMCPY_NAME, 6)
li.d t1, 64
bltu t1, a2, L(copy_long) # a2 > 64
- xvld $xr0, a1, 0
- xvld $xr1, a4, -32
+ xvld xr0, a1, 0
+ xvld xr1, a4, -32
- xvst $xr0, a0, 0
- xvst $xr1, a3, -32
+ xvst xr0, a0, 0
+ xvst xr1, a3, -32
jr ra
L(less_32bytes):
srli.d t0, a2, 4
beqz t0, L(less_16bytes)
- vld $vr0, a1, 0
- vld $vr1, a4, -16
- vst $vr0, a0, 0
+ vld vr0, a1, 0
+ vld vr1, a4, -16
+ vst vr0, a0, 0
- vst $vr1, a3, -16
+ vst vr1, a3, -16
jr ra
L(less_16bytes):
srli.d t0, a2, 3
@@ -91,11 +91,11 @@ LEAF(MEMMOVE_NAME, 6)
li.d t1, 64
bltu t1, a2, L(move_long) # a2 > 64
- xvld $xr0, a1, 0
- xvld $xr1, a4, -32
+ xvld xr0, a1, 0
+ xvld xr1, a4, -32
- xvst $xr0, a0, 0
- xvst $xr1, a3, -32
+ xvst xr0, a0, 0
+ xvst xr1, a3, -32
jr ra
L(move_long):
sub.d t2, a0, a1
@@ -107,8 +107,8 @@ L(copy_long):
sub.d t2, t0, t2
- xvld $xr8, a1, 0
- xvld $xr9, a4, -32
+ xvld xr8, a1, 0
+ xvld xr9, a4, -32
sub.d t3, a2, t2
add.d a5, a0, t2
@@ -119,69 +119,69 @@ L(copy_long):
addi.d a6, a6, -1
L(loop_256):
- xvld $xr0, a1, 0
- xvld $xr1, a1, 32
- xvld $xr2, a1, 64
+ xvld xr0, a1, 0
+ xvld xr1, a1, 32
+ xvld xr2, a1, 64
- xvld $xr3, a1, 96
- xvld $xr4, a1, 128
- xvld $xr5, a1, 160
- xvld $xr6, a1, 192
+ xvld xr3, a1, 96
+ xvld xr4, a1, 128
+ xvld xr5, a1, 160
+ xvld xr6, a1, 192
- xvld $xr7, a1, 224
+ xvld xr7, a1, 224
addi.d a1, a1, 256
- xvst $xr0, a5, 0
- xvst $xr1, a5, 32
+ xvst xr0, a5, 0
+ xvst xr1, a5, 32
- xvst $xr2, a5, 64
- xvst $xr3, a5, 96
- xvst $xr4, a5, 128
- xvst $xr5, a5, 160
+ xvst xr2, a5, 64
+ xvst xr3, a5, 96
+ xvst xr4, a5, 128
+ xvst xr5, a5, 160
- xvst $xr6, a5, 192
- xvst $xr7, a5, 224
+ xvst xr6, a5, 192
+ xvst xr7, a5, 224
addi.d a5, a5, 256
bne a1, a6, L(loop_256)
L(lt256):
srli.d t2, a2, 7
beqz t2, L(lt128)
- xvld $xr0, a1, 0
- xvld $xr1, a1, 32
+ xvld xr0, a1, 0
+ xvld xr1, a1, 32
- xvld $xr2, a1, 64
- xvld $xr3, a1, 96
+ xvld xr2, a1, 64
+ xvld xr3, a1, 96
addi.d a1, a1, 128
addi.d a2, a2, -128
- xvst $xr0, a5, 0
- xvst $xr1, a5, 32
- xvst $xr2, a5, 64
- xvst $xr3, a5, 96
+ xvst xr0, a5, 0
+ xvst xr1, a5, 32
+ xvst xr2, a5, 64
+ xvst xr3, a5, 96
addi.d a5, a5, 128
L(lt128):
bltu a2, t1, L(lt64)
- xvld $xr0, a1, 0
- xvld $xr1, a1, 32
+ xvld xr0, a1, 0
+ xvld xr1, a1, 32
addi.d a1, a1, 64
addi.d a2, a2, -64
- xvst $xr0, a5, 0
- xvst $xr1, a5, 32
+ xvst xr0, a5, 0
+ xvst xr1, a5, 32
addi.d a5, a5, 64
L(lt64):
bltu a2, t0, L(lt32)
- xvld $xr0, a1, 0
- xvst $xr0, a5, 0
+ xvld xr0, a1, 0
+ xvst xr0, a5, 0
L(lt32):
- xvst $xr8, a0, 0
- xvst $xr9, a3, -32
+ xvst xr8, a0, 0
+ xvst xr9, a3, -32
jr ra
nop
@@ -189,9 +189,9 @@ L(copy_back):
addi.d a3, a3, -1
addi.d a2, a2, -2
andi t2, a3, 0x1f
- xvld $xr8, a1, 0
+ xvld xr8, a1, 0
- xvld $xr9, a4, -32
+ xvld xr9, a4, -32
sub.d t3, a2, t2
sub.d a5, a3, t2
sub.d a4, a4, t2
@@ -203,69 +203,69 @@ L(copy_back):
addi.d a6, a6, 2
L(back_loop_256):
- xvld $xr0, a4, -33
- xvld $xr1, a4, -65
- xvld $xr2, a4, -97
- xvld $xr3, a4, -129
+ xvld xr0, a4, -33
+ xvld xr1, a4, -65
+ xvld xr2, a4, -97
+ xvld xr3, a4, -129
- xvld $xr4, a4, -161
- xvld $xr5, a4, -193
- xvld $xr6, a4, -225
- xvld $xr7, a4, -257
+ xvld xr4, a4, -161
+ xvld xr5, a4, -193
+ xvld xr6, a4, -225
+ xvld xr7, a4, -257
addi.d a4, a4, -256
- xvst $xr0, a5, -32
- xvst $xr1, a5, -64
- xvst $xr2, a5, -96
+ xvst xr0, a5, -32
+ xvst xr1, a5, -64
+ xvst xr2, a5, -96
- xvst $xr3, a5, -128
- xvst $xr4, a5, -160
- xvst $xr5, a5, -192
- xvst $xr6, a5, -224
+ xvst xr3, a5, -128
+ xvst xr4, a5, -160
+ xvst xr5, a5, -192
+ xvst xr6, a5, -224
- xvst $xr7, a5, -256
+ xvst xr7, a5, -256
addi.d a5, a5, -256
bne a4, a6, L(back_loop_256)
L(back_lt256):
srli.d t2, a2, 7
beqz t2, L(back_lt128)
- xvld $xr0, a4, -33
- xvld $xr1, a4, -65
- xvld $xr2, a4, -97
+ xvld xr0, a4, -33
+ xvld xr1, a4, -65
+ xvld xr2, a4, -97
- xvld $xr3, a4, -129
+ xvld xr3, a4, -129
addi.d a2, a2, -128
addi.d a4, a4, -128
- xvst $xr0, a5, -32
+ xvst xr0, a5, -32
- xvst $xr1, a5, -64
- xvst $xr2, a5, -96
- xvst $xr3, a5, -128
+ xvst xr1, a5, -64
+ xvst xr2, a5, -96
+ xvst xr3, a5, -128
addi.d a5, a5, -128
L(back_lt128):
blt a2, t1, L(back_lt64)
- xvld $xr0, a4, -33
- xvld $xr1, a4, -65
+ xvld xr0, a4, -33
+ xvld xr1, a4, -65
addi.d a2, a2, -64
addi.d a4, a4, -64
- xvst $xr0, a5, -32
- xvst $xr1, a5, -64
+ xvst xr0, a5, -32
+ xvst xr1, a5, -64
addi.d a5, a5, -64
L(back_lt64):
bltu a2, t0, L(back_lt32)
- xvld $xr0, a4, -33
- xvst $xr0, a5, -32
+ xvld xr0, a4, -33
+ xvst xr0, a5, -32
L(back_lt32):
- xvst $xr8, a0, 0
+ xvst xr8, a0, 0
- xvst $xr9, a3, -31
+ xvst xr9, a3, -31
jr ra
END(MEMMOVE_NAME)
diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S b/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S
index 90f89c7a..77f1b4ab 100644
--- a/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S
+++ b/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S
@@ -23,54 +23,54 @@ LEAF(MEMCPY_NAME, 6)
bltu t8, a2, L(copy_long) # a2 > 64
bltu t7, a2, L(more_32bytes) # a2 > 32
- vld $vr0, a1, 0
- vld $vr1, a4, -16
- vst $vr0, a0, 0
- vst $vr1, a3, -16
+ vld vr0, a1, 0
+ vld vr1, a4, -16
+ vst vr0, a0, 0
+ vst vr1, a3, -16
jr ra
L(more_32bytes):
- vld $vr0, a1, 0
- vld $vr1, a1, 16
- vld $vr2, a4, -32
+ vld vr0, a1, 0
+ vld vr1, a1, 16
+ vld vr2, a4, -32
- vld $vr3, a4, -16
- vst $vr0, a0, 0
- vst $vr1, a0, 16
- vst $vr2, a3, -32
+ vld vr3, a4, -16
+ vst vr0, a0, 0
+ vst vr1, a0, 16
+ vst vr2, a3, -32
- vst $vr3, a3, -16
+ vst vr3, a3, -16
jr ra
L(less_16bytes):
srli.d t0, a2, 3
beqz t0, L(less_8bytes)
- vldrepl.d $vr0, a1, 0
- vldrepl.d $vr1, a4, -8
- vstelm.d $vr0, a0, 0, 0
- vstelm.d $vr1, a3, -8, 0
+ vldrepl.d vr0, a1, 0
+ vldrepl.d vr1, a4, -8
+ vstelm.d vr0, a0, 0, 0
+ vstelm.d vr1, a3, -8, 0
jr ra
L(less_8bytes):
srli.d t0, a2, 2
beqz t0, L(less_4bytes)
- vldrepl.w $vr0, a1, 0
+ vldrepl.w vr0, a1, 0
- vldrepl.w $vr1, a4, -4
- vstelm.w $vr0, a0, 0, 0
- vstelm.w $vr1, a3, -4, 0
+ vldrepl.w vr1, a4, -4
+ vstelm.w vr0, a0, 0, 0
+ vstelm.w vr1, a3, -4, 0
jr ra
L(less_4bytes):
srli.d t0, a2, 1
beqz t0, L(less_2bytes)
- vldrepl.h $vr0, a1, 0
- vldrepl.h $vr1, a4, -2
+ vldrepl.h vr0, a1, 0
+ vldrepl.h vr1, a4, -2
- vstelm.h $vr0, a0, 0, 0
- vstelm.h $vr1, a3, -2, 0
+ vstelm.h vr0, a0, 0, 0
+ vstelm.h vr1, a3, -2, 0
jr ra
L(less_2bytes):
beqz a2, L(less_1bytes)
@@ -93,10 +93,10 @@ LEAF(MEMMOVE_NAME, 6)
bltu t8, a2, L(move_long) # a2 > 64
bltu t7, a2, L(more_32bytes) # a2 > 32
- vld $vr0, a1, 0
- vld $vr1, a4, -16
- vst $vr0, a0, 0
- vst $vr1, a3, -16
+ vld vr0, a1, 0
+ vld vr1, a4, -16
+ vst vr0, a0, 0
+ vst vr1, a3, -16
jr ra
nop
@@ -106,7 +106,7 @@ L(move_long):
L(copy_long):
- vld $vr2, a1, 0
+ vld vr2, a1, 0
andi t0, a0, 0xf
sub.d t0, t6, t0
add.d a1, a1, t0
@@ -114,10 +114,10 @@ L(copy_long):
sub.d a2, a2, t0
andi t1, a1, 0xf
bnez t1, L(unaligned)
- vld $vr0, a1, 0
+ vld vr0, a1, 0
addi.d a2, a2, -16
- vst $vr2, a0, 0
+ vst vr2, a0, 0
andi t2, a2, 0x7f
add.d a5, a0, t0
@@ -128,69 +128,69 @@ L(copy_long):
L(al_loop):
- vld $vr1, a1, 16
- vld $vr2, a1, 32
- vld $vr3, a1, 48
- vld $vr4, a1, 64
+ vld vr1, a1, 16
+ vld vr2, a1, 32
+ vld vr3, a1, 48
+ vld vr4, a1, 64
- vld $vr5, a1, 80
- vld $vr6, a1, 96
- vld $vr7, a1, 112
- vst $vr0, a5, 0
+ vld vr5, a1, 80
+ vld vr6, a1, 96
+ vld vr7, a1, 112
+ vst vr0, a5, 0
- vld $vr0, a1, 128
+ vld vr0, a1, 128
addi.d a1, a1, 128
- vst $vr1, a5, 16
- vst $vr2, a5, 32
+ vst vr1, a5, 16
+ vst vr2, a5, 32
- vst $vr3, a5, 48
- vst $vr4, a5, 64
- vst $vr5, a5, 80
- vst $vr6, a5, 96
+ vst vr3, a5, 48
+ vst vr4, a5, 64
+ vst vr5, a5, 80
+ vst vr6, a5, 96
- vst $vr7, a5, 112
+ vst vr7, a5, 112
addi.d a5, a5, 128
bne a1, a6, L(al_loop)
L(al_less_128):
blt a2, t8, L(al_less_64)
- vld $vr1, a1, 16
- vld $vr2, a1, 32
- vld $vr3, a1, 48
+ vld vr1, a1, 16
+ vld vr2, a1, 32
+ vld vr3, a1, 48
addi.d a2, a2, -64
- vst $vr0, a5, 0
- vld $vr0, a1, 64
+ vst vr0, a5, 0
+ vld vr0, a1, 64
addi.d a1, a1, 64
- vst $vr1, a5, 16
+ vst vr1, a5, 16
- vst $vr2, a5, 32
- vst $vr3, a5, 48
+ vst vr2, a5, 32
+ vst vr3, a5, 48
addi.d a5, a5, 64
L(al_less_64):
blt a2, t7, L(al_less_32)
- vld $vr1, a1, 16
+ vld vr1, a1, 16
addi.d a2, a2, -32
- vst $vr0, a5, 0
- vld $vr0, a1, 32
+ vst vr0, a5, 0
+ vld vr0, a1, 32
addi.d a1, a1, 32
- vst $vr1, a5, 16
+ vst vr1, a5, 16
addi.d a5, a5, 32
L(al_less_32):
blt a2, t6, L(al_less_16)
- vst $vr0, a5, 0
- vld $vr0, a1, 16
+ vst vr0, a5, 0
+ vld vr0, a1, 16
addi.d a5, a5, 16
L(al_less_16):
- vld $vr1, a4, -16
+ vld vr1, a4, -16
- vst $vr0, a5, 0
- vst $vr1, a3, -16
+ vst vr0, a5, 0
+ vst vr1, a3, -16
jr ra
nop
@@ -201,17 +201,17 @@ L(magic_num):
L(unaligned):
pcaddi t2, -4
bstrins.d a1, zero, 3, 0
- vld $vr8, t2, 0
- vld $vr0, a1, 0
+ vld vr8, t2, 0
+ vld vr0, a1, 0
- vld $vr1, a1, 16
+ vld vr1, a1, 16
addi.d a2, a2, -16
- vst $vr2, a0, 0
+ vst vr2, a0, 0
add.d a5, a0, t0
- vreplgr2vr.b $vr9, t1
+ vreplgr2vr.b vr9, t1
andi t2, a2, 0x7f
- vadd.b $vr9, $vr9, $vr8
+ vadd.b vr9, vr9, vr8
addi.d a1, a1, 32
@@ -221,97 +221,97 @@ L(unaligned):
add.d a6, a1, t3
L(un_loop):
- vld $vr2, a1, 0
- vld $vr3, a1, 16
- vld $vr4, a1, 32
- vld $vr5, a1, 48
+ vld vr2, a1, 0
+ vld vr3, a1, 16
+ vld vr4, a1, 32
+ vld vr5, a1, 48
- vld $vr6, a1, 64
- vld $vr7, a1, 80
- vshuf.b $vr8, $vr1, $vr0, $vr9
- vld $vr0, a1, 96
+ vld vr6, a1, 64
+ vld vr7, a1, 80
+ vshuf.b vr8, vr1, vr0, vr9
+ vld vr0, a1, 96
- vst $vr8, a5, 0
- vshuf.b $vr8, $vr2, $vr1, $vr9
- vld $vr1, a1, 112
- vst $vr8, a5, 16
+ vst vr8, a5, 0
+ vshuf.b vr8, vr2, vr1, vr9
+ vld vr1, a1, 112
+ vst vr8, a5, 16
addi.d a1, a1, 128
- vshuf.b $vr2, $vr3, $vr2, $vr9
- vshuf.b $vr3, $vr4, $vr3, $vr9
- vst $vr2, a5, 32
+ vshuf.b vr2, vr3, vr2, vr9
+ vshuf.b vr3, vr4, vr3, vr9
+ vst vr2, a5, 32
- vshuf.b $vr4, $vr5, $vr4, $vr9
- vst $vr3, a5, 48
- vshuf.b $vr5, $vr6, $vr5, $vr9
- vst $vr4, a5, 64
+ vshuf.b vr4, vr5, vr4, vr9
+ vst vr3, a5, 48
+ vshuf.b vr5, vr6, vr5, vr9
+ vst vr4, a5, 64
- vshuf.b $vr6, $vr7, $vr6, $vr9
- vst $vr5, a5, 80
- vshuf.b $vr7, $vr0, $vr7, $vr9
- vst $vr6, a5, 96
+ vshuf.b vr6, vr7, vr6, vr9
+ vst vr5, a5, 80
+ vshuf.b vr7, vr0, vr7, vr9
+ vst vr6, a5, 96
- vst $vr7, a5, 112
+ vst vr7, a5, 112
addi.d a5, a5, 128
bne a1, a6, L(un_loop)
L(un_less_128):
blt a2, t8, L(un_less_64)
- vld $vr2, a1, 0
- vld $vr3, a1, 16
- vshuf.b $vr4, $vr1, $vr0, $vr9
- vld $vr0, a1, 32
+ vld vr2, a1, 0
+ vld vr3, a1, 16
+ vshuf.b vr4, vr1, vr0, vr9
+ vld vr0, a1, 32
- vst $vr4, a5, 0
+ vst vr4, a5, 0
addi.d a2, a2, -64
- vshuf.b $vr4, $vr2, $vr1, $vr9
- vld $vr1, a1, 48
+ vshuf.b vr4, vr2, vr1, vr9
+ vld vr1, a1, 48
addi.d a1, a1, 64
- vst $vr4, a5, 16
- vshuf.b $vr2, $vr3, $vr2, $vr9
- vshuf.b $vr3, $vr0, $vr3, $vr9
+ vst vr4, a5, 16
+ vshuf.b vr2, vr3, vr2, vr9
+ vshuf.b vr3, vr0, vr3, vr9
- vst $vr2, a5, 32
- vst $vr3, a5, 48
+ vst vr2, a5, 32
+ vst vr3, a5, 48
addi.d a5, a5, 64
L(un_less_64):
blt a2, t7, L(un_less_32)
- vshuf.b $vr3, $vr1, $vr0, $vr9
- vld $vr0, a1, 0
- vst $vr3, a5, 0
+ vshuf.b vr3, vr1, vr0, vr9
+ vld vr0, a1, 0
+ vst vr3, a5, 0
addi.d a2, a2, -32
- vshuf.b $vr3, $vr0, $vr1, $vr9
- vld $vr1, a1, 16
+ vshuf.b vr3, vr0, vr1, vr9
+ vld vr1, a1, 16
addi.d a1, a1, 32
- vst $vr3, a5, 16
+ vst vr3, a5, 16
addi.d a5, a5, 32
L(un_less_32):
blt a2, t6, L(un_less_16)
- vshuf.b $vr2, $vr1, $vr0, $vr9
- vor.v $vr0, $vr1, $vr1
+ vshuf.b vr2, vr1, vr0, vr9
+ vor.v vr0, vr1, vr1
- vld $vr1, a1, 0
- vst $vr2, a5, 0
+ vld vr1, a1, 0
+ vst vr2, a5, 0
addi.d a5, a5, 16
L(un_less_16):
- vld $vr2, a4, -16
+ vld vr2, a4, -16
- vshuf.b $vr0, $vr1, $vr0, $vr9
- vst $vr0, a5, 0
- vst $vr2, a3, -16
+ vshuf.b vr0, vr1, vr0, vr9
+ vst vr0, a5, 0
+ vst vr2, a3, -16
jr ra
L(copy_back):
addi.d t0, a3, -1
- vld $vr2, a4, -16
+ vld vr2, a4, -16
andi t0, t0, 0xf
addi.d t0, t0, 1 # in case a3 is already aligned, load 16bytes and store 16bytes
@@ -320,9 +320,9 @@ L(copy_back):
andi t1, a4, 0xf
bnez t1, L(back_unaligned)
- vld $vr0, a4, -16
+ vld vr0, a4, -16
addi.d a2, a2, -16
- vst $vr2, a3, -16
+ vst vr2, a3, -16
andi t2, a2, 0x7f
@@ -333,70 +333,70 @@ L(copy_back):
sub.d a6, a4, t3
L(back_al_loop):
- vld $vr1, a4, -32
- vld $vr2, a4, -48
- vld $vr3, a4, -64
+ vld vr1, a4, -32
+ vld vr2, a4, -48
+ vld vr3, a4, -64
- vld $vr4, a4, -80
- vld $vr5, a4, -96
- vld $vr6, a4, -112
- vld $vr7, a4, -128
+ vld vr4, a4, -80
+ vld vr5, a4, -96
+ vld vr6, a4, -112
+ vld vr7, a4, -128
- vst $vr0, a3, -16
- vld $vr0, a4, -144
+ vst vr0, a3, -16
+ vld vr0, a4, -144
addi.d a4, a4, -128
- vst $vr1, a3, -32
+ vst vr1, a3, -32
- vst $vr2, a3, -48
- vst $vr3, a3, -64
- vst $vr4, a3, -80
- vst $vr5, a3, -96
+ vst vr2, a3, -48
+ vst vr3, a3, -64
+ vst vr4, a3, -80
+ vst vr5, a3, -96
- vst $vr6, a3, -112
- vst $vr7, a3, -128
+ vst vr6, a3, -112
+ vst vr7, a3, -128
addi.d a3, a3, -128
bne a4, a6, L(back_al_loop)
L(back_al_less_128):
blt a2, t8, L(back_al_less_64)
- vld $vr1, a4, -32
- vld $vr2, a4, -48
- vld $vr3, a4, -64
+ vld vr1, a4, -32
+ vld vr2, a4, -48
+ vld vr3, a4, -64
addi.d a2, a2, -64
- vst $vr0, a3, -16
- vld $vr0, a4, -80
+ vst vr0, a3, -16
+ vld vr0, a4, -80
addi.d a4, a4, -64
- vst $vr1, a3, -32
- vst $vr2, a3, -48
- vst $vr3, a3, -64
+ vst vr1, a3, -32
+ vst vr2, a3, -48
+ vst vr3, a3, -64
addi.d a3, a3, -64
L(back_al_less_64):
blt a2, t7, L(back_al_less_32)
- vld $vr1, a4, -32
+ vld vr1, a4, -32
addi.d a2, a2, -32
- vst $vr0, a3, -16
+ vst vr0, a3, -16
- vld $vr0, a4, -48
- vst $vr1, a3, -32
+ vld vr0, a4, -48
+ vst vr1, a3, -32
addi.d a3, a3, -32
addi.d a4, a4, -32
L(back_al_less_32):
blt a2, t6, L(back_al_less_16)
- vst $vr0, a3, -16
- vld $vr0, a4, -32
+ vst vr0, a3, -16
+ vld vr0, a4, -32
addi.d a3, a3, -16
L(back_al_less_16):
- vld $vr1, a1, 0
- vst $vr0, a3, -16
- vst $vr1, a0, 0
+ vld vr1, a1, 0
+ vst vr0, a3, -16
+ vst vr1, a0, 0
jr ra
L(magic_num_2):
@@ -405,18 +405,18 @@ L(magic_num_2):
L(back_unaligned):
pcaddi t2, -4
bstrins.d a4, zero, 3, 0
- vld $vr8, t2, 0
- vld $vr0, a4, 0
+ vld vr8, t2, 0
+ vld vr0, a4, 0
- vld $vr1, a4, -16
+ vld vr1, a4, -16
addi.d a2, a2, -16
- vst $vr2, a3, -16
+ vst vr2, a3, -16
sub.d a3, a3, t0
- vreplgr2vr.b $vr9, t1
+ vreplgr2vr.b vr9, t1
andi t2, a2, 0x7f
- vadd.b $vr9, $vr9, $vr8
+ vadd.b vr9, vr9, vr8
addi.d a4, a4, -16
beq t2, a2, L(back_un_less_128)
@@ -425,92 +425,92 @@ L(back_unaligned):
sub.d a6, a4, t3
L(back_un_loop):
- vld $vr2, a4, -16
- vld $vr3, a4, -32
- vld $vr4, a4, -48
+ vld vr2, a4, -16
+ vld vr3, a4, -32
+ vld vr4, a4, -48
- vld $vr5, a4, -64
- vld $vr6, a4, -80
- vld $vr7, a4, -96
- vshuf.b $vr8, $vr0, $vr1, $vr9
+ vld vr5, a4, -64
+ vld vr6, a4, -80
+ vld vr7, a4, -96
+ vshuf.b vr8, vr0, vr1, vr9
- vld $vr0, a4, -112
- vst $vr8, a3, -16
- vshuf.b $vr8, $vr1, $vr2, $vr9
- vld $vr1, a4, -128
+ vld vr0, a4, -112
+ vst vr8, a3, -16
+ vshuf.b vr8, vr1, vr2, vr9
+ vld vr1, a4, -128
- vst $vr8, a3, -32
+ vst vr8, a3, -32
addi.d a4, a4, -128
- vshuf.b $vr2, $vr2, $vr3, $vr9
- vshuf.b $vr3, $vr3, $vr4, $vr9
+ vshuf.b vr2, vr2, vr3, vr9
+ vshuf.b vr3, vr3, vr4, vr9
- vst $vr2, a3, -48
- vshuf.b $vr4, $vr4, $vr5, $vr9
- vst $vr3, a3, -64
- vshuf.b $vr5, $vr5, $vr6, $vr9
+ vst vr2, a3, -48
+ vshuf.b vr4, vr4, vr5, vr9
+ vst vr3, a3, -64
+ vshuf.b vr5, vr5, vr6, vr9
- vst $vr4, a3, -80
- vshuf.b $vr6, $vr6, $vr7, $vr9
- vst $vr5, a3, -96
- vshuf.b $vr7, $vr7, $vr0, $vr9
+ vst vr4, a3, -80
+ vshuf.b vr6, vr6, vr7, vr9
+ vst vr5, a3, -96
+ vshuf.b vr7, vr7, vr0, vr9
- vst $vr6, a3, -112
- vst $vr7, a3, -128
+ vst vr6, a3, -112
+ vst vr7, a3, -128
addi.d a3, a3, -128
bne a4, a6, L(back_un_loop)
L(back_un_less_128):
blt a2, t8, L(back_un_less_64)
- vld $vr2, a4, -16
- vld $vr3, a4, -32
- vshuf.b $vr4, $vr0, $vr1, $vr9
+ vld vr2, a4, -16
+ vld vr3, a4, -32
+ vshuf.b vr4, vr0, vr1, vr9
- vld $vr0, a4, -48
- vst $vr4, a3, -16
+ vld vr0, a4, -48
+ vst vr4, a3, -16
addi.d a2, a2, -64
- vshuf.b $vr4, $vr1, $vr2, $vr9
+ vshuf.b vr4, vr1, vr2, vr9
- vld $vr1, a4, -64
+ vld vr1, a4, -64
addi.d a4, a4, -64
- vst $vr4, a3, -32
- vshuf.b $vr2, $vr2, $vr3, $vr9
+ vst vr4, a3, -32
+ vshuf.b vr2, vr2, vr3, vr9
- vshuf.b $vr3, $vr3, $vr0, $vr9
- vst $vr2, a3, -48
- vst $vr3, a3, -64
+ vshuf.b vr3, vr3, vr0, vr9
+ vst vr2, a3, -48
+ vst vr3, a3, -64
addi.d a3, a3, -64
L(back_un_less_64):
blt a2, t7, L(back_un_less_32)
- vshuf.b $vr3, $vr0, $vr1, $vr9
- vld $vr0, a4, -16
- vst $vr3, a3, -16
+ vshuf.b vr3, vr0, vr1, vr9
+ vld vr0, a4, -16
+ vst vr3, a3, -16
addi.d a2, a2, -32
- vshuf.b $vr3, $vr1, $vr0, $vr9
- vld $vr1, a4, -32
+ vshuf.b vr3, vr1, vr0, vr9
+ vld vr1, a4, -32
addi.d a4, a4, -32
- vst $vr3, a3, -32
+ vst vr3, a3, -32
addi.d a3, a3, -32
L(back_un_less_32):
blt a2, t6, L(back_un_less_16)
- vshuf.b $vr2, $vr0, $vr1, $vr9
+ vshuf.b vr2, vr0, vr1, vr9
- vor.v $vr0, $vr1, $vr1
- vld $vr1, a4, -16
- vst $vr2, a3, -16
+ vor.v vr0, vr1, vr1
+ vld vr1, a4, -16
+ vst vr2, a3, -16
addi.d a3, a3, -16
L(back_un_less_16):
- vld $vr2, a1, 0
- vshuf.b $vr0, $vr0, $vr1, $vr9
- vst $vr0, a3, -16
- vst $vr2, a0, 0
+ vld vr2, a1, 0
+ vshuf.b vr0, vr0, vr1, vr9
+ vst vr0, a3, -16
+ vst vr2, a0, 0
jr ra
END(MEMMOVE_NAME)
diff --git a/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S
index 9ecd0257..41554552 100644
--- a/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S
+++ b/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S
@@ -21,56 +21,56 @@ LEAF(MEMRCHR, 6)
bstrins.d a3, zero, 5, 0
addi.d t1, t1, 1 # len for unaligned address
- xvld $xr0, a3, 0
- xvld $xr1, a3, 32
+ xvld xr0, a3, 0
+ xvld xr1, a3, 32
sub.d t2, zero, t1
li.d t3, -1
- xvreplgr2vr.b $xr2, a1
+ xvreplgr2vr.b xr2, a1
andi t4, a0, 0x3f
srl.d t2, t3, t2
- xvseq.b $xr0, $xr0, $xr2
- xvseq.b $xr1, $xr1, $xr2
- xvmsknz.b $xr0, $xr0
+ xvseq.b xr0, xr0, xr2
+ xvseq.b xr1, xr1, xr2
+ xvmsknz.b xr0, xr0
- xvmsknz.b $xr1, $xr1
- xvpickve.w $xr3, $xr0, 4
- xvpickve.w $xr4, $xr1, 4
- vilvl.h $vr0, $vr3, $vr0
+ xvmsknz.b xr1, xr1
+ xvpickve.w xr3, xr0, 4
+ xvpickve.w xr4, xr1, 4
+ vilvl.h vr0, vr3, vr0
- vilvl.h $vr1, $vr4, $vr1
- vilvl.w $vr0, $vr1, $vr0
- movfr2gr.d t0, $f0
+ vilvl.h vr1, vr4, vr1
+ vilvl.w vr0, vr1, vr0
+ movfr2gr.d t0, fa0
and t0, t0, t2
bltu a2, t1, L(end)
bnez t0, L(found)
bstrins.d a0, zero, 5, 0
L(loop):
- xvld $xr0, a3, -64
+ xvld xr0, a3, -64
- xvld $xr1, a3, -32
+ xvld xr1, a3, -32
addi.d a3, a3, -64
- xvseq.b $xr0, $xr0, $xr2
- xvseq.b $xr1, $xr1, $xr2
+ xvseq.b xr0, xr0, xr2
+ xvseq.b xr1, xr1, xr2
beq a0, a3, L(out)
- xvmax.bu $xr3, $xr0, $xr1
- xvseteqz.v $fcc0, $xr3
- bcnez $fcc0, L(loop)
+ xvmax.bu xr3, xr0, xr1
+ xvseteqz.v fcc0, xr3
+ bcnez fcc0, L(loop)
- xvmsknz.b $xr0, $xr0
- xvmsknz.b $xr1, $xr1
- xvpickve.w $xr3, $xr0, 4
- xvpickve.w $xr4, $xr1, 4
+ xvmsknz.b xr0, xr0
+ xvmsknz.b xr1, xr1
+ xvpickve.w xr3, xr0, 4
+ xvpickve.w xr4, xr1, 4
- vilvl.h $vr0, $vr3, $vr0
- vilvl.h $vr1, $vr4, $vr1
- vilvl.w $vr0, $vr1, $vr0
- movfr2gr.d t0, $f0
+ vilvl.h vr0, vr3, vr0
+ vilvl.h vr1, vr4, vr1
+ vilvl.w vr0, vr1, vr0
+ movfr2gr.d t0, fa0
L(found):
addi.d a0, a3, 63
@@ -80,15 +80,15 @@ L(found):
L(out):
- xvmsknz.b $xr0, $xr0
- xvmsknz.b $xr1, $xr1
- xvpickve.w $xr3, $xr0, 4
- xvpickve.w $xr4, $xr1, 4
-
- vilvl.h $vr0, $vr3, $vr0
- vilvl.h $vr1, $vr4, $vr1
- vilvl.w $vr0, $vr1, $vr0
- movfr2gr.d t0, $f0
+ xvmsknz.b xr0, xr0
+ xvmsknz.b xr1, xr1
+ xvpickve.w xr3, xr0, 4
+ xvpickve.w xr4, xr1, 4
+
+ vilvl.h vr0, vr3, vr0
+ vilvl.h vr1, vr4, vr1
+ vilvl.w vr0, vr1, vr0
+ movfr2gr.d t0, fa0
L(end):
sll.d t2, t3, t4
diff --git a/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S
index 4bdc18d8..4a302cac 100644
--- a/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S
+++ b/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S
@@ -19,46 +19,46 @@ LEAF(MEMRCHR, 6)
bstrins.d a3, zero, 4, 0
addi.d t1, t1, 1 # len for unaligned address
- vld $vr0, a3, 0
- vld $vr1, a3, 16
+ vld vr0, a3, 0
+ vld vr1, a3, 16
sub.d t2, zero, t1
li.d t3, -1
- vreplgr2vr.b $vr2, a1
+ vreplgr2vr.b vr2, a1
andi t4, a0, 0x1f
srl.d t2, t3, t2
- vseq.b $vr0, $vr0, $vr2
- vseq.b $vr1, $vr1, $vr2
- vmsknz.b $vr0, $vr0
+ vseq.b vr0, vr0, vr2
+ vseq.b vr1, vr1, vr2
+ vmsknz.b vr0, vr0
- vmsknz.b $vr1, $vr1
- vilvl.h $vr0, $vr1, $vr0
- movfr2gr.s t0, $f0
+ vmsknz.b vr1, vr1
+ vilvl.h vr0, vr1, vr0
+ movfr2gr.s t0, fa0
and t0, t0, t2
bltu a2, t1, L(end)
bnez t0, L(found)
bstrins.d a0, zero, 4, 0
L(loop):
- vld $vr0, a3, -32
+ vld vr0, a3, -32
- vld $vr1, a3, -16
+ vld vr1, a3, -16
addi.d a3, a3, -32
- vseq.b $vr0, $vr0, $vr2
- vseq.b $vr1, $vr1, $vr2
+ vseq.b vr0, vr0, vr2
+ vseq.b vr1, vr1, vr2
beq a0, a3, L(out)
- vmax.bu $vr3, $vr0, $vr1
- vseteqz.v $fcc0, $vr3
- bcnez $fcc0, L(loop)
+ vmax.bu vr3, vr0, vr1
+ vseteqz.v fcc0, vr3
+ bcnez fcc0, L(loop)
- vmsknz.b $vr0, $vr0
- vmsknz.b $vr1, $vr1
- vilvl.h $vr0, $vr1, $vr0
- movfr2gr.s t0, $f0
+ vmsknz.b vr0, vr0
+ vmsknz.b vr1, vr1
+ vilvl.h vr0, vr1, vr0
+ movfr2gr.s t0, fa0
L(found):
addi.d a0, a3, 31
@@ -67,10 +67,10 @@ L(found):
jr ra
L(out):
- vmsknz.b $vr0, $vr0
- vmsknz.b $vr1, $vr1
- vilvl.h $vr0, $vr1, $vr0
- movfr2gr.s t0, $f0
+ vmsknz.b vr0, vr0
+ vmsknz.b vr1, vr1
+ vilvl.h vr0, vr1, vr0
+ movfr2gr.s t0, fa0
L(end):
sll.d t2, t3, t4
diff --git a/sysdeps/loongarch/lp64/multiarch/memset-lasx.S b/sysdeps/loongarch/lp64/multiarch/memset-lasx.S
index b53c0b7b..5e4908dc 100644
--- a/sysdeps/loongarch/lp64/multiarch/memset-lasx.S
+++ b/sysdeps/loongarch/lp64/multiarch/memset-lasx.S
@@ -14,7 +14,7 @@
LEAF(MEMSET, 6)
li.d t1, 32
move a3, a0
- xvreplgr2vr.b $xr0, a1
+ xvreplgr2vr.b xr0, a1
add.d a4, a0, a2
bgeu t1, a2, L(less_32bytes) # len <= 32
@@ -24,46 +24,46 @@ LEAF(MEMSET, 6)
L(less_128bytes):
bgeu t2, a2, L(less_64bytes) # len <= 64
- xvst $xr0, a3, 0
- xvst $xr0, a3, 32
- xvst $xr0, a4, -32
+ xvst xr0, a3, 0
+ xvst xr0, a3, 32
+ xvst xr0, a4, -32
- xvst $xr0, a4, -64
+ xvst xr0, a4, -64
jr ra
L(less_64bytes):
- xvst $xr0, a3, 0
- xvst $xr0, a4, -32
+ xvst xr0, a3, 0
+ xvst xr0, a4, -32
jr ra
L(less_32bytes):
srli.d t0, a2, 4
beqz t0, L(less_16bytes)
- vst $vr0, a3, 0
+ vst vr0, a3, 0
- vst $vr0, a4, -16
+ vst vr0, a4, -16
jr ra
L(less_16bytes):
srli.d t0, a2, 3
beqz t0, L(less_8bytes)
- vstelm.d $vr0, a3, 0, 0
- vstelm.d $vr0, a4, -8, 0
+ vstelm.d vr0, a3, 0, 0
+ vstelm.d vr0, a4, -8, 0
jr ra
L(less_8bytes):
srli.d t0, a2, 2
beqz t0, L(less_4bytes)
- vstelm.w $vr0, a3, 0, 0
- vstelm.w $vr0, a4, -4, 0
+ vstelm.w vr0, a3, 0, 0
+ vstelm.w vr0, a4, -4, 0
jr ra
L(less_4bytes):
srli.d t0, a2, 1
beqz t0, L(less_2bytes)
- vstelm.h $vr0, a3, 0, 0
- vstelm.h $vr0, a4, -2, 0
+ vstelm.h vr0, a3, 0, 0
+ vstelm.h vr0, a4, -2, 0
jr ra
L(less_2bytes):
@@ -73,7 +73,7 @@ L(less_1bytes):
jr ra
L(long_bytes):
- xvst $xr0, a3, 0
+ xvst xr0, a3, 0
bstrins.d a3, zero, 4, 0
addi.d a3, a3, 32
sub.d a2, a4, a3
@@ -85,15 +85,15 @@ L(long_bytes):
L(loop_256):
- xvst $xr0, a3, 0
- xvst $xr0, a3, 32
- xvst $xr0, a3, 64
- xvst $xr0, a3, 96
+ xvst xr0, a3, 0
+ xvst xr0, a3, 32
+ xvst xr0, a3, 64
+ xvst xr0, a3, 96
- xvst $xr0, a3, 128
- xvst $xr0, a3, 160
- xvst $xr0, a3, 192
- xvst $xr0, a3, 224
+ xvst xr0, a3, 128
+ xvst xr0, a3, 160
+ xvst xr0, a3, 192
+ xvst xr0, a3, 224
addi.d a3, a3, 256
bne a3, t0, L(loop_256)
@@ -101,26 +101,26 @@ L(long_end):
bltu a2, t3, L(end_less_128)
addi.d a2, a2, -128
- xvst $xr0, a3, 0
- xvst $xr0, a3, 32
- xvst $xr0, a3, 64
- xvst $xr0, a3, 96
+ xvst xr0, a3, 0
+ xvst xr0, a3, 32
+ xvst xr0, a3, 64
+ xvst xr0, a3, 96
addi.d a3, a3, 128
L(end_less_128):
bltu a2, t2, L(end_less_64)
addi.d a2, a2, -64
- xvst $xr0, a3, 0
+ xvst xr0, a3, 0
- xvst $xr0, a3, 32
+ xvst xr0, a3, 32
addi.d a3, a3, 64
L(end_less_64):
bltu a2, t1, L(end_less_32)
- xvst $xr0, a3, 0
+ xvst xr0, a3, 0
L(end_less_32):
- xvst $xr0, a4, -32
+ xvst xr0, a4, -32
jr ra
END(MEMSET)
diff --git a/sysdeps/loongarch/lp64/multiarch/memset-lsx.S b/sysdeps/loongarch/lp64/multiarch/memset-lsx.S
index 7ab85283..67b279c8 100644
--- a/sysdeps/loongarch/lp64/multiarch/memset-lsx.S
+++ b/sysdeps/loongarch/lp64/multiarch/memset-lsx.S
@@ -14,7 +14,7 @@
LEAF(MEMSET, 6)
li.d t1, 16
move a3, a0
- vreplgr2vr.b $vr0, a1
+ vreplgr2vr.b vr0, a1
add.d a4, a0, a2
bgeu t1, a2, L(less_16bytes) # len <= 16
@@ -24,48 +24,48 @@ LEAF(MEMSET, 6)
L(less_64bytes):
bgeu t2, a2, L(less_32bytes) # len <= 32
- vst $vr0, a3, 0
- vst $vr0, a3, 16
- vst $vr0, a4, -32
+ vst vr0, a3, 0
+ vst vr0, a3, 16
+ vst vr0, a4, -32
- vst $vr0, a4, -16
+ vst vr0, a4, -16
jr ra
L(less_32bytes):
- vst $vr0, a3, 0
- vst $vr0, a4, -16
+ vst vr0, a3, 0
+ vst vr0, a4, -16
jr ra
L(less_16bytes):
srli.d t0, a2, 3
beqz t0, L(less_8bytes)
- vstelm.d $vr0, a3, 0, 0
+ vstelm.d vr0, a3, 0, 0
- vstelm.d $vr0, a4, -8, 0
+ vstelm.d vr0, a4, -8, 0
jr ra
L(less_8bytes):
srli.d t0, a2, 2
beqz t0, L(less_4bytes)
- vstelm.w $vr0, a3, 0, 0
- vstelm.w $vr0, a4, -4, 0
+ vstelm.w vr0, a3, 0, 0
+ vstelm.w vr0, a4, -4, 0
jr ra
L(less_4bytes):
srli.d t0, a2, 1
beqz t0, L(less_2bytes)
- vstelm.h $vr0, a3, 0, 0
- vstelm.h $vr0, a4, -2, 0
+ vstelm.h vr0, a3, 0, 0
+ vstelm.h vr0, a4, -2, 0
jr ra
L(less_2bytes):
beqz a2, L(less_1bytes)
- vstelm.b $vr0, a3, 0, 0
+ vstelm.b vr0, a3, 0, 0
L(less_1bytes):
jr ra
L(long_bytes):
- vst $vr0, a3, 0
+ vst vr0, a3, 0
bstrins.d a3, zero, 3, 0
addi.d a3, a3, 16
@@ -77,43 +77,43 @@ L(long_bytes):
sub.d t0, a4, t0
L(loop_128):
- vst $vr0, a3, 0
+ vst vr0, a3, 0
- vst $vr0, a3, 16
- vst $vr0, a3, 32
- vst $vr0, a3, 48
- vst $vr0, a3, 64
+ vst vr0, a3, 16
+ vst vr0, a3, 32
+ vst vr0, a3, 48
+ vst vr0, a3, 64
- vst $vr0, a3, 80
- vst $vr0, a3, 96
- vst $vr0, a3, 112
+ vst vr0, a3, 80
+ vst vr0, a3, 96
+ vst vr0, a3, 112
addi.d a3, a3, 128
bne a3, t0, L(loop_128)
L(long_end):
bltu a2, t3, L(end_less_64)
addi.d a2, a2, -64
- vst $vr0, a3, 0
+ vst vr0, a3, 0
- vst $vr0, a3, 16
- vst $vr0, a3, 32
- vst $vr0, a3, 48
+ vst vr0, a3, 16
+ vst vr0, a3, 32
+ vst vr0, a3, 48
addi.d a3, a3, 64
L(end_less_64):
bltu a2, t2, L(end_less_32)
addi.d a2, a2, -32
- vst $vr0, a3, 0
- vst $vr0, a3, 16
+ vst vr0, a3, 0
+ vst vr0, a3, 16
addi.d a3, a3, 32
L(end_less_32):
bltu a2, t1, L(end_less_16)
- vst $vr0, a3, 0
+ vst vr0, a3, 0
L(end_less_16):
- vst $vr0, a4, -16
+ vst vr0, a4, -16
jr ra
END(MEMSET)
diff --git a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S
index 1e94aa50..856f99ce 100644
--- a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S
+++ b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S
@@ -8,15 +8,15 @@
LEAF(RAWMEMCHR, 6)
move a2, a0
bstrins.d a0, zero, 4, 0
- xvld $xr0, a0, 0
- xvreplgr2vr.b $xr1, a1
+ xvld xr0, a0, 0
+ xvreplgr2vr.b xr1, a1
- xvseq.b $xr0, $xr0, $xr1
- xvmsknz.b $xr0, $xr0
- xvpickve.w $xr2, $xr0, 4
- vilvl.h $vr0, $vr2, $vr0
+ xvseq.b xr0, xr0, xr1
+ xvmsknz.b xr0, xr0
+ xvpickve.w xr2, xr0, 4
+ vilvl.h vr0, vr2, vr0
- movfr2gr.s t0, $f0
+ movfr2gr.s t0, fa0
sra.w t0, t0, a2
beqz t0, L(loop)
ctz.w t0, t0
@@ -27,17 +27,17 @@ LEAF(RAWMEMCHR, 6)
nop
L(loop):
- xvld $xr0, a0, 32
+ xvld xr0, a0, 32
addi.d a0, a0, 32
- xvseq.b $xr0, $xr0, $xr1
- xvseteqz.v $fcc0, $xr0
+ xvseq.b xr0, xr0, xr1
+ xvseteqz.v fcc0, xr0
- bcnez $fcc0, L(loop)
- xvmsknz.b $xr0, $xr0
- xvpickve.w $xr1, $xr0, 4
- vilvl.h $vr0, $vr1, $vr0
+ bcnez fcc0, L(loop)
+ xvmsknz.b xr0, xr0
+ xvpickve.w xr1, xr0, 4
+ vilvl.h vr0, vr1, vr0
- movfr2gr.s t0, $f0
+ movfr2gr.s t0, fa0
ctz.w t0, t0
add.d a0, a0, t0
jr ra
diff --git a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S
index 40bf0cda..7e864e96 100644
--- a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S
+++ b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S
@@ -14,17 +14,17 @@
LEAF(RAWMEMCHR, 6)
move a2, a0
bstrins.d a0, zero, 4, 0
- vld $vr0, a0, 0
- vld $vr1, a0, 16
+ vld vr0, a0, 0
+ vld vr1, a0, 16
- vreplgr2vr.b $vr2, a1
- vseq.b $vr0, $vr0, $vr2
- vseq.b $vr1, $vr1, $vr2
- vmsknz.b $vr0, $vr0
+ vreplgr2vr.b vr2, a1
+ vseq.b vr0, vr0, vr2
+ vseq.b vr1, vr1, vr2
+ vmsknz.b vr0, vr0
- vmsknz.b $vr1, $vr1
- vilvl.h $vr0, $vr1, $vr0
- movfr2gr.s t0, $f0
+ vmsknz.b vr1, vr1
+ vilvl.h vr0, vr1, vr0
+ movfr2gr.s t0, fa0
sra.w t0, t0, a2
beqz t0, L(loop)
@@ -34,15 +34,15 @@ LEAF(RAWMEMCHR, 6)
L(loop):
- vld $vr0, a0, 32
+ vld vr0, a0, 32
addi.d a0, a0, 16
- vseq.b $vr0, $vr0, $vr2
- vseteqz.v $fcc0, $vr0
+ vseq.b vr0, vr0, vr2
+ vseteqz.v fcc0, vr0
- bcnez $fcc0, L(loop)
+ bcnez fcc0, L(loop)
addi.d a0, a0, 16
- vfrstpi.b $vr0, $vr0, 0
- vpickve2gr.bu t0, $vr0, 0
+ vfrstpi.b vr0, vr0, 0
+ vpickve2gr.bu t0, vr0, 0
add.d a0, a0, t0
jr ra
diff --git a/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S b/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S
index 0836f590..53832de7 100644
--- a/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S
+++ b/sysdeps/loongarch/lp64/multiarch/stpcpy-lsx.S
@@ -18,67 +18,67 @@ L(magic_num):
ENTRY_NO_ALIGN(STPCPY)
pcaddi t0, -4
andi a4, a1, 0xf
- vld $vr1, t0, 0
+ vld vr1, t0, 0
beqz a4, L(load_start)
xor t0, a1, a4
- vld $vr0, t0, 0
- vreplgr2vr.b $vr2, a4
- vadd.b $vr2, $vr2, $vr1
+ vld vr0, t0, 0
+ vreplgr2vr.b vr2, a4
+ vadd.b vr2, vr2, vr1
- vshuf.b $vr0, $vr2, $vr0, $vr2
- vsetanyeqz.b $fcc0, $vr0
- bcnez $fcc0, L(end)
+ vshuf.b vr0, vr2, vr0, vr2
+ vsetanyeqz.b fcc0, vr0
+ bcnez fcc0, L(end)
L(load_start):
- vld $vr0, a1, 0
+ vld vr0, a1, 0
li.d t1, 16
andi a3, a0, 0xf
- vsetanyeqz.b $fcc0, $vr0
+ vsetanyeqz.b fcc0, vr0
sub.d t0, t1, a3
- bcnez $fcc0, L(end)
+ bcnez fcc0, L(end)
add.d a1, a1, t0
- vst $vr0, a0, 0
+ vst vr0, a0, 0
add.d a0, a0, t0
bne a3, a4, L(unaligned)
- vld $vr0, a1, 0
- vsetanyeqz.b $fcc0, $vr0
- bcnez $fcc0, L(end)
+ vld vr0, a1, 0
+ vsetanyeqz.b fcc0, vr0
+ bcnez fcc0, L(end)
L(loop):
- vst $vr0, a0, 0
- vld $vr0, a1, 16
+ vst vr0, a0, 0
+ vld vr0, a1, 16
addi.d a0, a0, 16
addi.d a1, a1, 16
- vsetanyeqz.b $fcc0, $vr0
- bceqz $fcc0, L(loop)
- vmsknz.b $vr1, $vr0
- movfr2gr.s t0, $f1
+ vsetanyeqz.b fcc0, vr0
+ bceqz fcc0, L(loop)
+ vmsknz.b vr1, vr0
+ movfr2gr.s t0, fa1
cto.w t0, t0
add.d a1, a1, t0
- vld $vr0, a1, -15
+ vld vr0, a1, -15
add.d a0, a0, t0
- vst $vr0, a0, -15
+ vst vr0, a0, -15
jr ra
L(end):
- vseqi.b $vr1, $vr0, 0
- vfrstpi.b $vr1, $vr1, 0
+ vseqi.b vr1, vr0, 0
+ vfrstpi.b vr1, vr1, 0
- vpickve2gr.bu t0, $vr1, 0
+ vpickve2gr.bu t0, vr1, 0
addi.d t0, t0, 1
L(end_16):
andi t1, t0, 16
beqz t1, L(end_8)
- vst $vr0, a0, 0
+ vst vr0, a0, 0
addi.d a0, a0, 15
jr ra
L(end_8):
@@ -89,26 +89,26 @@ L(end_8):
andi t5, t0, 1
beqz t2, L(end_4)
- vstelm.d $vr0, a0, 0, 0
+ vstelm.d vr0, a0, 0, 0
addi.d a0, a0, 8
- vbsrl.v $vr0, $vr0, 8
+ vbsrl.v vr0, vr0, 8
L(end_4):
beqz t3, L(end_2)
- vstelm.w $vr0, a0, 0, 0
+ vstelm.w vr0, a0, 0, 0
addi.d a0, a0, 4
- vbsrl.v $vr0, $vr0, 4
+ vbsrl.v vr0, vr0, 4
L(end_2):
beqz t4, L(end_1)
- vstelm.h $vr0, a0, 0, 0
+ vstelm.h vr0, a0, 0, 0
addi.d a0, a0, 2
- vbsrl.v $vr0, $vr0, 2
+ vbsrl.v vr0, vr0, 2
L(end_1):
beqz t5, L(out)
- vstelm.b $vr0, a0, 0, 0
+ vstelm.b vr0, a0, 0, 0
addi.d a0, a0, 1
L(out):
addi.d a0, a0, -1
@@ -120,49 +120,49 @@ L(unaligned):
andi a3, a1, 0xf
bstrins.d a1, zero, 3, 0
- vld $vr2, a1, 0
- vreplgr2vr.b $vr3, a3
- vslt.b $vr4, $vr1, $vr3
- vor.v $vr0, $vr2, $vr4
+ vld vr2, a1, 0
+ vreplgr2vr.b vr3, a3
+ vslt.b vr4, vr1, vr3
+ vor.v vr0, vr2, vr4
- vsetanyeqz.b $fcc0, $vr0
- bcnez $fcc0, L(un_first_end)
- vld $vr0, a1, 16
- vadd.b $vr3, $vr3, $vr1
+ vsetanyeqz.b fcc0, vr0
+ bcnez fcc0, L(un_first_end)
+ vld vr0, a1, 16
+ vadd.b vr3, vr3, vr1
addi.d a1, a1, 16
- vshuf.b $vr4, $vr0, $vr2, $vr3
- vsetanyeqz.b $fcc0, $vr0
- bcnez $fcc0, L(un_end)
+ vshuf.b vr4, vr0, vr2, vr3
+ vsetanyeqz.b fcc0, vr0
+ bcnez fcc0, L(un_end)
L(un_loop):
- vor.v $vr2, $vr0, $vr0
- vld $vr0, a1, 16
- vst $vr4, a0, 0
+ vor.v vr2, vr0, vr0
+ vld vr0, a1, 16
+ vst vr4, a0, 0
addi.d a1, a1, 16
addi.d a0, a0, 16
- vshuf.b $vr4, $vr0, $vr2, $vr3
- vsetanyeqz.b $fcc0, $vr0
- bceqz $fcc0, L(un_loop)
+ vshuf.b vr4, vr0, vr2, vr3
+ vsetanyeqz.b fcc0, vr0
+ bceqz fcc0, L(un_loop)
L(un_end):
- vsetanyeqz.b $fcc0, $vr4
- bcnez $fcc0, 1f
- vst $vr4, a0, 0
+ vsetanyeqz.b fcc0, vr4
+ bcnez fcc0, 1f
+ vst vr4, a0, 0
1:
- vmsknz.b $vr1, $vr0
+ vmsknz.b vr1, vr0
- movfr2gr.s t0, $f1
+ movfr2gr.s t0, fa1
cto.w t0, t0
add.d a1, a1, t0
- vld $vr0, a1, -15
+ vld vr0, a1, -15
add.d a0, a0, t0
sub.d a0, a0, a3
- vst $vr0, a0, 1
+ vst vr0, a0, 1
addi.d a0, a0, 16
jr ra
diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S
index 3f6ad915..fab6edc7 100644
--- a/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S
+++ b/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S
@@ -16,18 +16,18 @@
LEAF(STRCHR, 6)
andi t1, a0, 0x1f
bstrins.d a0, zero, 4, 0
- xvld $xr0, a0, 0
+ xvld xr0, a0, 0
li.d t2, -1
- xvreplgr2vr.b $xr1, a1
+ xvreplgr2vr.b xr1, a1
sll.d t1, t2, t1
- xvxor.v $xr2, $xr0, $xr1
- xvmin.bu $xr0, $xr0, $xr2
+ xvxor.v xr2, xr0, xr1
+ xvmin.bu xr0, xr0, xr2
- xvmsknz.b $xr0, $xr0
- xvpickve.w $xr3, $xr0, 4
- vilvl.h $vr0, $vr3, $vr0
- movfr2gr.s t0, $f0
+ xvmsknz.b xr0, xr0
+ xvpickve.w xr3, xr0, 4
+ vilvl.h vr0, vr3, vr0
+ movfr2gr.s t0, fa0
orn t0, t0, t1
bne t0, t2, L(end)
@@ -36,37 +36,37 @@ LEAF(STRCHR, 6)
L(loop):
- xvld $xr0, a0, 0
- xvxor.v $xr2, $xr0, $xr1
- xvmin.bu $xr0, $xr0, $xr2
- xvsetanyeqz.b $fcc0, $xr0
+ xvld xr0, a0, 0
+ xvxor.v xr2, xr0, xr1
+ xvmin.bu xr0, xr0, xr2
+ xvsetanyeqz.b fcc0, xr0
- bcnez $fcc0, L(loop_end)
- xvld $xr0, a0, 32
+ bcnez fcc0, L(loop_end)
+ xvld xr0, a0, 32
addi.d a0, a0, 64
- xvxor.v $xr2, $xr0, $xr1
+ xvxor.v xr2, xr0, xr1
- xvmin.bu $xr0, $xr0, $xr2
- xvsetanyeqz.b $fcc0, $xr0
- bceqz $fcc0, L(loop)
+ xvmin.bu xr0, xr0, xr2
+ xvsetanyeqz.b fcc0, xr0
+ bceqz fcc0, L(loop)
addi.d a0, a0, -32
L(loop_end):
- xvmsknz.b $xr0, $xr0
- xvpickve.w $xr1, $xr0, 4
- vilvl.h $vr0, $vr1, $vr0
- movfr2gr.s t0, $f0
+ xvmsknz.b xr0, xr0
+ xvpickve.w xr1, xr0, 4
+ vilvl.h vr0, vr1, vr0
+ movfr2gr.s t0, fa0
L(end):
cto.w t0, t0
add.d a0, a0, t0
#ifndef AS_STRCHRNUL
- vreplgr2vr.b $vr0, t0
- xvpermi.q $xr3, $xr2, 1
+ vreplgr2vr.b vr0, t0
+ xvpermi.q xr3, xr2, 1
- vshuf.b $vr0, $vr3, $vr2, $vr0
- vpickve2gr.bu t0, $vr0, 0
+ vshuf.b vr0, vr3, vr2, vr0
+ vpickve2gr.bu t0, vr0, 0
masknez a0, a0, t0
#endif
jr ra
diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S
index 4ad9a4ad..ebeb332e 100644
--- a/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S
+++ b/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S
@@ -16,16 +16,16 @@
LEAF(STRCHR, 6)
andi t1, a0, 0xf
bstrins.d a0, zero, 3, 0
- vld $vr0, a0, 0
+ vld vr0, a0, 0
li.d t2, -1
- vreplgr2vr.b $vr1, a1
+ vreplgr2vr.b vr1, a1
sll.d t3, t2, t1
- vxor.v $vr2, $vr0, $vr1
- vmin.bu $vr0, $vr0, $vr2
+ vxor.v vr2, vr0, vr1
+ vmin.bu vr0, vr0, vr2
- vmsknz.b $vr0, $vr0
- movfr2gr.s t0, $f0
+ vmsknz.b vr0, vr0
+ movfr2gr.s t0, fa0
ext.w.h t0, t0
orn t0, t0, t3
@@ -34,23 +34,23 @@ L(found):
cto.w t0, t0
add.d a0, a0, t0
#ifndef AS_STRCHRNUL
- vreplve.b $vr2, $vr2, t0
- vpickve2gr.bu t1, $vr2, 0
+ vreplve.b vr2, vr2, t0
+ vpickve2gr.bu t1, vr2, 0
masknez a0, a0, t1
#endif
jr ra
L(loop):
- vld $vr0, a0, 16
+ vld vr0, a0, 16
addi.d a0, a0, 16
- vxor.v $vr2, $vr0, $vr1
- vmin.bu $vr0, $vr0, $vr2
+ vxor.v vr2, vr0, vr1
+ vmin.bu vr0, vr0, vr2
- vsetanyeqz.b $fcc0, $vr0
- bceqz $fcc0, L(loop)
- vmsknz.b $vr0, $vr0
- movfr2gr.s t0, $f0
+ vsetanyeqz.b fcc0, vr0
+ bceqz fcc0, L(loop)
+ vmsknz.b vr0, vr0
+ movfr2gr.s t0, fa0
b L(found)
END(STRCHR)
diff --git a/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S
index c86e3ecd..c6e1110c 100644
--- a/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S
+++ b/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S
@@ -20,45 +20,45 @@ L(magic_num):
ENTRY_NO_ALIGN(STRCMP)
pcaddi t0, -4
andi a2, a0, 0xf
- vld $vr2, t0, 0
+ vld vr2, t0, 0
andi a3, a1, 0xf
bne a2, a3, L(unaligned)
bstrins.d a0, zero, 3, 0
bstrins.d a1, zero, 3, 0
- vld $vr0, a0, 0
+ vld vr0, a0, 0
- vld $vr1, a1, 0
- vreplgr2vr.b $vr3, a2
- vslt.b $vr2, $vr2, $vr3
- vseq.b $vr3, $vr0, $vr1
+ vld vr1, a1, 0
+ vreplgr2vr.b vr3, a2
+ vslt.b vr2, vr2, vr3
+ vseq.b vr3, vr0, vr1
- vmin.bu $vr3, $vr0, $vr3
- vor.v $vr3, $vr3, $vr2
- vsetanyeqz.b $fcc0, $vr3
- bcnez $fcc0, L(al_out)
+ vmin.bu vr3, vr0, vr3
+ vor.v vr3, vr3, vr2
+ vsetanyeqz.b fcc0, vr3
+ bcnez fcc0, L(al_out)
L(al_loop):
- vld $vr0, a0, 16
- vld $vr1, a1, 16
+ vld vr0, a0, 16
+ vld vr1, a1, 16
addi.d a0, a0, 16
addi.d a1, a1, 16
- vseq.b $vr3, $vr0, $vr1
- vmin.bu $vr3, $vr0, $vr3
- vsetanyeqz.b $fcc0, $vr3
- bceqz $fcc0, L(al_loop)
+ vseq.b vr3, vr0, vr1
+ vmin.bu vr3, vr0, vr3
+ vsetanyeqz.b fcc0, vr3
+ bceqz fcc0, L(al_loop)
L(al_out):
- vseqi.b $vr3, $vr3, 0
- vfrstpi.b $vr3, $vr3, 0
- vshuf.b $vr0, $vr0, $vr0, $vr3
- vshuf.b $vr1, $vr1, $vr1, $vr3
+ vseqi.b vr3, vr3, 0
+ vfrstpi.b vr3, vr3, 0
+ vshuf.b vr0, vr0, vr0, vr3
+ vshuf.b vr1, vr1, vr1, vr3
- vpickve2gr.bu t0, $vr0, 0
- vpickve2gr.bu t1, $vr1, 0
+ vpickve2gr.bu t0, vr0, 0
+ vpickve2gr.bu t1, vr1, 0
sub.d a0, t0, t1
jr ra
@@ -79,52 +79,52 @@ L(unaligned):
bstrins.d a1, zero, 3, 0
- vld $vr0, a0, 0
- vld $vr3, a1, 0
- vreplgr2vr.b $vr4, a2
- vreplgr2vr.b $vr5, a3
+ vld vr0, a0, 0
+ vld vr3, a1, 0
+ vreplgr2vr.b vr4, a2
+ vreplgr2vr.b vr5, a3
- vslt.b $vr7, $vr2, $vr4
- vsub.b $vr4, $vr4, $vr5
- vaddi.bu $vr6, $vr2, 16
- vsub.b $vr6, $vr6, $vr4
+ vslt.b vr7, vr2, vr4
+ vsub.b vr4, vr4, vr5
+ vaddi.bu vr6, vr2, 16
+ vsub.b vr6, vr6, vr4
- vshuf.b $vr1, $vr3, $vr3, $vr6
- vseq.b $vr4, $vr0, $vr1
- vmin.bu $vr4, $vr0, $vr4
- vor.v $vr4, $vr4, $vr7
+ vshuf.b vr1, vr3, vr3, vr6
+ vseq.b vr4, vr0, vr1
+ vmin.bu vr4, vr0, vr4
+ vor.v vr4, vr4, vr7
- vsetanyeqz.b $fcc0, $vr4
- bcnez $fcc0, L(un_end)
- vslt.b $vr5, $vr2, $vr5
- vor.v $vr3, $vr3, $vr5
+ vsetanyeqz.b fcc0, vr4
+ bcnez fcc0, L(un_end)
+ vslt.b vr5, vr2, vr5
+ vor.v vr3, vr3, vr5
L(un_loop):
- vld $vr0, a0, 16
- vsetanyeqz.b $fcc0, $vr3
- bcnez $fcc0, L(remaining_end)
- vor.v $vr1, $vr3, $vr3
+ vld vr0, a0, 16
+ vsetanyeqz.b fcc0, vr3
+ bcnez fcc0, L(remaining_end)
+ vor.v vr1, vr3, vr3
- vld $vr3, a1, 16
+ vld vr3, a1, 16
addi.d a0, a0, 16
addi.d a1, a1, 16
- vshuf.b $vr1, $vr3, $vr1, $vr6
+ vshuf.b vr1, vr3, vr1, vr6
- vseq.b $vr4, $vr0, $vr1
- vmin.bu $vr4, $vr0, $vr4
- vsetanyeqz.b $fcc0, $vr4
- bceqz $fcc0, L(un_loop)
+ vseq.b vr4, vr0, vr1
+ vmin.bu vr4, vr0, vr4
+ vsetanyeqz.b fcc0, vr4
+ bceqz fcc0, L(un_loop)
L(un_end):
- vseqi.b $vr4, $vr4, 0
- vfrstpi.b $vr4, $vr4, 0
- vshuf.b $vr0, $vr0, $vr0, $vr4
- vshuf.b $vr1, $vr1, $vr1, $vr4
+ vseqi.b vr4, vr4, 0
+ vfrstpi.b vr4, vr4, 0
+ vshuf.b vr0, vr0, vr0, vr4
+ vshuf.b vr1, vr1, vr1, vr4
- vpickve2gr.bu t0, $vr0, 0
- vpickve2gr.bu t1, $vr1, 0
+ vpickve2gr.bu t0, vr0, 0
+ vpickve2gr.bu t1, vr1, 0
sub.d t3, t0, t1
sub.d t4, t1, t0
@@ -134,9 +134,9 @@ L(un_end):
jr ra
L(remaining_end):
- vshuf.b $vr1, $vr3, $vr3, $vr6
- vseq.b $vr4, $vr0, $vr1
- vmin.bu $vr4, $vr4, $vr0
+ vshuf.b vr1, vr3, vr3, vr6
+ vseq.b vr4, vr0, vr1
+ vmin.bu vr4, vr4, vr0
b L(un_end)
END(STRCMP)
diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S b/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S
index dbc061ad..52d77fa3 100644
--- a/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S
+++ b/sysdeps/loongarch/lp64/multiarch/strcpy-lsx.S
@@ -21,61 +21,61 @@ L(magic_num):
ENTRY_NO_ALIGN(STRCPY)
pcaddi t0, -4
andi a4, a1, 0xf
- vld $vr1, t0, 0
+ vld vr1, t0, 0
move a2, a0
beqz a4, L(load_start)
xor t0, a1, a4
- vld $vr0, t0, 0
- vreplgr2vr.b $vr2, a4
+ vld vr0, t0, 0
+ vreplgr2vr.b vr2, a4
- vadd.b $vr2, $vr2, $vr1
- vshuf.b $vr0, $vr2, $vr0, $vr2
- vsetanyeqz.b $fcc0, $vr0
- bcnez $fcc0, L(end)
+ vadd.b vr2, vr2, vr1
+ vshuf.b vr0, vr2, vr0, vr2
+ vsetanyeqz.b fcc0, vr0
+ bcnez fcc0, L(end)
L(load_start):
- vld $vr0, a1, 0
+ vld vr0, a1, 0
li.d t1, 16
andi a3, a2, 0xf
- vsetanyeqz.b $fcc0, $vr0
+ vsetanyeqz.b fcc0, vr0
sub.d t0, t1, a3
- bcnez $fcc0, L(end)
+ bcnez fcc0, L(end)
add.d a1, a1, t0
- vst $vr0, a2, 0
+ vst vr0, a2, 0
andi a3, a1, 0xf
add.d a2, a2, t0
bnez a3, L(unaligned)
- vld $vr0, a1, 0
+ vld vr0, a1, 0
- vsetanyeqz.b $fcc0, $vr0
- bcnez $fcc0, L(end)
+ vsetanyeqz.b fcc0, vr0
+ bcnez fcc0, L(end)
L(loop):
- vst $vr0, a2, 0
- vld $vr0, a1, 16
+ vst vr0, a2, 0
+ vld vr0, a1, 16
addi.d a2, a2, 16
addi.d a1, a1, 16
- vsetanyeqz.b $fcc0, $vr0
- bceqz $fcc0, L(loop)
+ vsetanyeqz.b fcc0, vr0
+ bceqz fcc0, L(loop)
- vmsknz.b $vr1, $vr0
- movfr2gr.s t0, $f1
+ vmsknz.b vr1, vr0
+ movfr2gr.s t0, fa1
cto.w t0, t0
add.d a1, a1, t0
- vld $vr0, a1, -15
+ vld vr0, a1, -15
add.d a2, a2, t0
- vst $vr0, a2, -15
+ vst vr0, a2, -15
jr ra
L(end):
- vmsknz.b $vr1, $vr0
- movfr2gr.s t0, $f1
+ vmsknz.b vr1, vr0
+ movfr2gr.s t0, fa1
cto.w t0, t0
addi.d t0, t0, 1
@@ -83,7 +83,7 @@ L(end):
L(end_16):
andi t1, t0, 16
beqz t1, L(end_8)
- vst $vr0, a2, 0
+ vst vr0, a2, 0
jr ra
L(end_8):
@@ -93,74 +93,74 @@ L(end_8):
andi t5, t0, 1
beqz t2, L(end_4)
- vstelm.d $vr0, a2, 0, 0
+ vstelm.d vr0, a2, 0, 0
addi.d a2, a2, 8
- vbsrl.v $vr0, $vr0, 8
+ vbsrl.v vr0, vr0, 8
L(end_4):
beqz t3, L(end_2)
- vstelm.w $vr0, a2, 0, 0
+ vstelm.w vr0, a2, 0, 0
addi.d a2, a2, 4
- vbsrl.v $vr0, $vr0, 4
+ vbsrl.v vr0, vr0, 4
L(end_2):
beqz t4, L(end_1)
- vstelm.h $vr0, a2, 0, 0
+ vstelm.h vr0, a2, 0, 0
addi.d a2, a2, 2
- vbsrl.v $vr0, $vr0, 2
+ vbsrl.v vr0, vr0, 2
L(end_1):
beqz t5, L(out)
- vstelm.b $vr0, a2, 0, 0
+ vstelm.b vr0, a2, 0, 0
L(out):
jr ra
L(unaligned):
bstrins.d a1, zero, 3, 0
- vld $vr2, a1, 0
- vreplgr2vr.b $vr3, a3
- vslt.b $vr4, $vr1, $vr3
- vor.v $vr0, $vr2, $vr4
+ vld vr2, a1, 0
+ vreplgr2vr.b vr3, a3
+ vslt.b vr4, vr1, vr3
+ vor.v vr0, vr2, vr4
- vsetanyeqz.b $fcc0, $vr0
- bcnez $fcc0, L(un_first_end)
- vld $vr0, a1, 16
- vadd.b $vr3, $vr3, $vr1
+ vsetanyeqz.b fcc0, vr0
+ bcnez fcc0, L(un_first_end)
+ vld vr0, a1, 16
+ vadd.b vr3, vr3, vr1
addi.d a1, a1, 16
- vshuf.b $vr4, $vr0, $vr2, $vr3
- vsetanyeqz.b $fcc0, $vr0
- bcnez $fcc0, L(un_end)
+ vshuf.b vr4, vr0, vr2, vr3
+ vsetanyeqz.b fcc0, vr0
+ bcnez fcc0, L(un_end)
L(un_loop):
- vor.v $vr2, $vr0, $vr0
- vld $vr0, a1, 16
- vst $vr4, a2, 0
+ vor.v vr2, vr0, vr0
+ vld vr0, a1, 16
+ vst vr4, a2, 0
addi.d a1, a1, 16
addi.d a2, a2, 16
- vshuf.b $vr4, $vr0, $vr2, $vr3
- vsetanyeqz.b $fcc0, $vr0
- bceqz $fcc0, L(un_loop)
+ vshuf.b vr4, vr0, vr2, vr3
+ vsetanyeqz.b fcc0, vr0
+ bceqz fcc0, L(un_loop)
L(un_end):
- vsetanyeqz.b $fcc0, $vr4
- bcnez $fcc0, 1f
- vst $vr4, a2, 0
+ vsetanyeqz.b fcc0, vr4
+ bcnez fcc0, 1f
+ vst vr4, a2, 0
1:
- vmsknz.b $vr1, $vr0
+ vmsknz.b vr1, vr0
- movfr2gr.s t0, $f1
+ movfr2gr.s t0, fa1
cto.w t0, t0
add.d a1, a1, t0
- vld $vr0, a1, -15
+ vld vr0, a1, -15
add.d a2, a2, t0
sub.d a2, a2, a3
- vst $vr0, a2, 1
+ vst vr0, a2, 1
jr ra
L(un_first_end):
diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S
index fd6c002d..fc25dd50 100644
--- a/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S
+++ b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S
@@ -17,12 +17,12 @@ LEAF(STRLEN, 6)
move a1, a0
bstrins.d a0, zero, 4, 0
li.d t1, -1
- xvld $xr0, a0, 0
+ xvld xr0, a0, 0
- xvmsknz.b $xr0, $xr0
- xvpickve.w $xr1, $xr0, 4
- vilvl.h $vr0, $vr1, $vr0
- movfr2gr.s t0, $f0 # sign extend
+ xvmsknz.b xr0, xr0
+ xvpickve.w xr1, xr0, 4
+ vilvl.h vr0, vr1, vr0
+ movfr2gr.s t0, fa0 # sign extend
sra.w t0, t0, a1
beq t0, t1, L(loop)
@@ -30,18 +30,18 @@ LEAF(STRLEN, 6)
jr ra
L(loop):
- xvld $xr0, a0, 32
+ xvld xr0, a0, 32
addi.d a0, a0, 32
- xvsetanyeqz.b $fcc0, $xr0
- bceqz $fcc0, L(loop)
+ xvsetanyeqz.b fcc0, xr0
+ bceqz fcc0, L(loop)
- xvmsknz.b $xr0, $xr0
+ xvmsknz.b xr0, xr0
sub.d a0, a0, a1
- xvpickve.w $xr1, $xr0, 4
- vilvl.h $vr0, $vr1, $vr0
+ xvpickve.w xr1, xr0, 4
+ vilvl.h vr0, vr1, vr0
- movfr2gr.s t0, $f0
+ movfr2gr.s t0, fa0
cto.w t0, t0
add.d a0, a0, t0
jr ra
diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S
index 6f311506..45c3db93 100644
--- a/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S
+++ b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S
@@ -16,15 +16,15 @@
LEAF(STRLEN, 6)
move a1, a0
bstrins.d a0, zero, 4, 0
- vld $vr0, a0, 0
- vld $vr1, a0, 16
+ vld vr0, a0, 0
+ vld vr1, a0, 16
li.d t1, -1
- vmsknz.b $vr0, $vr0
- vmsknz.b $vr1, $vr1
- vilvl.h $vr0, $vr1, $vr0
+ vmsknz.b vr0, vr0
+ vmsknz.b vr1, vr1
+ vilvl.h vr0, vr1, vr0
- movfr2gr.s t0, $f0
+ movfr2gr.s t0, fa0
sra.w t0, t0, a1
beq t0, t1, L(loop)
cto.w a0, t0
@@ -36,19 +36,19 @@ LEAF(STRLEN, 6)
L(loop):
- vld $vr0, a0, 32
- vld $vr1, a0, 48
+ vld vr0, a0, 32
+ vld vr1, a0, 48
addi.d a0, a0, 32
- vmin.bu $vr2, $vr0, $vr1
+ vmin.bu vr2, vr0, vr1
- vsetanyeqz.b $fcc0, $vr2
- bceqz $fcc0, L(loop)
- vmsknz.b $vr0, $vr0
- vmsknz.b $vr1, $vr1
+ vsetanyeqz.b fcc0, vr2
+ bceqz fcc0, L(loop)
+ vmsknz.b vr0, vr0
+ vmsknz.b vr1, vr1
- vilvl.h $vr0, $vr1, $vr0
+ vilvl.h vr0, vr1, vr0
sub.d a0, a0, a1
- movfr2gr.s t0, $f0
+ movfr2gr.s t0, fa0
cto.w t0, t0
add.d a0, a0, t0
diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S
index 2c6f9614..21f3e689 100644
--- a/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S
+++ b/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S
@@ -22,7 +22,7 @@ ENTRY_NO_ALIGN(STRNCMP)
beqz a2, L(ret0)
pcaddi t0, -5
andi a3, a0, 0xf
- vld $vr2, t0, 0
+ vld vr2, t0, 0
andi a4, a1, 0xf
li.d t2, 16
@@ -30,57 +30,57 @@ ENTRY_NO_ALIGN(STRNCMP)
xor t0, a0, a3
xor t1, a1, a4
- vld $vr0, t0, 0
- vld $vr1, t1, 0
- vreplgr2vr.b $vr3, a3
+ vld vr0, t0, 0
+ vld vr1, t1, 0
+ vreplgr2vr.b vr3, a3
sub.d t2, t2, a3
- vadd.b $vr3, $vr3, $vr2
- vshuf.b $vr0, $vr3, $vr0, $vr3
- vshuf.b $vr1, $vr3, $vr1, $vr3
+ vadd.b vr3, vr3, vr2
+ vshuf.b vr0, vr3, vr0, vr3
+ vshuf.b vr1, vr3, vr1, vr3
- vseq.b $vr3, $vr0, $vr1
- vmin.bu $vr3, $vr0, $vr3
+ vseq.b vr3, vr0, vr1
+ vmin.bu vr3, vr0, vr3
bgeu t2, a2, L(al_early_end)
- vsetanyeqz.b $fcc0, $vr3
+ vsetanyeqz.b fcc0, vr3
- bcnez $fcc0, L(al_end)
+ bcnez fcc0, L(al_end)
add.d a3, a0, a2
addi.d a4, a3, -1
bstrins.d a4, zero, 3, 0
sub.d a2, a3, a4
L(al_loop):
- vld $vr0, t0, 16
- vld $vr1, t1, 16
+ vld vr0, t0, 16
+ vld vr1, t1, 16
addi.d t0, t0, 16
addi.d t1, t1, 16
- vseq.b $vr3, $vr0, $vr1
- vmin.bu $vr3, $vr0, $vr3
+ vseq.b vr3, vr0, vr1
+ vmin.bu vr3, vr0, vr3
beq t0, a4, L(al_early_end)
- vsetanyeqz.b $fcc0, $vr3
- bceqz $fcc0, L(al_loop)
+ vsetanyeqz.b fcc0, vr3
+ bceqz fcc0, L(al_loop)
L(al_end):
- vseqi.b $vr3, $vr3, 0
- vfrstpi.b $vr3, $vr3, 0
+ vseqi.b vr3, vr3, 0
+ vfrstpi.b vr3, vr3, 0
- vshuf.b $vr0, $vr0, $vr0, $vr3
- vshuf.b $vr1, $vr1, $vr1, $vr3
- vpickve2gr.bu t0, $vr0, 0
- vpickve2gr.bu t1, $vr1, 0
+ vshuf.b vr0, vr0, vr0, vr3
+ vshuf.b vr1, vr1, vr1, vr3
+ vpickve2gr.bu t0, vr0, 0
+ vpickve2gr.bu t1, vr1, 0
sub.d a0, t0, t1
jr ra
L(al_early_end):
- vreplgr2vr.b $vr4, a2
- vslt.b $vr4, $vr2, $vr4
+ vreplgr2vr.b vr4, a2
+ vslt.b vr4, vr2, vr4
- vorn.v $vr3, $vr3, $vr4
+ vorn.v vr3, vr3, vr4
b L(al_end)
L(unaligned):
slt a5, a3, a4
@@ -94,64 +94,64 @@ L(unaligned):
andi a4, a1, 0xf
xor t0, a0, a3
xor t1, a1, a4
- vld $vr0, t0, 0
+ vld vr0, t0, 0
- vld $vr3, t1, 0
+ vld vr3, t1, 0
sub.d t2, t2, a3
- vreplgr2vr.b $vr4, a3
- vreplgr2vr.b $vr5, a4
+ vreplgr2vr.b vr4, a3
+ vreplgr2vr.b vr5, a4
- vaddi.bu $vr6, $vr2, 16
- vsub.b $vr7, $vr4, $vr5
- vsub.b $vr6, $vr6, $vr7
- vadd.b $vr4, $vr2, $vr4
+ vaddi.bu vr6, vr2, 16
+ vsub.b vr7, vr4, vr5
+ vsub.b vr6, vr6, vr7
+ vadd.b vr4, vr2, vr4
- vshuf.b $vr1, $vr3, $vr3, $vr6
- vshuf.b $vr0, $vr7, $vr0, $vr4
- vshuf.b $vr1, $vr7, $vr1, $vr4
- vseq.b $vr4, $vr0, $vr1
+ vshuf.b vr1, vr3, vr3, vr6
+ vshuf.b vr0, vr7, vr0, vr4
+ vshuf.b vr1, vr7, vr1, vr4
+ vseq.b vr4, vr0, vr1
- vmin.bu $vr4, $vr0, $vr4
+ vmin.bu vr4, vr0, vr4
bgeu t2, a2, L(un_early_end)
- vsetanyeqz.b $fcc0, $vr4
- bcnez $fcc0, L(un_end)
+ vsetanyeqz.b fcc0, vr4
+ bcnez fcc0, L(un_end)
add.d a6, a0, a2
- vslt.b $vr5, $vr2, $vr5
+ vslt.b vr5, vr2, vr5
addi.d a7, a6, -1
- vor.v $vr3, $vr3, $vr5
+ vor.v vr3, vr3, vr5
bstrins.d a7, zero, 3, 0
sub.d a2, a6, a7
L(un_loop):
- vld $vr0, t0, 16
+ vld vr0, t0, 16
addi.d t0, t0, 16
- vsetanyeqz.b $fcc0, $vr3
- bcnez $fcc0, L(has_zero)
+ vsetanyeqz.b fcc0, vr3
+ bcnez fcc0, L(has_zero)
beq t0, a7, L(end_with_len)
- vor.v $vr1, $vr3, $vr3
+ vor.v vr1, vr3, vr3
- vld $vr3, t1, 16
+ vld vr3, t1, 16
addi.d t1, t1, 16
- vshuf.b $vr1, $vr3, $vr1, $vr6
- vseq.b $vr4, $vr0, $vr1
+ vshuf.b vr1, vr3, vr1, vr6
+ vseq.b vr4, vr0, vr1
- vmin.bu $vr4, $vr0, $vr4
- vsetanyeqz.b $fcc0, $vr4
- bceqz $fcc0, L(un_loop)
+ vmin.bu vr4, vr0, vr4
+ vsetanyeqz.b fcc0, vr4
+ bceqz fcc0, L(un_loop)
L(un_end):
- vseqi.b $vr4, $vr4, 0
+ vseqi.b vr4, vr4, 0
- vfrstpi.b $vr4, $vr4, 0
- vshuf.b $vr0, $vr0, $vr0, $vr4
- vshuf.b $vr1, $vr1, $vr1, $vr4
- vpickve2gr.bu t0, $vr0, 0
+ vfrstpi.b vr4, vr4, 0
+ vshuf.b vr0, vr0, vr0, vr4
+ vshuf.b vr1, vr1, vr1, vr4
+ vpickve2gr.bu t0, vr0, 0
- vpickve2gr.bu t1, $vr1, 0
+ vpickve2gr.bu t1, vr1, 0
sub.d t2, t0, t1
sub.d t3, t1, t0
masknez t0, t2, a5
@@ -160,30 +160,30 @@ L(un_end):
or a0, t0, t1
jr ra
L(has_zero):
- vshuf.b $vr1, $vr3, $vr3, $vr6
+ vshuf.b vr1, vr3, vr3, vr6
- vseq.b $vr4, $vr0, $vr1
- vmin.bu $vr4, $vr0, $vr4
+ vseq.b vr4, vr0, vr1
+ vmin.bu vr4, vr0, vr4
bne t0, a7, L(un_end)
L(un_early_end):
- vreplgr2vr.b $vr5, a2
+ vreplgr2vr.b vr5, a2
- vslt.b $vr5, $vr2, $vr5
- vorn.v $vr4, $vr4, $vr5
+ vslt.b vr5, vr2, vr5
+ vorn.v vr4, vr4, vr5
b L(un_end)
L(end_with_len):
sub.d a6, a3, a4
bgeu a6, a2, 1f
- vld $vr4, t1, 16
+ vld vr4, t1, 16
1:
- vshuf.b $vr1, $vr4, $vr3, $vr6
- vseq.b $vr4, $vr0, $vr1
+ vshuf.b vr1, vr4, vr3, vr6
+ vseq.b vr4, vr0, vr1
- vmin.bu $vr4, $vr0, $vr4
- vreplgr2vr.b $vr5, a2
- vslt.b $vr5, $vr2, $vr5
- vorn.v $vr4, $vr4, $vr5
+ vmin.bu vr4, vr0, vr4
+ vreplgr2vr.b vr5, a2
+ vslt.b vr5, vr2, vr5
+ vorn.v vr4, vr4, vr5
b L(un_end)
L(ret0):
diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S b/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S
index 910b52fe..6410a907 100644
--- a/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S
+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S
@@ -19,23 +19,23 @@ LEAF(STRNLEN, 6)
li.d t3, 65
sub.d a2, a0, t1
- xvld $xr0, a2, 0
- xvld $xr1, a2, 32
+ xvld xr0, a2, 0
+ xvld xr1, a2, 32
sub.d t1, t3, t1
move a3, a0
sltu t1, a1, t1
- xvmsknz.b $xr0, $xr0
- xvmsknz.b $xr1, $xr1
- xvpickve.w $xr2, $xr0, 4
+ xvmsknz.b xr0, xr0
+ xvmsknz.b xr1, xr1
+ xvpickve.w xr2, xr0, 4
- xvpickve.w $xr3, $xr1, 4
- vilvl.h $vr0, $vr2, $vr0
- vilvl.h $vr1, $vr3, $vr1
- vilvl.w $vr0, $vr1, $vr0
+ xvpickve.w xr3, xr1, 4
+ vilvl.h vr0, vr2, vr0
+ vilvl.h vr1, vr3, vr1
+ vilvl.w vr0, vr1, vr0
- movfr2gr.d t0, $f0
+ movfr2gr.d t0, fa0
sra.d t0, t0, a0
orn t1, t1, t0
bnez t1, L(end)
@@ -46,26 +46,26 @@ LEAF(STRNLEN, 6)
bstrins.d a4, zero, 5, 0
L(loop):
- xvld $xr0, a0, 64
- xvld $xr1, a0, 96
+ xvld xr0, a0, 64
+ xvld xr1, a0, 96
addi.d a0, a0, 64
beq a0, a4, L(out)
- xvmin.bu $xr2, $xr0, $xr1
- xvsetanyeqz.b $fcc0, $xr2
- bceqz $fcc0, L(loop)
+ xvmin.bu xr2, xr0, xr1
+ xvsetanyeqz.b fcc0, xr2
+ bceqz fcc0, L(loop)
L(out):
- xvmsknz.b $xr0, $xr0
+ xvmsknz.b xr0, xr0
- xvmsknz.b $xr1, $xr1
- xvpickve.w $xr2, $xr0, 4
- xvpickve.w $xr3, $xr1, 4
- vilvl.h $vr0, $vr2, $vr0
+ xvmsknz.b xr1, xr1
+ xvpickve.w xr2, xr0, 4
+ xvpickve.w xr3, xr1, 4
+ vilvl.h vr0, vr2, vr0
- vilvl.h $vr1, $vr3, $vr1
- vilvl.w $vr0, $vr1, $vr0
- movfr2gr.d t0, $f0
+ vilvl.h vr1, vr3, vr1
+ vilvl.w vr0, vr1, vr0
+ movfr2gr.d t0, fa0
L(end):
sub.d a0, a0, a3
diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S b/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S
index db0e90ff..9250a0cd 100644
--- a/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S
+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S
@@ -19,17 +19,17 @@ LEAF(STRNLEN, 6)
li.d t3, 33
sub.d a2, a0, t1
- vld $vr0, a2, 0
- vld $vr1, a2, 16
+ vld vr0, a2, 0
+ vld vr1, a2, 16
sub.d t1, t3, t1
move a3, a0
sltu t1, a1, t1
- vmsknz.b $vr0, $vr0
- vmsknz.b $vr1, $vr1
- vilvl.h $vr0, $vr1, $vr0
+ vmsknz.b vr0, vr0
+ vmsknz.b vr1, vr1
+ vilvl.h vr0, vr1, vr0
- movfr2gr.s t0, $f0
+ movfr2gr.s t0, fa0
sra.w t0, t0, a0
orn t1, t1, t0
bnez t1, L(end)
@@ -41,20 +41,20 @@ LEAF(STRNLEN, 6)
bstrins.d a4, zero, 4, 0
L(loop):
- vld $vr0, a0, 32
- vld $vr1, a0, 48
+ vld vr0, a0, 32
+ vld vr1, a0, 48
addi.d a0, a0, 32
beq a0, a4, L(out)
- vmin.bu $vr2, $vr0, $vr1
- vsetanyeqz.b $fcc0, $vr2
- bceqz $fcc0, L(loop)
+ vmin.bu vr2, vr0, vr1
+ vsetanyeqz.b fcc0, vr2
+ bceqz fcc0, L(loop)
L(out):
- vmsknz.b $vr0, $vr0
+ vmsknz.b vr0, vr0
- vmsknz.b $vr1, $vr1
- vilvl.h $vr0, $vr1, $vr0
- movfr2gr.s t0, $f0
+ vmsknz.b vr1, vr1
+ vilvl.h vr0, vr1, vr0
+ movfr2gr.s t0, fa0
L(end):
sub.d a0, a0, a3
diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S
index 325458ff..990be973 100644
--- a/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S
+++ b/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S
@@ -14,45 +14,45 @@
LEAF(STRRCHR, 6)
andi t1, a0, 0x3f
bstrins.d a0, zero, 5, 0
- xvld $xr0, a0, 0
- xvld $xr1, a0, 32
+ xvld xr0, a0, 0
+ xvld xr1, a0, 32
li.d t2, -1
- xvreplgr2vr.b $xr4, a1
+ xvreplgr2vr.b xr4, a1
move a2, zero
sll.d t3, t2, t1
addi.d a0, a0, 63
- xvseq.b $xr2, $xr0, $xr4
- xvseq.b $xr3, $xr1, $xr4
- xvmsknz.b $xr0, $xr0
+ xvseq.b xr2, xr0, xr4
+ xvseq.b xr3, xr1, xr4
+ xvmsknz.b xr0, xr0
- xvmsknz.b $xr1, $xr1
- xvpickve.w $xr5, $xr0, 4
- xvpickve.w $xr6, $xr1, 4
- vilvl.h $vr0, $vr5, $vr0
+ xvmsknz.b xr1, xr1
+ xvpickve.w xr5, xr0, 4
+ xvpickve.w xr6, xr1, 4
+ vilvl.h vr0, vr5, vr0
- vilvl.h $vr1, $vr6, $vr1
- xvmsknz.b $xr2, $xr2
- xvmsknz.b $xr3, $xr3
- xvpickve.w $xr5, $xr2, 4
+ vilvl.h vr1, vr6, vr1
+ xvmsknz.b xr2, xr2
+ xvmsknz.b xr3, xr3
+ xvpickve.w xr5, xr2, 4
- xvpickve.w $xr6, $xr3, 4
- vilvl.h $vr2, $vr5, $vr2
- vilvl.h $vr3, $vr6, $vr3
- vilvl.w $vr0, $vr1, $vr0
+ xvpickve.w xr6, xr3, 4
+ vilvl.h vr2, vr5, vr2
+ vilvl.h vr3, vr6, vr3
+ vilvl.w vr0, vr1, vr0
- vilvl.w $vr1, $vr3, $vr2
- movfr2gr.d t0, $f0
- movfr2gr.d t1, $f1
+ vilvl.w vr1, vr3, vr2
+ movfr2gr.d t0, fa0
+ movfr2gr.d t1, fa1
orn t0, t0, t3
and t1, t1, t3
bne t0, t2, L(end)
L(loop):
- xvld $xr0, a0, 1
- xvld $xr1, a0, 33
+ xvld xr0, a0, 1
+ xvld xr1, a0, 33
clz.d t0, t1
@@ -62,33 +62,33 @@ L(loop):
masknez t1, a2, t1
or a2, t0, t1
- xvseq.b $xr2, $xr0, $xr4
- xvseq.b $xr3, $xr1, $xr4
+ xvseq.b xr2, xr0, xr4
+ xvseq.b xr3, xr1, xr4
- xvmsknz.b $xr2, $xr2
- xvmsknz.b $xr3, $xr3
- xvpickve.w $xr5, $xr2, 4
- xvpickve.w $xr6, $xr3, 4
+ xvmsknz.b xr2, xr2
+ xvmsknz.b xr3, xr3
+ xvpickve.w xr5, xr2, 4
+ xvpickve.w xr6, xr3, 4
- vilvl.h $vr2, $vr5, $vr2
- vilvl.h $vr3, $vr6, $vr3
- xvmin.bu $xr5, $xr0, $xr1
- vilvl.w $vr2, $vr3, $vr2
+ vilvl.h vr2, vr5, vr2
+ vilvl.h vr3, vr6, vr3
+ xvmin.bu xr5, xr0, xr1
+ vilvl.w vr2, vr3, vr2
- xvsetanyeqz.b $fcc0, $xr5
- movfr2gr.d t1, $f2
- bceqz $fcc0, L(loop)
- xvmsknz.b $xr0, $xr0
+ xvsetanyeqz.b fcc0, xr5
+ movfr2gr.d t1, fa2
+ bceqz fcc0, L(loop)
+ xvmsknz.b xr0, xr0
- xvmsknz.b $xr1, $xr1
- xvpickve.w $xr5, $xr0, 4
- xvpickve.w $xr6, $xr1, 4
- vilvl.h $vr0, $vr5, $vr0
+ xvmsknz.b xr1, xr1
+ xvpickve.w xr5, xr0, 4
+ xvpickve.w xr6, xr1, 4
+ vilvl.h vr0, vr5, vr0
- vilvl.h $vr1, $vr6, $vr1
- vilvl.w $vr0, $vr1, $vr0
- movfr2gr.d t0, $f0
+ vilvl.h vr1, vr6, vr1
+ vilvl.w vr0, vr1, vr0
+ movfr2gr.d t0, fa0
L(end):
slli.d t3, t2, 1 # shift one more for the last '\0'
diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S
index e082eaab..6aede6ae 100644
--- a/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S
+++ b/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S
@@ -14,35 +14,35 @@
LEAF(STRRCHR, 6)
andi t1, a0, 0x1f
bstrins.d a0, zero, 4, 0
- vld $vr0, a0, 0
- vld $vr1, a0, 16
+ vld vr0, a0, 0
+ vld vr1, a0, 16
- vreplgr2vr.b $vr4, a1
+ vreplgr2vr.b vr4, a1
li.d t2, -1
move a2, zero
addi.d a0, a0, 31
- vseq.b $vr2, $vr0, $vr4
- vseq.b $vr3, $vr1, $vr4
- vmsknz.b $vr0, $vr0
- vmsknz.b $vr1, $vr1
+ vseq.b vr2, vr0, vr4
+ vseq.b vr3, vr1, vr4
+ vmsknz.b vr0, vr0
+ vmsknz.b vr1, vr1
- vmsknz.b $vr2, $vr2
- vmsknz.b $vr3, $vr3
- vilvl.h $vr0, $vr1, $vr0
- vilvl.h $vr1, $vr3, $vr2
+ vmsknz.b vr2, vr2
+ vmsknz.b vr3, vr3
+ vilvl.h vr0, vr1, vr0
+ vilvl.h vr1, vr3, vr2
- movfr2gr.s t0, $f0
+ movfr2gr.s t0, fa0
sll.d t3, t2, t1
- movfr2gr.s t1, $f1
+ movfr2gr.s t1, fa1
orn t0, t0, t3
and t1, t1, t3
bne t0, t2, L(end)
L(loop):
- vld $vr0, a0, 1
- vld $vr1, a0, 17
+ vld vr0, a0, 1
+ vld vr1, a0, 17
clz.w t0, t1
sub.d t0, a0, t0
@@ -51,23 +51,23 @@ L(loop):
masknez t1, a2, t1
or a2, t0, t1
- vseq.b $vr2, $vr0, $vr4
- vseq.b $vr3, $vr1, $vr4
+ vseq.b vr2, vr0, vr4
+ vseq.b vr3, vr1, vr4
- vmsknz.b $vr2, $vr2
- vmsknz.b $vr3, $vr3
- vmin.bu $vr5, $vr0, $vr1
- vilvl.h $vr2, $vr3, $vr2
+ vmsknz.b vr2, vr2
+ vmsknz.b vr3, vr3
+ vmin.bu vr5, vr0, vr1
+ vilvl.h vr2, vr3, vr2
- vsetanyeqz.b $fcc0, $vr5
- movfr2gr.s t1, $f2
- bceqz $fcc0, L(loop)
- vmsknz.b $vr0, $vr0
+ vsetanyeqz.b fcc0, vr5
+ movfr2gr.s t1, fa2
+ bceqz fcc0, L(loop)
+ vmsknz.b vr0, vr0
- vmsknz.b $vr1, $vr1
- vilvl.h $vr0, $vr1, $vr0
- movfr2gr.s t0, $f0
+ vmsknz.b vr1, vr1
+ vilvl.h vr0, vr1, vr0
+ movfr2gr.s t0, fa0
L(end):
slli.d t3, t2, 1 # shift one more for the last '\0'
diff --git a/sysdeps/loongarch/lp64/s_cosf.S b/sysdeps/loongarch/lp64/s_cosf.S
index 9fcbe6ca..cb3a4faa 100644
--- a/sysdeps/loongarch/lp64/s_cosf.S
+++ b/sysdeps/loongarch/lp64/s_cosf.S
@@ -213,9 +213,9 @@ L_even_integer:
fadd.d fa0, fa0, fa1
fadd.d fa2, fa2, fa3
fadd.d fa0, fa0, fa2
- fcmp.sle.d $fcc0, fa0, fa5
+ fcmp.sle.d fcc0, fa0, fa5
addi.d t0, t0, 3
- bcnez $fcc0, L_leq_one
+ bcnez fcc0, L_leq_one
/*L_gt_one:*/
fld.d fa2, t1, 16 /* 2.0 */
addi.d t0, t0, 1
diff --git a/sysdeps/loongarch/lp64/s_sinf.S b/sysdeps/loongarch/lp64/s_sinf.S
index 45d1c4b5..1e77282d 100644
--- a/sysdeps/loongarch/lp64/s_sinf.S
+++ b/sysdeps/loongarch/lp64/s_sinf.S
@@ -215,9 +215,9 @@ L_even_integer:
fadd.d fa0, fa0, fa1
fadd.d fa2, fa2, fa3
fadd.d fa0, fa0, fa2
- fcmp.sle.d $fcc0, fa0, fa5
+ fcmp.sle.d fcc0, fa0, fa5
addi.d t0, t0, 1
- bcnez $fcc0, L_leq_one
+ bcnez fcc0, L_leq_one
/*L_gt_one:*/
fld.d fa2, t1, 16 /* 2.0 */
addi.d t0, t0, 1
diff --git a/sysdeps/loongarch/sys/regdef.h b/sysdeps/loongarch/sys/regdef.h
index 36f00939..b5ee57cf 100644
--- a/sysdeps/loongarch/sys/regdef.h
+++ b/sysdeps/loongarch/sys/regdef.h
@@ -71,6 +71,14 @@
# define fs5 $f29
# define fs6 $f30
# define fs7 $f31
+# define fcc0 $fcc0
+# define fcc1 $fcc1
+# define fcc2 $fcc2
+# define fcc3 $fcc3
+# define fcc4 $fcc4
+# define fcc5 $fcc5
+# define fcc6 $fcc6
+# define fcc7 $fcc7
#elif _LOONGARCH_SIM == _ABILP32
# error ABILP32 not support yet
@@ -78,4 +86,70 @@
# error noABI
#endif
+#define vr0 $vr0
+#define vr1 $vr1
+#define vr2 $vr2
+#define vr3 $vr3
+#define vr4 $vr4
+#define vr5 $vr5
+#define vr6 $vr6
+#define vr7 $vr7
+#define vr8 $vr8
+#define vr9 $vr9
+#define vr10 $vr10
+#define vr11 $vr11
+#define vr12 $vr12
+#define vr13 $vr13
+#define vr14 $vr14
+#define vr15 $vr15
+#define vr16 $vr16
+#define vr17 $vr17
+#define vr18 $vr18
+#define vr19 $vr19
+#define vr20 $vr20
+#define vr21 $vr21
+#define vr22 $vr22
+#define vr23 $vr23
+#define vr24 $vr24
+#define vr25 $vr25
+#define vr26 $vr26
+#define vr27 $vr27
+#define vr28 $vr28
+#define vr29 $vr29
+#define vr30 $vr30
+#define vr31 $vr31
+
+#define xr0 $xr0
+#define xr1 $xr1
+#define xr2 $xr2
+#define xr3 $xr3
+#define xr4 $xr4
+#define xr5 $xr5
+#define xr6 $xr6
+#define xr7 $xr7
+#define xr8 $xr8
+#define xr9 $xr9
+#define xr10 $xr10
+#define xr11 $xr11
+#define xr12 $xr12
+#define xr13 $xr13
+#define xr14 $xr14
+#define xr15 $xr15
+#define xr16 $xr16
+#define xr17 $xr17
+#define xr18 $xr18
+#define xr19 $xr19
+#define xr20 $xr20
+#define xr21 $xr21
+#define xr22 $xr22
+#define xr23 $xr23
+#define xr24 $xr24
+#define xr25 $xr25
+#define xr26 $xr26
+#define xr27 $xr27
+#define xr28 $xr28
+#define xr29 $xr29
+#define xr30 $xr30
+#define xr31 $xr31
+
#endif /* _SYS_REGDEF_H */
--
2.33.0