anolis-glibc/glibc-2.28-Refactor-code-of-raw-mem-functions.patch
ticat_fp d91eae1237 LoongArch: Sync loongarch64 code to lnd.36
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
2024-05-29 10:24:08 +08:00

From 4879bd4e0aff7d884d9b026b6081a0e8cffc491c Mon Sep 17 00:00:00 2001
From: caiyinyu <caiyinyu@loongson.cn>
Date: Wed, 21 Jun 2023 09:30:54 +0800
Subject: [PATCH 06/14] glibc-2.28: Refactor code of {raw,}mem* functions.

Change-Id: Icafaf6bc8216f48be64cf25a40b9fe28ce127914
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
sysdeps/loongarch/lp64/memchr.S | 92 --
sysdeps/loongarch/lp64/memcmp.S | 280 ------
sysdeps/loongarch/lp64/memcpy.S | 804 ------------------
sysdeps/loongarch/lp64/memmove.S | 2 -
sysdeps/loongarch/lp64/memset.S | 166 ----
.../loongarch/lp64/multiarch/memchr-aligned.S | 91 +-
.../loongarch/lp64/multiarch/memcmp-aligned.S | 282 +++++-
.../loongarch/lp64/multiarch/memcpy-aligned.S | 799 ++++++++++++++++-
.../loongarch/lp64/multiarch/memset-aligned.S | 166 +++-
.../lp64/multiarch/rawmemchr-aligned.S | 110 ++-
sysdeps/loongarch/lp64/rawmemchr.S | 113 ---
11 files changed, 1438 insertions(+), 1467 deletions(-)
delete mode 100644 sysdeps/loongarch/lp64/memchr.S
delete mode 100644 sysdeps/loongarch/lp64/memcmp.S
delete mode 100644 sysdeps/loongarch/lp64/memcpy.S
delete mode 100644 sysdeps/loongarch/lp64/memmove.S
delete mode 100644 sysdeps/loongarch/lp64/memset.S
delete mode 100644 sysdeps/loongarch/lp64/rawmemchr.S
diff --git a/sysdeps/loongarch/lp64/memchr.S b/sysdeps/loongarch/lp64/memchr.S
deleted file mode 100644
index 23f1fd13..00000000
--- a/sysdeps/loongarch/lp64/memchr.S
+++ /dev/null
@@ -1,92 +0,0 @@
-#ifdef _LIBC
-#include <sysdep.h>
-#include <sys/regdef.h>
-#include <sys/asm.h>
-#else
-#include <sys/asm.h>
-#include <sys/regdef.h>
-#endif
-
-#ifndef MEMCHR_NAME
-#define MEMCHR_NAME memchr
-#endif
-
-LEAF(MEMCHR_NAME, 6)
- beqz a2, L(out)
- andi t1, a0, 0x7
- lu12i.w a3, 0x01010
- sub.d a5, a0, t1
-
- bstrins.d a1, a1, 15, 8
- ld.d t0, a5, 0
- slli.d t2, t1, 3
- ori a3, a3, 0x101
-
- bstrins.d a1, a1, 31, 16
- li.w t7, -1
- li.w t8, 9
- bstrins.d a3, a3, 63, 32
-
- srl.d t3, t7, t2
- bstrins.d a1, a1, 63, 32
- sub.d t4, t8, t1
- orn t3, a1, t3
-
- srl.d t0, t0, t2
- slli.d a4, a3, 7 # 0x8080808080808080
- sltu t4, a2, t4
- xor t2, t0, t3
-
- sub.d a6, t2, a3
- andn a7, a4, t2
- and t2, a6, a7
- or t3, t2, t4
-
- bnez t3, L(count_pos)
- addi.d a2, a2, -8
- addi.d a0, a5, 8
- add.d a2, a2, t1
-
-L(loop):
- ld.d t0, a0, 0
- sltui t4, a2, 9
- xor t2, t0, a1
- sub.d a6, t2, a3
-
- andn a7, a4, t2
- and t2, a6, a7
- or t3, t2, t4
- bnez t3, L(count_pos)
-
- ld.d t1, a0, 8
- addi.d a0, a0, 16
- sltui t4, a2, 17
- xor t2, t1, a1
-
- sub.d a6, t2, a3
- andn a7, a4, t2
- and t2, a6, a7
- addi.d a2, a2, -16
-
- or t3, t2, t4
- beqz t3, L(loop)
- addi.d a0, a0, -8
- addi.d a2, a2, 8
-
-L(count_pos):
- ctz.d t0, t2
- srli.d t0, t0, 3
- sltu t1, t0, a2
- add.d a0, a0, t0
-
- maskeqz a0, a0, t1
- jr ra
-
-L(out):
- move a0, zero
- jr ra
-END(MEMCHR_NAME)
-
-#ifdef _LIBC
-libc_hidden_builtin_def (MEMCHR_NAME)
-#endif
diff --git a/sysdeps/loongarch/lp64/memcmp.S b/sysdeps/loongarch/lp64/memcmp.S
deleted file mode 100644
index 457a4dc7..00000000
--- a/sysdeps/loongarch/lp64/memcmp.S
+++ /dev/null
@@ -1,280 +0,0 @@
-#ifdef _LIBC
-#include <sysdep.h>
-#include <sys/regdef.h>
-#include <sys/asm.h>
-#else
-#include <sys/asm.h>
-#include <sys/regdef.h>
-#endif
-
-#ifndef MEMCMP_NAME
-#define MEMCMP_NAME memcmp
-#endif
-
-LEAF(MEMCMP_NAME, 6)
- beqz a2, L(ret)
- andi a4, a1, 0x7
- andi a3, a0, 0x7
- sltu a5, a4, a3
-
- xor t0, a0, a1
- li.w t8, 8
- maskeqz t0, t0, a5
- li.w t7, -1
-
- xor a0, a0, t0 // a0 hold smaller one
- xor a1, a1, t0 // a1 hold larger one
- andi a3, a0, 0x7 // a3 hold small offset
- andi a4, a1, 0x7 // a4 hold larger offset
-
- xor a0, a0, a3
- xor a1, a1, a4
- ld.d t2, a0, 0 // t2 = "fedcbaXX"
- ld.d t1, a1, 0 // t1 = "54321YYY"
-
- slli.d t3, a3, 3
- slli.d t4, a4, 3
- sub.d a6, t3, t4 // a6 = 0xfffffffffffffff8
- srl.d t1, t1, t4 // t1 = "00054321"
-
- srl.d t0, t2, t3 // t0 = "00fedcba"
- srl.d t5, t7, t4 // t5 = 0x000000FFFFFFFFFF
- sub.d t6, t0, t1 // t6 hold diff
- and t6, t6, t5 // t6 = "000xxxxx"
-
- sub.d t5, t8, a4 // t5 hold margin 8 - 3 = 5
- bnez t6, L(first_out)
- bgeu t5, a2, L(ret)
- sub.d a2, a2, t5
-
- bnez a6, L(unaligned)
- blt a2, t8, L(al_less_8bytes)
- andi t1, a2, 31
- beq t1, a2, L(al_less_32bytes)
-
- sub.d t2, a2, t1
- add.d a4, a0, t2
- move a2, t1
-
-L(al_loop):
- ld.d t0, a0, 8
-
- ld.d t1, a1, 8
- ld.d t2, a0, 16
- ld.d t3, a1, 16
- ld.d t4, a0, 24
-
- ld.d t5, a1, 24
- ld.d t6, a0, 32
- ld.d t7, a1, 32
- addi.d a0, a0, 32
-
- addi.d a1, a1, 32
- bne t0, t1, L(out1)
- bne t2, t3, L(out2)
- bne t4, t5, L(out3)
-
- bne t6, t7, L(out4)
- bne a0, a4, L(al_loop)
-
-L(al_less_32bytes):
- srai.d a4, a2, 4
- beqz a4, L(al_less_16bytes)
-
- ld.d t0, a0, 8
- ld.d t1, a1, 8
- ld.d t2, a0, 16
- ld.d t3, a1, 16
-
- addi.d a0, a0, 16
- addi.d a1, a1, 16
- addi.d a2, a2, -16
- bne t0, t1, L(out1)
-
- bne t2, t3, L(out2)
-
-L(al_less_16bytes):
- srai.d a4, a2, 3
- beqz a4, L(al_less_8bytes)
- ld.d t0, a0, 8
-
- ld.d t1, a1, 8
- addi.d a0, a0, 8
- addi.d a1, a1, 8
- addi.d a2, a2, -8
-
- bne t0, t1, L(out1)
-
-L(al_less_8bytes):
- beqz a2, L(ret)
- ld.d t0, a0, 8
- ld.d t1, a1, 8
-
- li.d t7, -1
- slli.d t2, a2, 3
- sll.d t2, t7, t2
- sub.d t3, t0, t1
-
- andn t6, t3, t2
- bnez t6, L(count_diff)
-
-L(ret):
- move a0, zero
- jr ra
-
-L(out4):
- move t0, t6
- move t1, t7
- sub.d t6, t6, t7
- b L(count_diff)
-
-L(out3):
- move t0, t4
- move t1, t5
- sub.d t6, t4, t5
- b L(count_diff)
-
-L(out2):
- move t0, t2
- move t1, t3
-L(out1):
- sub.d t6, t0, t1
- b L(count_diff)
-
-L(first_out):
- slli.d t4, a2, 3
- slt t3, a2, t5
- sll.d t4, t7, t4
- maskeqz t4, t4, t3
-
- andn t6, t6, t4
-
-L(count_diff):
- ctz.d t2, t6
- bstrins.d t2, zero, 2, 0
- srl.d t0, t0, t2
-
- srl.d t1, t1, t2
- andi t0, t0, 0xff
- andi t1, t1, 0xff
- sub.d t2, t0, t1
-
- sub.d t3, t1, t0
- masknez t2, t2, a5
- maskeqz t3, t3, a5
- or a0, t2, t3
-
- jr ra
-
-L(unaligned):
- sub.d a7, zero, a6
- srl.d t0, t2, a6
- blt a2, t8, L(un_less_8bytes)
-
- andi t1, a2, 31
- beq t1, a2, L(un_less_32bytes)
- sub.d t2, a2, t1
- add.d a4, a0, t2
-
- move a2, t1
-
-L(un_loop):
- ld.d t2, a0, 8
- ld.d t1, a1, 8
- ld.d t4, a0, 16
-
- ld.d t3, a1, 16
- ld.d t6, a0, 24
- ld.d t5, a1, 24
- ld.d t8, a0, 32
-
- ld.d t7, a1, 32
- addi.d a0, a0, 32
- addi.d a1, a1, 32
- sll.d a3, t2, a7
-
- or t0, a3, t0
- bne t0, t1, L(out1)
- srl.d t0, t2, a6
- sll.d a3, t4, a7
-
- or t2, a3, t0
- bne t2, t3, L(out2)
- srl.d t0, t4, a6
- sll.d a3, t6, a7
-
- or t4, a3, t0
- bne t4, t5, L(out3)
- srl.d t0, t6, a6
- sll.d a3, t8, a7
-
- or t6, t0, a3
- bne t6, t7, L(out4)
- srl.d t0, t8, a6
- bne a0, a4, L(un_loop)
-
-L(un_less_32bytes):
- srai.d a4, a2, 4
- beqz a4, L(un_less_16bytes)
- ld.d t2, a0, 8
- ld.d t1, a1, 8
-
- ld.d t4, a0, 16
- ld.d t3, a1, 16
- addi.d a0, a0, 16
- addi.d a1, a1, 16
-
- addi.d a2, a2, -16
- sll.d a3, t2, a7
- or t0, a3, t0
- bne t0, t1, L(out1)
-
- srl.d t0, t2, a6
- sll.d a3, t4, a7
- or t2, a3, t0
- bne t2, t3, L(out2)
-
- srl.d t0, t4, a6
-
-L(un_less_16bytes):
- srai.d a4, a2, 3
- beqz a4, L(un_less_8bytes)
- ld.d t2, a0, 8
-
- ld.d t1, a1, 8
- addi.d a0, a0, 8
- addi.d a1, a1, 8
- addi.d a2, a2, -8
-
- sll.d a3, t2, a7
- or t0, a3, t0
- bne t0, t1, L(out1)
- srl.d t0, t2, a6
-
-L(un_less_8bytes):
- beqz a2, L(ret)
- andi a7, a7, 63
- slli.d a4, a2, 3
- bgeu a7, a4, L(last_cmp)
-
- ld.d t2, a0, 8
- sll.d a3, t2, a7
- or t0, a3, t0
-
-L(last_cmp):
- ld.d t1, a1, 8
-
- li.d t7, -1
- sll.d t2, t7, a4
- sub.d t3, t0, t1
- andn t6, t3, t2
-
- bnez t6, L(count_diff)
- move a0, zero
- jr ra
-
-END(MEMCMP_NAME)
-
-#ifdef _LIBC
-libc_hidden_builtin_def (MEMCMP_NAME)
-#endif
diff --git a/sysdeps/loongarch/lp64/memcpy.S b/sysdeps/loongarch/lp64/memcpy.S
deleted file mode 100644
index 4791e1a4..00000000
--- a/sysdeps/loongarch/lp64/memcpy.S
+++ /dev/null
@@ -1,804 +0,0 @@
-#ifdef _LIBC
-#include <sysdep.h>
-#include <sys/regdef.h>
-#include <sys/asm.h>
-#else
-#include <regdef.h>
-#include <sys/asm.h>
-#endif
-
-#ifndef MEMCPY_NAME
-#define MEMCPY_NAME memcpy
-#endif
-
-#ifndef MEMMOVE_NAME
-#define MEMMOVE_NAME memmove
-#endif
-
-#define LD_64(reg, n) \
- ld.d t0, reg, n; \
- ld.d t1, reg, n+8; \
- ld.d t2, reg, n+16; \
- ld.d t3, reg, n+24; \
- ld.d t4, reg, n+32; \
- ld.d t5, reg, n+40; \
- ld.d t6, reg, n+48; \
- ld.d t7, reg, n+56;
-
-#define ST_64(reg, n) \
- st.d t0, reg, n; \
- st.d t1, reg, n+8; \
- st.d t2, reg, n+16; \
- st.d t3, reg, n+24; \
- st.d t4, reg, n+32; \
- st.d t5, reg, n+40; \
- st.d t6, reg, n+48; \
- st.d t7, reg, n+56;
-
-LEAF(MEMMOVE_NAME, 6)
- sub.d t0, a0, a1
- bltu t0, a2, L(copy_back)
-
-END(MEMMOVE_NAME)
-
-#ifdef _LIBC
-libc_hidden_builtin_def (MEMMOVE_NAME)
-#endif
-
-LEAF_NO_ALIGN(MEMCPY_NAME)
-
- srai.d a3, a2, 4
- beqz a3, L(short_data) # less than 16 bytes
-
- move a4, a0
- andi a5, a0, 0x7
- andi a6, a1, 0x7
- li.d t8, 8
- beqz a5, L(check_align)
-
- # make dest aligned 8 bytes
- sub.d t2, t8, a5
- sub.d a2, a2, t2
-
- pcaddi t1, 20
- slli.d t3, t2, 3
- add.d a1, a1, t2
- sub.d t1, t1, t3
- add.d a4, a4, t2
- jr t1
-
-L(al7):
- ld.b t0, a1, -7
- st.b t0, a4, -7
-L(al6):
- ld.b t0, a1, -6
- st.b t0, a4, -6
-L(al5):
- ld.b t0, a1, -5
- st.b t0, a4, -5
-L(al4):
- ld.b t0, a1, -4
- st.b t0, a4, -4
-L(al3):
- ld.b t0, a1, -3
- st.b t0, a4, -3
-L(al2):
- ld.b t0, a1, -2
- st.b t0, a4, -2
-L(al1):
- ld.b t0, a1, -1
- st.b t0, a4, -1
-
-L(check_align):
- bne a5, a6, L(unalign)
-
- srai.d a3, a2, 4
- beqz a3, L(al_less_16bytes)
-
- andi a3, a2, 0x3f
- beq a3, a2, L(al_less_64bytes)
-
- sub.d t0, a2, a3
- move a2, a3
- add.d a5, a1, t0
-
-L(loop_64bytes):
- LD_64(a1, 0)
- addi.d a1, a1, 64
- ST_64(a4, 0)
-
- addi.d a4, a4, 64
- bne a1, a5, L(loop_64bytes)
-
-L(al_less_64bytes):
- srai.d a3, a2, 5
- beqz a3, L(al_less_32bytes)
-
- ld.d t0, a1, 0
- ld.d t1, a1, 8
- ld.d t2, a1, 16
- ld.d t3, a1, 24
-
- addi.d a1, a1, 32
- addi.d a2, a2, -32
-
- st.d t0, a4, 0
- st.d t1, a4, 8
- st.d t2, a4, 16
- st.d t3, a4, 24
-
- addi.d a4, a4, 32
-
-L(al_less_32bytes):
- srai.d a3, a2, 4
- beqz a3, L(al_less_16bytes)
-
- ld.d t0, a1, 0
- ld.d t1, a1, 8
- addi.d a1, a1, 16
- addi.d a2, a2, -16
-
- st.d t0, a4, 0
- st.d t1, a4, 8
- addi.d a4, a4, 16
-
-L(al_less_16bytes):
- srai.d a3, a2, 3
- beqz a3, L(al_less_8bytes)
-
- ld.d t0, a1, 0
- addi.d a1, a1, 8
- addi.d a2, a2, -8
-
- st.d t0, a4, 0
- addi.d a4, a4, 8
-
-L(al_less_8bytes):
- srai.d a3, a2, 2
- beqz a3, L(al_less_4bytes)
-
- ld.w t0, a1, 0
- addi.d a1, a1, 4
- addi.d a2, a2, -4
-
- st.w t0, a4, 0
- addi.d a4, a4, 4
-
-L(al_less_4bytes):
- srai.d a3, a2, 1
- beqz a3, L(al_less_2bytes)
-
- ld.h t0, a1, 0
- addi.d a1, a1, 2
- addi.d a2, a2, -2
-
- st.h t0, a4, 0
- addi.d a4, a4, 2
-
-L(al_less_2bytes):
- beqz a2, L(al_less_1byte)
-
- ld.b t0, a1, 0
- st.b t0, a4, 0
-
-L(al_less_1byte):
- jr ra
-
-L(unalign):
- andi a5, a1, 0x7
- bstrins.d a1, zero, 2, 0 # make src 8 bytes aligned
-
- sub.d t8, t8, a5 # use t8 to save count of bytes for aligning
- slli.d a5, a5, 3
-
- ld.d t0, a1, 0
- addi.d a1, a1, 8
-
- slli.d a6, t8, 3
- srl.d a7, t0, a5
-
- srai.d a3, a2, 4
- beqz a3, L(un_less_16bytes)
-
- andi a3, a2, 0x3f
- beq a3, a2, L(un_less_64bytes)
-
- sub.d t0, a2, a3
- move a2, a3
- add.d a3, a1, t0
-
-# a5 shift right num
-# a6 shift left num
-# a7 remaining part
-L(un_long_bytes):
- ld.d t0, a1, 0
- ld.d t1, a1, 8
- ld.d t2, a1, 16
- ld.d t3, a1, 24
-
- srl.d t4, t0, a5
- sll.d t0, t0, a6
-
- srl.d t5, t1, a5
- sll.d t1, t1, a6
-
- srl.d t6, t2, a5
- sll.d t2, t2, a6
-
- srl.d t7, t3, a5
- sll.d t3, t3, a6
-
- or t0, a7, t0
- or t1, t4, t1
- or t2, t5, t2
- or t3, t6, t3
-
- ld.d t4, a1, 32
- ld.d t5, a1, 40
- ld.d t6, a1, 48
- ld.d a7, a1, 56
-
- st.d t0, a4, 0
- st.d t1, a4, 8
- st.d t2, a4, 16
- st.d t3, a4, 24
-
- addi.d a1, a1, 64
-
- srl.d t0, t4, a5
- sll.d t4, t4, a6
-
- srl.d t1, t5, a5
- sll.d t5, t5, a6
-
- srl.d t2, t6, a5
- sll.d t6, t6, a6
-
- sll.d t3, a7, a6
- srl.d a7, a7, a5
-
- or t4, t7, t4
- or t5, t0, t5
- or t6, t1, t6
- or t3, t2, t3
-
- st.d t4, a4, 32
- st.d t5, a4, 40
- st.d t6, a4, 48
- st.d t3, a4, 56
-
- addi.d a4, a4, 64
- bne a3, a1, L(un_long_bytes)
-
-L(un_less_64bytes):
- srai.d a3, a2, 5
- beqz a3, L(un_less_32bytes)
-
- ld.d t0, a1, 0
- ld.d t1, a1, 8
- ld.d t2, a1, 16
- ld.d t3, a1, 24
-
- addi.d a1, a1, 32
- addi.d a2, a2, -32
-
- srl.d t4, t0, a5
- sll.d t0, t0, a6
-
- srl.d t5, t1, a5
- sll.d t1, t1, a6
-
- srl.d t6, t2, a5
- sll.d t2, t2, a6
-
- or t0, a7, t0
-
- srl.d a7, t3, a5
- sll.d t3, t3, a6
-
- or t1, t4, t1
- or t2, t5, t2
- or t3, t6, t3
-
- st.d t0, a4, 0
- st.d t1, a4, 8
- st.d t2, a4, 16
- st.d t3, a4, 24
-
- addi.d a4, a4, 32
-
-L(un_less_32bytes):
- srai.d a3, a2, 4
- beqz a3, L(un_less_16bytes)
-
- ld.d t0, a1, 0
- ld.d t1, a1, 8
-
- addi.d a1, a1, 16
- addi.d a2, a2, -16
-
- srl.d t2, t0, a5
- sll.d t3, t0, a6
-
- sll.d t4, t1, a6
- or t3, a7, t3
- or t4, t2, t4
- srl.d a7, t1, a5
-
- st.d t3, a4, 0
- st.d t4, a4, 8
-
- addi.d a4, a4, 16
-
-L(un_less_16bytes):
- srai.d a3, a2, 3
- beqz a3, L(un_less_8bytes)
-
- ld.d t0, a1, 0
-
- addi.d a1, a1, 8
- addi.d a2, a2, -8
-
- sll.d t1, t0, a6
- or t2, a7, t1
- srl.d a7, t0, a5
-
- st.d t2, a4, 0
- addi.d a4, a4, 8
-
-L(un_less_8bytes):
- beqz a2, L(un_less_1byte)
- bge t8, a2, 1f # no more data in memory, un_less_8bytes data is stored in a7
-
- # combine data in memory and a7(remaining part)
- ld.d t0, a1, 0
- sll.d t0, t0, a6
- or a7, a7, t0
-
-1:
- srai.d a3, a2, 2
- beqz a3, L(un_less_4bytes)
-
- addi.d a2, a2, -4
- st.w a7, a4, 0
- addi.d a4, a4, 4
- srai.d a7, a7, 32
-
-L(un_less_4bytes):
- srai.d a3, a2, 1
- beqz a3, L(un_less_2bytes)
-
- addi.d a2, a2, -2
- st.h a7, a4, 0
- addi.d a4, a4, 2
- srai.d a7, a7, 16
-
-L(un_less_2bytes):
- beqz a2, L(un_less_1byte)
- st.b a7, a4, 0
-
-L(un_less_1byte):
- jr ra
-
-# Bytes copying for data less than 16 bytes
-L(short_data):
- pcaddi t1, 36
- slli.d t2, a2, 3
- add.d a4, a0, a2
- sub.d t1, t1, t2
- add.d a1, a1, a2
- jr t1
-
-L(short_15_bytes):
- ld.b t0, a1, -15
- st.b t0, a4, -15
-L(short_14_bytes):
- ld.b t0, a1, -14
- st.b t0, a4, -14
-L(short_13_bytes):
- ld.b t0, a1, -13
- st.b t0, a4, -13
-L(short_12_bytes):
- ld.b t0, a1, -12
- st.b t0, a4, -12
-L(short_11_bytes):
- ld.b t0, a1, -11
- st.b t0, a4, -11
-L(short_10_bytes):
- ld.b t0, a1, -10
- st.b t0, a4, -10
-L(short_9_bytes):
- ld.b t0, a1, -9
- st.b t0, a4, -9
-L(short_8_bytes):
- ld.b t0, a1, -8
- st.b t0, a4, -8
-L(short_7_bytes):
- ld.b t0, a1, -7
- st.b t0, a4, -7
-L(short_6_bytes):
- ld.b t0, a1, -6
- st.b t0, a4, -6
-L(short_5_bytes):
- ld.b t0, a1, -5
- st.b t0, a4, -5
-L(short_4_bytes):
- ld.b t0, a1, -4
- st.b t0, a4, -4
-L(short_3_bytes):
- ld.b t0, a1, -3
- st.b t0, a4, -3
-L(short_2_bytes):
- ld.b t0, a1, -2
- st.b t0, a4, -2
-L(short_1_bytes):
- ld.b t0, a1, -1
- st.b t0, a4, -1
- jr ra
-
-L(copy_back):
- srai.d a3, a2, 4
- beqz a3, L(back_short_data) # less than 16 bytes
-
- add.d a4, a0, a2 # store the tail of dest
- add.d a1, a1, a2 # store the tail of src
-
- andi a5, a4, 0x7
- andi a6, a1, 0x7
- beqz a5, L(back_check_align)
-
- # make dest aligned 8 bytes
- sub.d a2, a2, a5
- sub.d a1, a1, a5
- sub.d a4, a4, a5
-
- pcaddi t1, 18
- slli.d t3, a5, 3
- sub.d t1, t1, t3
- jr t1
-
- ld.b t0, a1, 6
- st.b t0, a4, 6
- ld.b t0, a1, 5
- st.b t0, a4, 5
- ld.b t0, a1, 4
- st.b t0, a4, 4
- ld.b t0, a1, 3
- st.b t0, a4, 3
- ld.b t0, a1, 2
- st.b t0, a4, 2
- ld.b t0, a1, 1
- st.b t0, a4, 1
- ld.b t0, a1, 0
- st.b t0, a4, 0
-
-L(back_check_align):
- bne a5, a6, L(back_unalign)
-
- srai.d a3, a2, 4
- beqz a3, L(back_less_16bytes)
-
- andi a3, a2, 0x3f
- beq a3, a2, L(back_less_64bytes)
-
- sub.d t0, a2, a3
- move a2, a3
- sub.d a5, a1, t0
-
-L(back_loop_64bytes):
- LD_64(a1, -64)
- addi.d a1, a1, -64
- ST_64(a4, -64)
-
- addi.d a4, a4, -64
- bne a1, a5, L(back_loop_64bytes)
-
-L(back_less_64bytes):
- srai.d a3, a2, 5
- beqz a3, L(back_less_32bytes)
-
- ld.d t0, a1, -32
- ld.d t1, a1, -24
- ld.d t2, a1, -16
- ld.d t3, a1, -8
-
- addi.d a1, a1, -32
- addi.d a2, a2, -32
-
- st.d t0, a4, -32
- st.d t1, a4, -24
- st.d t2, a4, -16
- st.d t3, a4, -8
-
- addi.d a4, a4, -32
-
-L(back_less_32bytes):
- srai.d a3, a2, 4
- beqz a3, L(back_less_16bytes)
-
- ld.d t0, a1, -16
- ld.d t1, a1, -8
-
- addi.d a2, a2, -16
- addi.d a1, a1, -16
-
- st.d t0, a4, -16
- st.d t1, a4, -8
- addi.d a4, a4, -16
-
-L(back_less_16bytes):
- srai.d a3, a2, 3
- beqz a3, L(back_less_8bytes)
-
- ld.d t0, a1, -8
- addi.d a2, a2, -8
- addi.d a1, a1, -8
-
- st.d t0, a4, -8
- addi.d a4, a4, -8
-
-L(back_less_8bytes):
- srai.d a3, a2, 2
- beqz a3, L(back_less_4bytes)
-
- ld.w t0, a1, -4
- addi.d a2, a2, -4
- addi.d a1, a1, -4
-
- st.w t0, a4, -4
- addi.d a4, a4, -4
-
-L(back_less_4bytes):
- srai.d a3, a2, 1
- beqz a3, L(back_less_2bytes)
-
- ld.h t0, a1, -2
- addi.d a2, a2, -2
- addi.d a1, a1, -2
-
- st.h t0, a4, -2
- addi.d a4, a4, -2
-
-L(back_less_2bytes):
- beqz a2, L(back_less_1byte)
-
- ld.b t0, a1, -1
- st.b t0, a4, -1
-
-L(back_less_1byte):
- jr ra
-
-L(back_unalign):
- andi t8, a1, 0x7
- bstrins.d a1, zero, 2, 0 # make src 8 bytes aligned
-
- sub.d a6, zero, t8
-
- ld.d t0, a1, 0
- slli.d a6, a6, 3
- slli.d a5, t8, 3
- sll.d a7, t0, a6
-
- srai.d a3, a2, 4
- beqz a3, L(back_un_less_16bytes)
-
- andi a3, a2, 0x3f
- beq a3, a2, L(back_un_less_64bytes)
-
- sub.d t0, a2, a3
- move a2, a3
- sub.d a3, a1, t0
-
-L(back_un_long_bytes):
- ld.d t0, a1, -8
- ld.d t1, a1, -16
- ld.d t2, a1, -24
- ld.d t3, a1, -32
-
- sll.d t4, t0, a6
- srl.d t0, t0, a5
-
- sll.d t5, t1, a6
- srl.d t1, t1, a5
-
- sll.d t6, t2, a6
- srl.d t2, t2, a5
-
- sll.d t7, t3, a6
- srl.d t3, t3, a5
-
- or t0, t0, a7
- or t1, t1, t4
- or t2, t2, t5
- or t3, t3, t6
-
- ld.d t4, a1, -40
- ld.d t5, a1, -48
- ld.d t6, a1, -56
- ld.d a7, a1, -64
- st.d t0, a4, -8
- st.d t1, a4, -16
- st.d t2, a4, -24
- st.d t3, a4, -32
-
- addi.d a1, a1, -64
-
- sll.d t0, t4, a6
- srl.d t4, t4, a5
-
- sll.d t1, t5, a6
- srl.d t5, t5, a5
-
- sll.d t2, t6, a6
- srl.d t6, t6, a5
-
- srl.d t3, a7, a5
- sll.d a7, a7, a6
-
- or t4, t7, t4
- or t5, t0, t5
- or t6, t1, t6
- or t3, t2, t3
-
- st.d t4, a4, -40
- st.d t5, a4, -48
- st.d t6, a4, -56
- st.d t3, a4, -64
-
- addi.d a4, a4, -64
- bne a3, a1, L(back_un_long_bytes)
-
-L(back_un_less_64bytes):
- srai.d a3, a2, 5
- beqz a3, L(back_un_less_32bytes)
-
- ld.d t0, a1, -8
- ld.d t1, a1, -16
- ld.d t2, a1, -24
- ld.d t3, a1, -32
-
- addi.d a1, a1, -32
- addi.d a2, a2, -32
-
- sll.d t4, t0, a6
- srl.d t0, t0, a5
-
- sll.d t5, t1, a6
- srl.d t1, t1, a5
-
- sll.d t6, t2, a6
- srl.d t2, t2, a5
-
- or t0, a7, t0
-
- sll.d a7, t3, a6
- srl.d t3, t3, a5
-
- or t1, t4, t1
- or t2, t5, t2
- or t3, t6, t3
-
- st.d t0, a4, -8
- st.d t1, a4, -16
- st.d t2, a4, -24
- st.d t3, a4, -32
-
- addi.d a4, a4, -32
-
-L(back_un_less_32bytes):
- srai.d a3, a2, 4
- beqz a3, L(back_un_less_16bytes)
-
- ld.d t0, a1, -8
- ld.d t1, a1, -16
-
- addi.d a1, a1, -16
- addi.d a2, a2, -16
-
- sll.d t2, t0, a6
- srl.d t3, t0, a5
-
- srl.d t4, t1, a5
- or t3, a7, t3
- or t4, t2, t4
- sll.d a7, t1, a6
-
- st.d t3, a4, -8
- st.d t4, a4, -16
-
- addi.d a4, a4, -16
-
-L(back_un_less_16bytes):
- srai.d a3, a2, 3
- beqz a3, L(back_un_less_8bytes)
-
- ld.d t0, a1, -8
-
- addi.d a1, a1, -8
- addi.d a2, a2, -8
-
- srl.d t1, t0, a5
- or t2, a7, t1
- sll.d a7, t0, a6
-
- st.d t2, a4, -8
- addi.d a4, a4, -8
-
-L(back_un_less_8bytes):
- beqz a2, L(back_end)
- bge t8, a2, 1f # no more data in memory, un_less_8bytes data is stored in a7
-
- # combine data in memory and a7(remaining part)
- ld.d t0, a1, -8
- srl.d t0, t0, a5
- or a7, a7, t0
-
-1:
- srai.d a3, a2, 2
- beqz a3, L(back_un_less_4bytes)
-
- srai.d t0, a7, 32
- addi.d a2, a2, -4
- st.w t0, a4, -4
- addi.d a4, a4, -4
- slli.d a7, a7, 32
-
-L(back_un_less_4bytes):
- srai.d a3, a2, 1
- beqz a3, L(back_un_less_2bytes)
- srai.d t0, a7, 48
- addi.d a2, a2, -2
- st.h t0, a4, -2
- addi.d a4, a4, -2
- slli.d a7, a7, 16
-L(back_un_less_2bytes):
- beqz a2, L(back_un_less_1byte)
- srai.d t0, a7, 56
- st.b t0, a4, -1
-L(back_un_less_1byte):
- jr ra
-
-L(back_short_data):
- pcaddi t1, 34
- slli.d t2, a2, 3
- sub.d t1, t1, t2
- jr t1
-
- ld.b t0, a1, 14
- st.b t0, a0, 14
- ld.b t0, a1, 13
- st.b t0, a0, 13
- ld.b t0, a1, 12
- st.b t0, a0, 12
- ld.b t0, a1, 11
- st.b t0, a0, 11
- ld.b t0, a1, 10
- st.b t0, a0, 10
- ld.b t0, a1, 9
- st.b t0, a0, 9
- ld.b t0, a1, 8
- st.b t0, a0, 8
- ld.b t0, a1, 7
- st.b t0, a0, 7
- ld.b t0, a1, 6
- st.b t0, a0, 6
- ld.b t0, a1, 5
- st.b t0, a0, 5
- ld.b t0, a1, 4
- st.b t0, a0, 4
- ld.b t0, a1, 3
- st.b t0, a0, 3
- ld.b t0, a1, 2
- st.b t0, a0, 2
- ld.b t0, a1, 1
- st.b t0, a0, 1
- ld.b t0, a1, 0
- st.b t0, a0, 0
-L(back_end):
- jr ra
-
-END(MEMCPY_NAME)
-
-#ifdef _LIBC
-libc_hidden_builtin_def (MEMCPY_NAME)
-#endif
diff --git a/sysdeps/loongarch/lp64/memmove.S b/sysdeps/loongarch/lp64/memmove.S
deleted file mode 100644
index 6d1922c4..00000000
--- a/sysdeps/loongarch/lp64/memmove.S
+++ /dev/null
@@ -1,2 +0,0 @@
-/* DONT DELETE THIS FILE, OTHERWIES MEMCPY.C WILL BE COMPILED. */
-/* There are too many common code in memcpy and memmove. See memcpy.S */
diff --git a/sysdeps/loongarch/lp64/memset.S b/sysdeps/loongarch/lp64/memset.S
deleted file mode 100644
index eabd7d23..00000000
--- a/sysdeps/loongarch/lp64/memset.S
+++ /dev/null
@@ -1,166 +0,0 @@
-#ifdef _LIBC
-#include <sysdep.h>
-#include <sys/regdef.h>
-#include <sys/asm.h>
-#else
-#include <sys/asm.h>
-#include <sys/regdef.h>
-#endif
-
-#ifndef MEMSET_NAME
-#define MEMSET_NAME memset
-#endif
-
-#define ST_64(n) \
- st.d a1, a0, n; \
- st.d a1, a0, n+8; \
- st.d a1, a0, n+16; \
- st.d a1, a0, n+24; \
- st.d a1, a0, n+32; \
- st.d a1, a0, n+40; \
- st.d a1, a0, n+48; \
- st.d a1, a0, n+56;
-
-LEAF(MEMSET_NAME, 6)
- move t0, a0
- andi a3, a0, 0x7
- li.w t6, 16
- beqz a3, L(align)
- blt a2, t6, L(short_data)
-
-L(make_align):
- li.w t8, 8
- sub.d t2, t8, a3
- pcaddi t1, 11
- slli.d t3, t2, 2
- sub.d t1, t1, t3
- jirl zero, t1, 0
-
-L(al7):
- st.b a1, t0, 6
-L(al6):
- st.b a1, t0, 5
-L(al5):
- st.b a1, t0, 4
-L(al4):
- st.b a1, t0, 3
-L(al3):
- st.b a1, t0, 2
-L(al2):
- st.b a1, t0, 1
-L(al1):
- st.b a1, t0, 0
-L(al0):
- add.d t0, t0, t2
- sub.d a2, a2, t2
-
-L(align):
- bstrins.d a1, a1, 15, 8
- bstrins.d a1, a1, 31, 16
- bstrins.d a1, a1, 63, 32
-
- blt a2, t6, L(less_16bytes)
-
- andi a4, a2, 0x3f
- beq a4, a2, L(less_64bytes)
-
- sub.d t1, a2, a4
- move a2, a4
- add.d a5, t0, t1
-
-L(loop_64bytes):
- addi.d t0, t0, 64
- st.d a1, t0, -64
- st.d a1, t0, -56
- st.d a1, t0, -48
- st.d a1, t0, -40
- st.d a1, t0, -32
- st.d a1, t0, -24
- st.d a1, t0, -16
- st.d a1, t0, -8
- bne t0, a5, L(loop_64bytes)
-
-L(less_64bytes):
- srai.d a4, a2, 5
- beqz a4, L(less_32bytes)
- addi.d a2, a2, -32
- st.d a1, t0, 0
- st.d a1, t0, 8
- st.d a1, t0, 16
- st.d a1, t0, 24
- addi.d t0, t0, 32
-L(less_32bytes):
- blt a2, t6, L(less_16bytes)
- addi.d a2, a2, -16
- st.d a1, t0, 0
- st.d a1, t0, 8
- addi.d t0, t0, 16
-L(less_16bytes):
- srai.d a4, a2, 3
- beqz a4, L(less_8bytes)
- addi.d a2, a2, -8
- st.d a1, t0, 0
- addi.d t0, t0, 8
-L(less_8bytes):
- beqz a2, L(less_1byte)
- srai.d a4, a2, 2
- beqz a4, L(less_4bytes)
- addi.d a2, a2, -4
- st.w a1, t0, 0
- addi.d t0, t0, 4
-L(less_4bytes):
- srai.d a3, a2, 1
- beqz a3, L(less_2bytes)
- addi.d a2, a2, -2
- st.h a1, t0, 0
- addi.d t0, t0, 2
-L(less_2bytes):
- beqz a2, L(less_1byte)
- st.b a1, t0, 0
-L(less_1byte):
- jr ra
-
-L(short_data):
- pcaddi t1, 19
- slli.d t3, a2, 2
- sub.d t1, t1, t3
- jirl zero, t1, 0
-L(short_15):
- st.b a1, a0, 14
-
-L(short_14):
- st.b a1, a0, 13
-L(short_13):
- st.b a1, a0, 12
-L(short_12):
- st.b a1, a0, 11
-L(short_11):
- st.b a1, a0, 10
-L(short_10):
- st.b a1, a0, 9
-L(short_9):
- st.b a1, a0, 8
-L(short_8):
- st.b a1, a0, 7
-L(short_7):
- st.b a1, a0, 6
-L(short_6):
- st.b a1, a0, 5
-L(short_5):
- st.b a1, a0, 4
-L(short_4):
- st.b a1, a0, 3
-L(short_3):
- st.b a1, a0, 2
-L(short_2):
- st.b a1, a0, 1
-L(short_1):
- st.b a1, a0, 0
-L(short_0):
- jr ra
-
-END(MEMSET_NAME)
-
-#ifdef _LIBC
-libc_hidden_builtin_def (MEMSET_NAME)
-#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/memchr-aligned.S b/sysdeps/loongarch/lp64/multiarch/memchr-aligned.S
index 4677c912..7dfa3ade 100644
--- a/sysdeps/loongarch/lp64/multiarch/memchr-aligned.S
+++ b/sysdeps/loongarch/lp64/multiarch/memchr-aligned.S
@@ -1,7 +1,96 @@
+#ifdef _LIBC
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+#else
+#include <sys/asm.h>
+#include <sys/regdef.h>
+#endif
+
#if IS_IN (libc)
#define MEMCHR_NAME __memchr_aligned
+#else
+#define MEMCHR_NAME memchr
#endif
-#include "../memchr.S"
+LEAF(MEMCHR_NAME, 6)
+ beqz a2, L(out)
+ andi t1, a0, 0x7
+ lu12i.w a3, 0x01010
+ sub.d a5, a0, t1
+
+ bstrins.d a1, a1, 15, 8
+ ld.d t0, a5, 0
+ slli.d t2, t1, 3
+ ori a3, a3, 0x101
+
+ bstrins.d a1, a1, 31, 16
+ li.w t7, -1
+ li.w t8, 9
+ bstrins.d a3, a3, 63, 32
+
+ srl.d t3, t7, t2
+ bstrins.d a1, a1, 63, 32
+ sub.d t4, t8, t1
+ orn t3, a1, t3
+
+ srl.d t0, t0, t2
+ slli.d a4, a3, 7 # 0x8080808080808080
+ sltu t4, a2, t4
+ xor t2, t0, t3
+
+ sub.d a6, t2, a3
+ andn a7, a4, t2
+ and t2, a6, a7
+ or t3, t2, t4
+
+ bnez t3, L(count_pos)
+ addi.d a2, a2, -8
+ addi.d a0, a5, 8
+ add.d a2, a2, t1
+
+L(loop):
+ ld.d t0, a0, 0
+ sltui t4, a2, 9
+ xor t2, t0, a1
+ sub.d a6, t2, a3
+
+ andn a7, a4, t2
+ and t2, a6, a7
+ or t3, t2, t4
+ bnez t3, L(count_pos)
+
+ ld.d t1, a0, 8
+ addi.d a0, a0, 16
+ sltui t4, a2, 17
+ xor t2, t1, a1
+
+ sub.d a6, t2, a3
+ andn a7, a4, t2
+ and t2, a6, a7
+ addi.d a2, a2, -16
+
+ or t3, t2, t4
+ beqz t3, L(loop)
+ addi.d a0, a0, -8
+ addi.d a2, a2, 8
+
+L(count_pos):
+ ctz.d t0, t2
+ srli.d t0, t0, 3
+ sltu t1, t0, a2
+ add.d a0, a0, t0
+
+ maskeqz a0, a0, t1
+ jr ra
+
+L(out):
+ move a0, zero
+ jr ra
+END(MEMCHR_NAME)
+
+#ifdef _LIBC
+libc_hidden_builtin_def (MEMCHR_NAME)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S b/sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S
index 512eabca..9505dfce 100644
--- a/sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S
+++ b/sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S
@@ -1,11 +1,289 @@
-#if IS_IN (libc)
+
+#ifdef _LIBC
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+#else
+#include <sys/asm.h>
+#include <sys/regdef.h>
+#endif
+
+#if IS_IN (libc)
#define MEMCMP_NAME __memcmp_aligned
+#else
+#define MEMCMP_NAME memcmp
+#endif
+
+LEAF(MEMCMP_NAME, 6)
+ beqz a2, L(ret)
+ andi a4, a1, 0x7
+ andi a3, a0, 0x7
+ sltu a5, a4, a3
+
+ xor t0, a0, a1
+ li.w t8, 8
+ maskeqz t0, t0, a5
+ li.w t7, -1
+
+ xor a0, a0, t0 // a0 hold smaller one
+ xor a1, a1, t0 // a1 hold larger one
+ andi a3, a0, 0x7 // a3 hold small offset
+ andi a4, a1, 0x7 // a4 hold larger offset
+
+ xor a0, a0, a3
+ xor a1, a1, a4
+ ld.d t2, a0, 0 // t2 = "fedcbaXX"
+ ld.d t1, a1, 0 // t1 = "54321YYY"
+
+ slli.d t3, a3, 3
+ slli.d t4, a4, 3
+ sub.d a6, t3, t4 // a6 = 0xfffffffffffffff8
+ srl.d t1, t1, t4 // t1 = "00054321"
+
+ srl.d t0, t2, t3 // t0 = "00fedcba"
+ srl.d t5, t7, t4 // t5 = 0x000000FFFFFFFFFF
+ sub.d t6, t0, t1 // t6 hold diff
+ and t6, t6, t5 // t6 = "000xxxxx"
+
+ sub.d t5, t8, a4 // t5 hold margin 8 - 3 = 5
+ bnez t6, L(first_out)
+ bgeu t5, a2, L(ret)
+ sub.d a2, a2, t5
+
+ bnez a6, L(unaligned)
+ blt a2, t8, L(al_less_8bytes)
+ andi t1, a2, 31
+ beq t1, a2, L(al_less_32bytes)
+
+ sub.d t2, a2, t1
+ add.d a4, a0, t2
+ move a2, t1
+
+L(al_loop):
+ ld.d t0, a0, 8
+
+ ld.d t1, a1, 8
+ ld.d t2, a0, 16
+ ld.d t3, a1, 16
+ ld.d t4, a0, 24
+
+ ld.d t5, a1, 24
+ ld.d t6, a0, 32
+ ld.d t7, a1, 32
+ addi.d a0, a0, 32
+
+ addi.d a1, a1, 32
+ bne t0, t1, L(out1)
+ bne t2, t3, L(out2)
+ bne t4, t5, L(out3)
+
+ bne t6, t7, L(out4)
+ bne a0, a4, L(al_loop)
+
+L(al_less_32bytes):
+ srai.d a4, a2, 4
+ beqz a4, L(al_less_16bytes)
+
+ ld.d t0, a0, 8
+ ld.d t1, a1, 8
+ ld.d t2, a0, 16
+ ld.d t3, a1, 16
+
+ addi.d a0, a0, 16
+ addi.d a1, a1, 16
+ addi.d a2, a2, -16
+ bne t0, t1, L(out1)
+
+ bne t2, t3, L(out2)
+
+L(al_less_16bytes):
+ srai.d a4, a2, 3
+ beqz a4, L(al_less_8bytes)
+ ld.d t0, a0, 8
+
+ ld.d t1, a1, 8
+ addi.d a0, a0, 8
+ addi.d a1, a1, 8
+ addi.d a2, a2, -8
+
+ bne t0, t1, L(out1)
+
+L(al_less_8bytes):
+ beqz a2, L(ret)
+ ld.d t0, a0, 8
+ ld.d t1, a1, 8
+
+ li.d t7, -1
+ slli.d t2, a2, 3
+ sll.d t2, t7, t2
+ sub.d t3, t0, t1
+
+ andn t6, t3, t2
+ bnez t6, L(count_diff)
+
+L(ret):
+ move a0, zero
+ jr ra
+
+L(out4):
+ move t0, t6
+ move t1, t7
+ sub.d t6, t6, t7
+ b L(count_diff)
+
+L(out3):
+ move t0, t4
+ move t1, t5
+ sub.d t6, t4, t5
+ b L(count_diff)
+
+L(out2):
+ move t0, t2
+ move t1, t3
+L(out1):
+ sub.d t6, t0, t1
+ b L(count_diff)
+
+L(first_out):
+ slli.d t4, a2, 3
+ slt t3, a2, t5
+ sll.d t4, t7, t4
+ maskeqz t4, t4, t3
+
+ andn t6, t6, t4
+
+L(count_diff):
+ ctz.d t2, t6
+ bstrins.d t2, zero, 2, 0
+ srl.d t0, t0, t2
+
+ srl.d t1, t1, t2
+ andi t0, t0, 0xff
+ andi t1, t1, 0xff
+ sub.d t2, t0, t1
+
+ sub.d t3, t1, t0
+ masknez t2, t2, a5
+ maskeqz t3, t3, a5
+ or a0, t2, t3
+
+ jr ra
+
+L(unaligned):
+ sub.d a7, zero, a6
+ srl.d t0, t2, a6
+ blt a2, t8, L(un_less_8bytes)
+
+ andi t1, a2, 31
+ beq t1, a2, L(un_less_32bytes)
+ sub.d t2, a2, t1
+ add.d a4, a0, t2
+
+ move a2, t1
+
+L(un_loop):
+ ld.d t2, a0, 8
+ ld.d t1, a1, 8
+ ld.d t4, a0, 16
+
+ ld.d t3, a1, 16
+ ld.d t6, a0, 24
+ ld.d t5, a1, 24
+ ld.d t8, a0, 32
+
+ ld.d t7, a1, 32
+ addi.d a0, a0, 32
+ addi.d a1, a1, 32
+ sll.d a3, t2, a7
+
+ or t0, a3, t0
+ bne t0, t1, L(out1)
+ srl.d t0, t2, a6
+ sll.d a3, t4, a7
+
+ or t2, a3, t0
+ bne t2, t3, L(out2)
+ srl.d t0, t4, a6
+ sll.d a3, t6, a7
+
+ or t4, a3, t0
+ bne t4, t5, L(out3)
+ srl.d t0, t6, a6
+ sll.d a3, t8, a7
+
+ or t6, t0, a3
+ bne t6, t7, L(out4)
+ srl.d t0, t8, a6
+ bne a0, a4, L(un_loop)
+
+L(un_less_32bytes):
+ srai.d a4, a2, 4
+ beqz a4, L(un_less_16bytes)
+ ld.d t2, a0, 8
+ ld.d t1, a1, 8
+
+ ld.d t4, a0, 16
+ ld.d t3, a1, 16
+ addi.d a0, a0, 16
+ addi.d a1, a1, 16
+
+ addi.d a2, a2, -16
+ sll.d a3, t2, a7
+ or t0, a3, t0
+ bne t0, t1, L(out1)
+
+ srl.d t0, t2, a6
+ sll.d a3, t4, a7
+ or t2, a3, t0
+ bne t2, t3, L(out2)
+
+ srl.d t0, t4, a6
+
+L(un_less_16bytes):
+ srai.d a4, a2, 3
+ beqz a4, L(un_less_8bytes)
+ ld.d t2, a0, 8
+
+ ld.d t1, a1, 8
+ addi.d a0, a0, 8
+ addi.d a1, a1, 8
+ addi.d a2, a2, -8
+
+ sll.d a3, t2, a7
+ or t0, a3, t0
+ bne t0, t1, L(out1)
+ srl.d t0, t2, a6
+
+L(un_less_8bytes):
+ beqz a2, L(ret)
+ andi a7, a7, 63
+ slli.d a4, a2, 3
+ bgeu a7, a4, L(last_cmp)
+
+ ld.d t2, a0, 8
+ sll.d a3, t2, a7
+ or t0, a3, t0
+
+L(last_cmp):
+ ld.d t1, a1, 8
+
+ li.d t7, -1
+ sll.d t2, t7, a4
+ sub.d t3, t0, t1
+ andn t6, t3, t2
+
+ bnez t6, L(count_diff)
+ move a0, zero
+ jr ra
+
+END(MEMCMP_NAME)
+#ifdef _LIBC
+libc_hidden_builtin_def (MEMCMP_NAME)
#endif
-#include "../memcmp.S"
# undef bcmp
weak_alias (MEMCMP_NAME, bcmp)
diff --git a/sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S b/sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S
index 5ff8b4e6..3fc86a7f 100644
--- a/sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S
+++ b/sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S
@@ -1,11 +1,804 @@
-
+#ifdef _LIBC
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+#else
+#include <regdef.h>
+#include <sys/asm.h>
+#endif
#if IS_IN (libc)
-
#define MEMCPY_NAME __memcpy_aligned
#define MEMMOVE_NAME __memmove_aligned
+#else
+#define MEMCPY_NAME memcpy
+#define MEMMOVE_NAME memmove
+#endif
+
+#define LD_64(reg, n) \
+ ld.d t0, reg, n; \
+ ld.d t1, reg, n+8; \
+ ld.d t2, reg, n+16; \
+ ld.d t3, reg, n+24; \
+ ld.d t4, reg, n+32; \
+ ld.d t5, reg, n+40; \
+ ld.d t6, reg, n+48; \
+ ld.d t7, reg, n+56;
+
+#define ST_64(reg, n) \
+ st.d t0, reg, n; \
+ st.d t1, reg, n+8; \
+ st.d t2, reg, n+16; \
+ st.d t3, reg, n+24; \
+ st.d t4, reg, n+32; \
+ st.d t5, reg, n+40; \
+ st.d t6, reg, n+48; \
+ st.d t7, reg, n+56;
+LEAF(MEMMOVE_NAME, 6)
+ sub.d t0, a0, a1
+ bltu t0, a2, L(copy_back)
+
+END(MEMMOVE_NAME)
+
+#ifdef _LIBC
+libc_hidden_builtin_def (MEMMOVE_NAME)
#endif
-#include "../memcpy.S"
+LEAF_NO_ALIGN(MEMCPY_NAME)
+
+ srai.d a3, a2, 4
+ beqz a3, L(short_data) # less than 16 bytes
+
+ move a4, a0
+ andi a5, a0, 0x7
+ andi a6, a1, 0x7
+ li.d t8, 8
+ beqz a5, L(check_align)
+
+ # make dest aligned 8 bytes
+ sub.d t2, t8, a5
+ sub.d a2, a2, t2
+
+ pcaddi t1, 20
+ slli.d t3, t2, 3
+ add.d a1, a1, t2
+ sub.d t1, t1, t3
+ add.d a4, a4, t2
+ jr t1
+
+L(al7):
+ ld.b t0, a1, -7
+ st.b t0, a4, -7
+L(al6):
+ ld.b t0, a1, -6
+ st.b t0, a4, -6
+L(al5):
+ ld.b t0, a1, -5
+ st.b t0, a4, -5
+L(al4):
+ ld.b t0, a1, -4
+ st.b t0, a4, -4
+L(al3):
+ ld.b t0, a1, -3
+ st.b t0, a4, -3
+L(al2):
+ ld.b t0, a1, -2
+ st.b t0, a4, -2
+L(al1):
+ ld.b t0, a1, -1
+ st.b t0, a4, -1
+
+L(check_align):
+ bne a5, a6, L(unalign)
+
+ srai.d a3, a2, 4
+ beqz a3, L(al_less_16bytes)
+
+ andi a3, a2, 0x3f
+ beq a3, a2, L(al_less_64bytes)
+
+ sub.d t0, a2, a3
+ move a2, a3
+ add.d a5, a1, t0
+
+L(loop_64bytes):
+ LD_64(a1, 0)
+ addi.d a1, a1, 64
+ ST_64(a4, 0)
+
+ addi.d a4, a4, 64
+ bne a1, a5, L(loop_64bytes)
+
+L(al_less_64bytes):
+ srai.d a3, a2, 5
+ beqz a3, L(al_less_32bytes)
+
+ ld.d t0, a1, 0
+ ld.d t1, a1, 8
+ ld.d t2, a1, 16
+ ld.d t3, a1, 24
+
+ addi.d a1, a1, 32
+ addi.d a2, a2, -32
+
+ st.d t0, a4, 0
+ st.d t1, a4, 8
+ st.d t2, a4, 16
+ st.d t3, a4, 24
+
+ addi.d a4, a4, 32
+
+L(al_less_32bytes):
+ srai.d a3, a2, 4
+ beqz a3, L(al_less_16bytes)
+
+ ld.d t0, a1, 0
+ ld.d t1, a1, 8
+ addi.d a1, a1, 16
+ addi.d a2, a2, -16
+
+ st.d t0, a4, 0
+ st.d t1, a4, 8
+ addi.d a4, a4, 16
+
+L(al_less_16bytes):
+ srai.d a3, a2, 3
+ beqz a3, L(al_less_8bytes)
+
+ ld.d t0, a1, 0
+ addi.d a1, a1, 8
+ addi.d a2, a2, -8
+
+ st.d t0, a4, 0
+ addi.d a4, a4, 8
+
+L(al_less_8bytes):
+ srai.d a3, a2, 2
+ beqz a3, L(al_less_4bytes)
+
+ ld.w t0, a1, 0
+ addi.d a1, a1, 4
+ addi.d a2, a2, -4
+
+ st.w t0, a4, 0
+ addi.d a4, a4, 4
+
+L(al_less_4bytes):
+ srai.d a3, a2, 1
+ beqz a3, L(al_less_2bytes)
+
+ ld.h t0, a1, 0
+ addi.d a1, a1, 2
+ addi.d a2, a2, -2
+
+ st.h t0, a4, 0
+ addi.d a4, a4, 2
+
+L(al_less_2bytes):
+ beqz a2, L(al_less_1byte)
+
+ ld.b t0, a1, 0
+ st.b t0, a4, 0
+
+L(al_less_1byte):
+ jr ra
+
+L(unalign):
+ andi a5, a1, 0x7
+ bstrins.d a1, zero, 2, 0 # make src 8 bytes aligned
+
+ sub.d t8, t8, a5 # use t8 to save count of bytes for aligning
+ slli.d a5, a5, 3
+
+ ld.d t0, a1, 0
+ addi.d a1, a1, 8
+
+ slli.d a6, t8, 3
+ srl.d a7, t0, a5
+
+ srai.d a3, a2, 4
+ beqz a3, L(un_less_16bytes)
+
+ andi a3, a2, 0x3f
+ beq a3, a2, L(un_less_64bytes)
+
+ sub.d t0, a2, a3
+ move a2, a3
+ add.d a3, a1, t0
+
+# a5 shift right num
+# a6 shift left num
+# a7 remaining part
+L(un_long_bytes):
+ ld.d t0, a1, 0
+ ld.d t1, a1, 8
+ ld.d t2, a1, 16
+ ld.d t3, a1, 24
+
+ srl.d t4, t0, a5
+ sll.d t0, t0, a6
+
+ srl.d t5, t1, a5
+ sll.d t1, t1, a6
+
+ srl.d t6, t2, a5
+ sll.d t2, t2, a6
+
+ srl.d t7, t3, a5
+ sll.d t3, t3, a6
+
+ or t0, a7, t0
+ or t1, t4, t1
+ or t2, t5, t2
+ or t3, t6, t3
+
+ ld.d t4, a1, 32
+ ld.d t5, a1, 40
+ ld.d t6, a1, 48
+ ld.d a7, a1, 56
+
+ st.d t0, a4, 0
+ st.d t1, a4, 8
+ st.d t2, a4, 16
+ st.d t3, a4, 24
+
+ addi.d a1, a1, 64
+
+ srl.d t0, t4, a5
+ sll.d t4, t4, a6
+
+ srl.d t1, t5, a5
+ sll.d t5, t5, a6
+
+ srl.d t2, t6, a5
+ sll.d t6, t6, a6
+
+ sll.d t3, a7, a6
+ srl.d a7, a7, a5
+
+ or t4, t7, t4
+ or t5, t0, t5
+ or t6, t1, t6
+ or t3, t2, t3
+
+ st.d t4, a4, 32
+ st.d t5, a4, 40
+ st.d t6, a4, 48
+ st.d t3, a4, 56
+
+ addi.d a4, a4, 64
+ bne a3, a1, L(un_long_bytes)
+
+L(un_less_64bytes):
+ srai.d a3, a2, 5
+ beqz a3, L(un_less_32bytes)
+
+ ld.d t0, a1, 0
+ ld.d t1, a1, 8
+ ld.d t2, a1, 16
+ ld.d t3, a1, 24
+
+ addi.d a1, a1, 32
+ addi.d a2, a2, -32
+
+ srl.d t4, t0, a5
+ sll.d t0, t0, a6
+
+ srl.d t5, t1, a5
+ sll.d t1, t1, a6
+
+ srl.d t6, t2, a5
+ sll.d t2, t2, a6
+
+ or t0, a7, t0
+
+ srl.d a7, t3, a5
+ sll.d t3, t3, a6
+
+ or t1, t4, t1
+ or t2, t5, t2
+ or t3, t6, t3
+
+ st.d t0, a4, 0
+ st.d t1, a4, 8
+ st.d t2, a4, 16
+ st.d t3, a4, 24
+
+ addi.d a4, a4, 32
+
+L(un_less_32bytes):
+ srai.d a3, a2, 4
+ beqz a3, L(un_less_16bytes)
+
+ ld.d t0, a1, 0
+ ld.d t1, a1, 8
+
+ addi.d a1, a1, 16
+ addi.d a2, a2, -16
+
+ srl.d t2, t0, a5
+ sll.d t3, t0, a6
+
+ sll.d t4, t1, a6
+ or t3, a7, t3
+ or t4, t2, t4
+ srl.d a7, t1, a5
+
+ st.d t3, a4, 0
+ st.d t4, a4, 8
+
+ addi.d a4, a4, 16
+
+L(un_less_16bytes):
+ srai.d a3, a2, 3
+ beqz a3, L(un_less_8bytes)
+
+ ld.d t0, a1, 0
+
+ addi.d a1, a1, 8
+ addi.d a2, a2, -8
+
+ sll.d t1, t0, a6
+ or t2, a7, t1
+ srl.d a7, t0, a5
+
+ st.d t2, a4, 0
+ addi.d a4, a4, 8
+
+L(un_less_8bytes):
+ beqz a2, L(un_less_1byte)
+ bge t8, a2, 1f # no more data in memory, un_less_8bytes data is stored in a7
+
+ # combine data in memory and a7(remaining part)
+ ld.d t0, a1, 0
+ sll.d t0, t0, a6
+ or a7, a7, t0
+
+1:
+ srai.d a3, a2, 2
+ beqz a3, L(un_less_4bytes)
+
+ addi.d a2, a2, -4
+ st.w a7, a4, 0
+ addi.d a4, a4, 4
+ srai.d a7, a7, 32
+
+L(un_less_4bytes):
+ srai.d a3, a2, 1
+ beqz a3, L(un_less_2bytes)
+
+ addi.d a2, a2, -2
+ st.h a7, a4, 0
+ addi.d a4, a4, 2
+ srai.d a7, a7, 16
+L(un_less_2bytes):
+ beqz a2, L(un_less_1byte)
+ st.b a7, a4, 0
+
+L(un_less_1byte):
+ jr ra
+
+# Bytes copying for data less than 16 bytes
+L(short_data):
+ pcaddi t1, 36
+ slli.d t2, a2, 3
+ add.d a4, a0, a2
+ sub.d t1, t1, t2
+ add.d a1, a1, a2
+ jr t1
+
+L(short_15_bytes):
+ ld.b t0, a1, -15
+ st.b t0, a4, -15
+L(short_14_bytes):
+ ld.b t0, a1, -14
+ st.b t0, a4, -14
+L(short_13_bytes):
+ ld.b t0, a1, -13
+ st.b t0, a4, -13
+L(short_12_bytes):
+ ld.b t0, a1, -12
+ st.b t0, a4, -12
+L(short_11_bytes):
+ ld.b t0, a1, -11
+ st.b t0, a4, -11
+L(short_10_bytes):
+ ld.b t0, a1, -10
+ st.b t0, a4, -10
+L(short_9_bytes):
+ ld.b t0, a1, -9
+ st.b t0, a4, -9
+L(short_8_bytes):
+ ld.b t0, a1, -8
+ st.b t0, a4, -8
+L(short_7_bytes):
+ ld.b t0, a1, -7
+ st.b t0, a4, -7
+L(short_6_bytes):
+ ld.b t0, a1, -6
+ st.b t0, a4, -6
+L(short_5_bytes):
+ ld.b t0, a1, -5
+ st.b t0, a4, -5
+L(short_4_bytes):
+ ld.b t0, a1, -4
+ st.b t0, a4, -4
+L(short_3_bytes):
+ ld.b t0, a1, -3
+ st.b t0, a4, -3
+L(short_2_bytes):
+ ld.b t0, a1, -2
+ st.b t0, a4, -2
+L(short_1_bytes):
+ ld.b t0, a1, -1
+ st.b t0, a4, -1
+ jr ra
+
+L(copy_back):
+ srai.d a3, a2, 4
+ beqz a3, L(back_short_data) # less than 16 bytes
+
+ add.d a4, a0, a2 # store the tail of dest
+ add.d a1, a1, a2 # store the tail of src
+
+ andi a5, a4, 0x7
+ andi a6, a1, 0x7
+ beqz a5, L(back_check_align)
+
+ # make dest aligned 8 bytes
+ sub.d a2, a2, a5
+ sub.d a1, a1, a5
+ sub.d a4, a4, a5
+
+ pcaddi t1, 18
+ slli.d t3, a5, 3
+ sub.d t1, t1, t3
+ jr t1
+
+ ld.b t0, a1, 6
+ st.b t0, a4, 6
+ ld.b t0, a1, 5
+ st.b t0, a4, 5
+ ld.b t0, a1, 4
+ st.b t0, a4, 4
+ ld.b t0, a1, 3
+ st.b t0, a4, 3
+ ld.b t0, a1, 2
+ st.b t0, a4, 2
+ ld.b t0, a1, 1
+ st.b t0, a4, 1
+ ld.b t0, a1, 0
+ st.b t0, a4, 0
+
+L(back_check_align):
+ bne a5, a6, L(back_unalign)
+
+ srai.d a3, a2, 4
+ beqz a3, L(back_less_16bytes)
+
+ andi a3, a2, 0x3f
+ beq a3, a2, L(back_less_64bytes)
+
+ sub.d t0, a2, a3
+ move a2, a3
+ sub.d a5, a1, t0
+
+L(back_loop_64bytes):
+ LD_64(a1, -64)
+ addi.d a1, a1, -64
+ ST_64(a4, -64)
+
+ addi.d a4, a4, -64
+ bne a1, a5, L(back_loop_64bytes)
+
+L(back_less_64bytes):
+ srai.d a3, a2, 5
+ beqz a3, L(back_less_32bytes)
+
+ ld.d t0, a1, -32
+ ld.d t1, a1, -24
+ ld.d t2, a1, -16
+ ld.d t3, a1, -8
+
+ addi.d a1, a1, -32
+ addi.d a2, a2, -32
+
+ st.d t0, a4, -32
+ st.d t1, a4, -24
+ st.d t2, a4, -16
+ st.d t3, a4, -8
+
+ addi.d a4, a4, -32
+
+L(back_less_32bytes):
+ srai.d a3, a2, 4
+ beqz a3, L(back_less_16bytes)
+
+ ld.d t0, a1, -16
+ ld.d t1, a1, -8
+
+ addi.d a2, a2, -16
+ addi.d a1, a1, -16
+
+ st.d t0, a4, -16
+ st.d t1, a4, -8
+ addi.d a4, a4, -16
+
+L(back_less_16bytes):
+ srai.d a3, a2, 3
+ beqz a3, L(back_less_8bytes)
+
+ ld.d t0, a1, -8
+ addi.d a2, a2, -8
+ addi.d a1, a1, -8
+
+ st.d t0, a4, -8
+ addi.d a4, a4, -8
+
+L(back_less_8bytes):
+ srai.d a3, a2, 2
+ beqz a3, L(back_less_4bytes)
+
+ ld.w t0, a1, -4
+ addi.d a2, a2, -4
+ addi.d a1, a1, -4
+
+ st.w t0, a4, -4
+ addi.d a4, a4, -4
+
+L(back_less_4bytes):
+ srai.d a3, a2, 1
+ beqz a3, L(back_less_2bytes)
+
+ ld.h t0, a1, -2
+ addi.d a2, a2, -2
+ addi.d a1, a1, -2
+
+ st.h t0, a4, -2
+ addi.d a4, a4, -2
+
+L(back_less_2bytes):
+ beqz a2, L(back_less_1byte)
+
+ ld.b t0, a1, -1
+ st.b t0, a4, -1
+
+L(back_less_1byte):
+ jr ra
+
+L(back_unalign):
+ andi t8, a1, 0x7
+ bstrins.d a1, zero, 2, 0 # make src 8 bytes aligned
+
+ sub.d a6, zero, t8
+
+ ld.d t0, a1, 0
+ slli.d a6, a6, 3
+ slli.d a5, t8, 3
+ sll.d a7, t0, a6
+
+ srai.d a3, a2, 4
+ beqz a3, L(back_un_less_16bytes)
+
+ andi a3, a2, 0x3f
+ beq a3, a2, L(back_un_less_64bytes)
+
+ sub.d t0, a2, a3
+ move a2, a3
+ sub.d a3, a1, t0
+
+L(back_un_long_bytes):
+ ld.d t0, a1, -8
+ ld.d t1, a1, -16
+ ld.d t2, a1, -24
+ ld.d t3, a1, -32
+
+ sll.d t4, t0, a6
+ srl.d t0, t0, a5
+
+ sll.d t5, t1, a6
+ srl.d t1, t1, a5
+
+ sll.d t6, t2, a6
+ srl.d t2, t2, a5
+
+ sll.d t7, t3, a6
+ srl.d t3, t3, a5
+
+ or t0, t0, a7
+ or t1, t1, t4
+ or t2, t2, t5
+ or t3, t3, t6
+
+ ld.d t4, a1, -40
+ ld.d t5, a1, -48
+ ld.d t6, a1, -56
+ ld.d a7, a1, -64
+ st.d t0, a4, -8
+ st.d t1, a4, -16
+ st.d t2, a4, -24
+ st.d t3, a4, -32
+
+ addi.d a1, a1, -64
+
+ sll.d t0, t4, a6
+ srl.d t4, t4, a5
+
+ sll.d t1, t5, a6
+ srl.d t5, t5, a5
+
+ sll.d t2, t6, a6
+ srl.d t6, t6, a5
+
+ srl.d t3, a7, a5
+ sll.d a7, a7, a6
+
+ or t4, t7, t4
+ or t5, t0, t5
+ or t6, t1, t6
+ or t3, t2, t3
+
+ st.d t4, a4, -40
+ st.d t5, a4, -48
+ st.d t6, a4, -56
+ st.d t3, a4, -64
+
+ addi.d a4, a4, -64
+ bne a3, a1, L(back_un_long_bytes)
+
+L(back_un_less_64bytes):
+ srai.d a3, a2, 5
+ beqz a3, L(back_un_less_32bytes)
+
+ ld.d t0, a1, -8
+ ld.d t1, a1, -16
+ ld.d t2, a1, -24
+ ld.d t3, a1, -32
+
+ addi.d a1, a1, -32
+ addi.d a2, a2, -32
+
+ sll.d t4, t0, a6
+ srl.d t0, t0, a5
+
+ sll.d t5, t1, a6
+ srl.d t1, t1, a5
+
+ sll.d t6, t2, a6
+ srl.d t2, t2, a5
+
+ or t0, a7, t0
+
+ sll.d a7, t3, a6
+ srl.d t3, t3, a5
+
+ or t1, t4, t1
+ or t2, t5, t2
+ or t3, t6, t3
+
+ st.d t0, a4, -8
+ st.d t1, a4, -16
+ st.d t2, a4, -24
+ st.d t3, a4, -32
+
+ addi.d a4, a4, -32
+
+L(back_un_less_32bytes):
+ srai.d a3, a2, 4
+ beqz a3, L(back_un_less_16bytes)
+
+ ld.d t0, a1, -8
+ ld.d t1, a1, -16
+
+ addi.d a1, a1, -16
+ addi.d a2, a2, -16
+
+ sll.d t2, t0, a6
+ srl.d t3, t0, a5
+
+ srl.d t4, t1, a5
+ or t3, a7, t3
+ or t4, t2, t4
+ sll.d a7, t1, a6
+
+ st.d t3, a4, -8
+ st.d t4, a4, -16
+
+ addi.d a4, a4, -16
+
+L(back_un_less_16bytes):
+ srai.d a3, a2, 3
+ beqz a3, L(back_un_less_8bytes)
+
+ ld.d t0, a1, -8
+
+ addi.d a1, a1, -8
+ addi.d a2, a2, -8
+
+ srl.d t1, t0, a5
+ or t2, a7, t1
+ sll.d a7, t0, a6
+
+ st.d t2, a4, -8
+ addi.d a4, a4, -8
+
+L(back_un_less_8bytes):
+ beqz a2, L(back_end)
+ bge t8, a2, 1f # no more data in memory, un_less_8bytes data is stored in a7
+
+ # combine data in memory and a7(remaining part)
+ ld.d t0, a1, -8
+ srl.d t0, t0, a5
+ or a7, a7, t0
+
+1:
+ srai.d a3, a2, 2
+ beqz a3, L(back_un_less_4bytes)
+
+ srai.d t0, a7, 32
+ addi.d a2, a2, -4
+ st.w t0, a4, -4
+ addi.d a4, a4, -4
+ slli.d a7, a7, 32
+
+L(back_un_less_4bytes):
+ srai.d a3, a2, 1
+ beqz a3, L(back_un_less_2bytes)
+ srai.d t0, a7, 48
+ addi.d a2, a2, -2
+ st.h t0, a4, -2
+ addi.d a4, a4, -2
+ slli.d a7, a7, 16
+L(back_un_less_2bytes):
+ beqz a2, L(back_un_less_1byte)
+ srai.d t0, a7, 56
+ st.b t0, a4, -1
+L(back_un_less_1byte):
+ jr ra
+
+L(back_short_data):
+ pcaddi t1, 34
+ slli.d t2, a2, 3
+ sub.d t1, t1, t2
+ jr t1
+
+ ld.b t0, a1, 14
+ st.b t0, a0, 14
+ ld.b t0, a1, 13
+ st.b t0, a0, 13
+ ld.b t0, a1, 12
+ st.b t0, a0, 12
+ ld.b t0, a1, 11
+ st.b t0, a0, 11
+ ld.b t0, a1, 10
+ st.b t0, a0, 10
+ ld.b t0, a1, 9
+ st.b t0, a0, 9
+ ld.b t0, a1, 8
+ st.b t0, a0, 8
+ ld.b t0, a1, 7
+ st.b t0, a0, 7
+ ld.b t0, a1, 6
+ st.b t0, a0, 6
+ ld.b t0, a1, 5
+ st.b t0, a0, 5
+ ld.b t0, a1, 4
+ st.b t0, a0, 4
+ ld.b t0, a1, 3
+ st.b t0, a0, 3
+ ld.b t0, a1, 2
+ st.b t0, a0, 2
+ ld.b t0, a1, 1
+ st.b t0, a0, 1
+ ld.b t0, a1, 0
+ st.b t0, a0, 0
+L(back_end):
+ jr ra
+
+END(MEMCPY_NAME)
+
+#ifdef _LIBC
+libc_hidden_builtin_def (MEMCPY_NAME)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/memset-aligned.S b/sysdeps/loongarch/lp64/multiarch/memset-aligned.S
index da2f5ada..412ee849 100644
--- a/sysdeps/loongarch/lp64/multiarch/memset-aligned.S
+++ b/sysdeps/loongarch/lp64/multiarch/memset-aligned.S
@@ -1,9 +1,169 @@
+#ifdef _LIBC
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+#else
+#include <sys/asm.h>
+#include <sys/regdef.h>
+#endif
#if IS_IN (libc)
-
#define MEMSET_NAME __memset_aligned
-
+#else
+#define MEMSET_NAME memset
#endif
-#include "../memset.S"
+#define ST_64(n) \
+ st.d a1, a0, n; \
+ st.d a1, a0, n+8; \
+ st.d a1, a0, n+16; \
+ st.d a1, a0, n+24; \
+ st.d a1, a0, n+32; \
+ st.d a1, a0, n+40; \
+ st.d a1, a0, n+48; \
+ st.d a1, a0, n+56;
+
+LEAF(MEMSET_NAME, 6)
+ move t0, a0
+ andi a3, a0, 0x7
+ li.w t6, 16
+ beqz a3, L(align)
+ blt a2, t6, L(short_data)
+
+L(make_align):
+ li.w t8, 8
+ sub.d t2, t8, a3
+ pcaddi t1, 11
+ slli.d t3, t2, 2
+ sub.d t1, t1, t3
+ jirl zero, t1, 0
+
+L(al7):
+ st.b a1, t0, 6
+L(al6):
+ st.b a1, t0, 5
+L(al5):
+ st.b a1, t0, 4
+L(al4):
+ st.b a1, t0, 3
+L(al3):
+ st.b a1, t0, 2
+L(al2):
+ st.b a1, t0, 1
+L(al1):
+ st.b a1, t0, 0
+L(al0):
+ add.d t0, t0, t2
+ sub.d a2, a2, t2
+
+L(align):
+ bstrins.d a1, a1, 15, 8
+ bstrins.d a1, a1, 31, 16
+ bstrins.d a1, a1, 63, 32
+
+ blt a2, t6, L(less_16bytes)
+
+ andi a4, a2, 0x3f
+ beq a4, a2, L(less_64bytes)
+
+ sub.d t1, a2, a4
+ move a2, a4
+ add.d a5, t0, t1
+
+L(loop_64bytes):
+ addi.d t0, t0, 64
+ st.d a1, t0, -64
+ st.d a1, t0, -56
+ st.d a1, t0, -48
+ st.d a1, t0, -40
+ st.d a1, t0, -32
+ st.d a1, t0, -24
+ st.d a1, t0, -16
+ st.d a1, t0, -8
+ bne t0, a5, L(loop_64bytes)
+
+L(less_64bytes):
+ srai.d a4, a2, 5
+ beqz a4, L(less_32bytes)
+ addi.d a2, a2, -32
+ st.d a1, t0, 0
+ st.d a1, t0, 8
+ st.d a1, t0, 16
+ st.d a1, t0, 24
+ addi.d t0, t0, 32
+L(less_32bytes):
+ blt a2, t6, L(less_16bytes)
+ addi.d a2, a2, -16
+ st.d a1, t0, 0
+ st.d a1, t0, 8
+ addi.d t0, t0, 16
+L(less_16bytes):
+ srai.d a4, a2, 3
+ beqz a4, L(less_8bytes)
+ addi.d a2, a2, -8
+ st.d a1, t0, 0
+ addi.d t0, t0, 8
+L(less_8bytes):
+ beqz a2, L(less_1byte)
+ srai.d a4, a2, 2
+ beqz a4, L(less_4bytes)
+ addi.d a2, a2, -4
+ st.w a1, t0, 0
+ addi.d t0, t0, 4
+L(less_4bytes):
+ srai.d a3, a2, 1
+ beqz a3, L(less_2bytes)
+ addi.d a2, a2, -2
+ st.h a1, t0, 0
+ addi.d t0, t0, 2
+L(less_2bytes):
+ beqz a2, L(less_1byte)
+ st.b a1, t0, 0
+L(less_1byte):
+ jr ra
+
+L(short_data):
+ pcaddi t1, 19
+ slli.d t3, a2, 2
+ sub.d t1, t1, t3
+ jirl zero, t1, 0
+L(short_15):
+ st.b a1, a0, 14
+
+L(short_14):
+ st.b a1, a0, 13
+L(short_13):
+ st.b a1, a0, 12
+L(short_12):
+ st.b a1, a0, 11
+L(short_11):
+ st.b a1, a0, 10
+L(short_10):
+ st.b a1, a0, 9
+L(short_9):
+ st.b a1, a0, 8
+L(short_8):
+ st.b a1, a0, 7
+L(short_7):
+ st.b a1, a0, 6
+L(short_6):
+ st.b a1, a0, 5
+L(short_5):
+ st.b a1, a0, 4
+L(short_4):
+ st.b a1, a0, 3
+L(short_3):
+ st.b a1, a0, 2
+L(short_2):
+ st.b a1, a0, 1
+L(short_1):
+ st.b a1, a0, 0
+L(short_0):
+ jr ra
+
+END(MEMSET_NAME)
+
+#ifdef _LIBC
+libc_hidden_builtin_def (MEMSET_NAME)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/rawmemchr-aligned.S b/sysdeps/loongarch/lp64/multiarch/rawmemchr-aligned.S
index 0b46b4ca..a13e293f 100644
--- a/sysdeps/loongarch/lp64/multiarch/rawmemchr-aligned.S
+++ b/sysdeps/loongarch/lp64/multiarch/rawmemchr-aligned.S
@@ -1,7 +1,115 @@
+#ifdef _LIBC
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+#else
+#include <sys/asm.h>
+#include <sys/regdef.h>
+#endif
#if IS_IN (libc)
#define RAWMEMCHR_NAME __rawmemchr_aligned
+#else
+#define RAWMEMCHR_NAME __rawmemchr
#endif
-#include "../rawmemchr.S"
+LEAF(RAWMEMCHR_NAME, 6)
+ andi t1, a0, 0x7
+ bstrins.d a0, zero, 2, 0
+ lu12i.w a2, 0x01010
+ bstrins.d a1, a1, 15, 8
+
+ ld.d t0, a0, 0
+ slli.d t1, t1, 3
+ ori a2, a2, 0x101
+ bstrins.d a1, a1, 31, 16
+
+ li.w t8, -1
+ bstrins.d a1, a1, 63, 32
+ bstrins.d a2, a2, 63, 32
+ sll.d t2, t8, t1
+
+ sll.d t3, a1, t1
+ orn t0, t0, t2
+ slli.d a3, a2, 7
+ beqz a1, L(find_zero)
+
+ xor t0, t0, t3
+ sub.d t1, t0, a2
+ andn t2, a3, t0
+ and t3, t1, t2
+
+ bnez t3, L(count_pos)
+ addi.d a0, a0, 8
+
+L(loop):
+ ld.d t0, a0, 0
+ xor t0, t0, a1
+
+ sub.d t1, t0, a2
+ andn t2, a3, t0
+ and t3, t1, t2
+ bnez t3, L(count_pos)
+
+ ld.d t0, a0, 8
+ addi.d a0, a0, 16
+ xor t0, t0, a1
+ sub.d t1, t0, a2
+
+ andn t2, a3, t0
+ and t3, t1, t2
+ beqz t3, L(loop)
+ addi.d a0, a0, -8
+L(count_pos):
+ ctz.d t0, t3
+ srli.d t0, t0, 3
+ add.d a0, a0, t0
+ jr ra
+
+L(loop_7bit):
+ ld.d t0, a0, 0
+L(find_zero):
+ sub.d t1, t0, a2
+ and t2, t1, a3
+ bnez t2, L(more_check)
+
+ ld.d t0, a0, 8
+ addi.d a0, a0, 16
+ sub.d t1, t0, a2
+ and t2, t1, a3
+
+ beqz t2, L(loop_7bit)
+ addi.d a0, a0, -8
+
+L(more_check):
+ andn t2, a3, t0
+ and t3, t1, t2
+ bnez t3, L(count_pos)
+ addi.d a0, a0, 8
+
+L(loop_8bit):
+ ld.d t0, a0, 0
+
+ sub.d t1, t0, a2
+ andn t2, a3, t0
+ and t3, t1, t2
+ bnez t3, L(count_pos)
+
+ ld.d t0, a0, 8
+ addi.d a0, a0, 16
+ sub.d t1, t0, a2
+
+ andn t2, a3, t0
+ and t3, t1, t2
+ beqz t3, L(loop_8bit)
+
+ addi.d a0, a0, -8
+ b L(count_pos)
+
+END(RAWMEMCHR_NAME)
+
+#ifdef _LIBC
+weak_alias (__rawmemchr, rawmemchr)
+libc_hidden_builtin_def (__rawmemchr)
+#endif
diff --git a/sysdeps/loongarch/lp64/rawmemchr.S b/sysdeps/loongarch/lp64/rawmemchr.S
deleted file mode 100644
index ef1db7ed..00000000
--- a/sysdeps/loongarch/lp64/rawmemchr.S
+++ /dev/null
@@ -1,113 +0,0 @@
-#ifdef _LIBC
-#include <sysdep.h>
-#include <sys/regdef.h>
-#include <sys/asm.h>
-#else
-#include <sys/asm.h>
-#include <sys/regdef.h>
-#endif
-
-#ifndef RAWMEMCHR_NAME
-# define RAWMEMCHR_NAME __rawmemchr
-#endif
-
-
-LEAF(RAWMEMCHR_NAME, 6)
- andi t1, a0, 0x7
- bstrins.d a0, zero, 2, 0
- lu12i.w a2, 0x01010
- bstrins.d a1, a1, 15, 8
-
- ld.d t0, a0, 0
- slli.d t1, t1, 3
- ori a2, a2, 0x101
- bstrins.d a1, a1, 31, 16
-
- li.w t8, -1
- bstrins.d a1, a1, 63, 32
- bstrins.d a2, a2, 63, 32
- sll.d t2, t8, t1
-
- sll.d t3, a1, t1
- orn t0, t0, t2
- slli.d a3, a2, 7
- beqz a1, L(find_zero)
-
- xor t0, t0, t3
- sub.d t1, t0, a2
- andn t2, a3, t0
- and t3, t1, t2
-
- bnez t3, L(count_pos)
- addi.d a0, a0, 8
-
-L(loop):
- ld.d t0, a0, 0
- xor t0, t0, a1
-
- sub.d t1, t0, a2
- andn t2, a3, t0
- and t3, t1, t2
- bnez t3, L(count_pos)
-
- ld.d t0, a0, 8
- addi.d a0, a0, 16
- xor t0, t0, a1
- sub.d t1, t0, a2
-
- andn t2, a3, t0
- and t3, t1, t2
- beqz t3, L(loop)
- addi.d a0, a0, -8
-L(count_pos):
- ctz.d t0, t3
- srli.d t0, t0, 3
- add.d a0, a0, t0
- jr ra
-
-L(loop_7bit):
- ld.d t0, a0, 0
-L(find_zero):
- sub.d t1, t0, a2
- and t2, t1, a3
- bnez t2, L(more_check)
-
- ld.d t0, a0, 8
- addi.d a0, a0, 16
- sub.d t1, t0, a2
- and t2, t1, a3
-
- beqz t2, L(loop_7bit)
- addi.d a0, a0, -8
-
-L(more_check):
- andn t2, a3, t0
- and t3, t1, t2
- bnez t3, L(count_pos)
- addi.d a0, a0, 8
-
-L(loop_8bit):
- ld.d t0, a0, 0
-
- sub.d t1, t0, a2
- andn t2, a3, t0
- and t3, t1, t2
- bnez t3, L(count_pos)
-
- ld.d t0, a0, 8
- addi.d a0, a0, 16
- sub.d t1, t0, a2
-
- andn t2, a3, t0
- and t3, t1, t2
- beqz t3, L(loop_8bit)
-
- addi.d a0, a0, -8
- b L(count_pos)
-
-END(RAWMEMCHR_NAME)
-
-#ifdef _LIBC
-weak_alias (__rawmemchr, rawmemchr)
-libc_hidden_builtin_def (__rawmemchr)
-#endif
--
2.33.0
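
Note (not part of the patch): the memchr/rawmemchr routines moved by this patch rely on the classic word-at-a-time byte-detection trick. In the assembly, a3 holds 0x0101010101010101 and a4 holds 0x8080808080808080, and the sub.d/andn/and sequence computes (x - 0x0101...) & ~x & 0x8080..., which is non-zero exactly when the 64-bit word x contains a zero byte; XOR-ing the loaded word with the replicated search byte first turns "find byte c" into "find a zero byte". Below is a minimal C sketch of the same idea, assuming a little-endian 64-bit target (as LoongArch LP64 is); the helper names has_zero and memchr_words are illustrative only, and the unaligned head/tail handling done by the assembly is elided.

    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>

    #define ONES  0x0101010101010101ULL
    #define HIGHS 0x8080808080808080ULL

    /* Non-zero iff the 64-bit word w contains a zero byte
       (the sub.d/andn/and sequence in the assembly).  Only the
       lowest set 0x80 bit is guaranteed to mark a real zero byte.  */
    static inline uint64_t
    has_zero (uint64_t w)
    {
      return (w - ONES) & ~w & HIGHS;
    }

    /* Word-at-a-time scan for byte c; the assembly additionally
       handles unaligned starts and exact tail lengths.  */
    static const void *
    memchr_words (const void *s, int c, size_t n)
    {
      const unsigned char *p = s;
      uint64_t pattern = ONES * (unsigned char) c;  /* byte c replicated */

      while (n >= 8)
        {
          uint64_t w;
          memcpy (&w, p, 8);                /* the assembly uses aligned ld.d */
          uint64_t hit = has_zero (w ^ pattern);
          if (hit)
            /* Lowest set 0x80 bit marks the first matching byte on a
               little-endian target (ctz.d + srli.d in the assembly).  */
            return p + (__builtin_ctzll (hit) >> 3);
          p += 8;
          n -= 8;
        }
      for (; n != 0; --n, ++p)
        if (*p == (unsigned char) c)
          return p;
      return NULL;
    }

rawmemchr is the same inner loop without the length bookkeeping, and when the search byte is zero the XOR step is skipped entirely (the beqz a1, L(find_zero) branch in the assembly above).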