From b720fd44df475685ea164491d76c42e127aab3ea Mon Sep 17 00:00:00 2001
From: caiyinyu
Date: Wed, 21 Jun 2023 10:49:39 +0800
Subject: [PATCH 07/14] glibc-2.28: Refactor code of st{r,p}* functions.

Change-Id: Ife977373e9ba071b284ee19ca4ba121bc27d5834
Signed-off-by: ticat_fp
---
 .../loongarch/lp64/multiarch/stpcpy-aligned.S | 179 +++++++++++-
 .../loongarch/lp64/multiarch/strchr-aligned.S | 91 ++++++-
 .../lp64/multiarch/strchrnul-aligned.S | 94 ++++++-
 .../loongarch/lp64/multiarch/strcmp-aligned.S | 225 ++++++++++++++-
 .../loongarch/lp64/multiarch/strcpy-aligned.S | 173 +++++++++++-
 .../loongarch/lp64/multiarch/strlen-aligned.S | 85 +++++-
 .../lp64/multiarch/strncmp-aligned.S | 256 +++++++++++++++++-
 .../lp64/multiarch/strnlen-aligned.S | 82 +++++-
 .../lp64/multiarch/strrchr-aligned.S | 105 ++++++-
 sysdeps/loongarch/lp64/stpcpy.S | 179 ------------
 sysdeps/loongarch/lp64/strchr.S | 89 ------
 sysdeps/loongarch/lp64/strchrnul.S | 94 -------
 sysdeps/loongarch/lp64/strcmp.S | 227 ----------------
 sysdeps/loongarch/lp64/strcpy.S | 173 ------------
 sysdeps/loongarch/lp64/strlen.S | 85 ------
 sysdeps/loongarch/lp64/strncmp.S | 256 ------------------
 sysdeps/loongarch/lp64/strnlen.S | 82 ------
 sysdeps/loongarch/lp64/strrchr.S | 105 -------
 18 files changed, 1264 insertions(+), 1316 deletions(-)
 delete mode 100644 sysdeps/loongarch/lp64/stpcpy.S
 delete mode 100644 sysdeps/loongarch/lp64/strchr.S
 delete mode 100644 sysdeps/loongarch/lp64/strchrnul.S
 delete mode 100644 sysdeps/loongarch/lp64/strcmp.S
 delete mode 100644 sysdeps/loongarch/lp64/strcpy.S
 delete mode 100644 sysdeps/loongarch/lp64/strlen.S
 delete mode 100644 sysdeps/loongarch/lp64/strncmp.S
 delete mode 100644 sysdeps/loongarch/lp64/strnlen.S
 delete mode 100644 sysdeps/loongarch/lp64/strrchr.S

diff --git a/sysdeps/loongarch/lp64/multiarch/stpcpy-aligned.S b/sysdeps/loongarch/lp64/multiarch/stpcpy-aligned.S
index 3d134e3f..7109b0f0 100644
--- a/sysdeps/loongarch/lp64/multiarch/stpcpy-aligned.S
+++ b/sysdeps/loongarch/lp64/multiarch/stpcpy-aligned.S
@@ -1,8 +1,181 @@
+#ifdef _LIBC
+#include
+#include
+#include
+#else
+#include
+#include
+#endif
 #if IS_IN (libc)
-
 #define STPCPY_NAME __stpcpy_aligned
-
+#else
+#define STPCPY_NAME __stpcpy
 #endif
-#include "../stpcpy.S"
+LEAF(STPCPY_NAME, 6)
+ andi a3, a0, 0x7
+ beqz a3, L(dest_align)
+ sub.d a5, a1, a3
+ addi.d a5, a5, 8
+
+L(make_dest_align):
+ ld.b t0, a1, 0
+ addi.d a1, a1, 1
+ st.b t0, a0, 0
+ addi.d a0, a0, 1
+
+ beqz t0, L(al_out)
+ bne a1, a5, L(make_dest_align)
+
+L(dest_align):
+ andi a4, a1, 7
+ bstrins.d a1, zero, 2, 0
+
+ lu12i.w t5, 0x1010
+ ld.d t0, a1, 0
+ ori t5, t5, 0x101
+ bstrins.d t5, t5, 63, 32
+
+ slli.d t6, t5, 0x7
+ bnez a4, L(unalign)
+ sub.d t1, t0, t5
+ andn t2, t6, t0
+
+ and t3, t1, t2
+ bnez t3, L(al_end)
+
+L(al_loop):
+ st.d t0, a0, 0
+ ld.d t0, a1, 8
+
+ addi.d a1, a1, 8
+ addi.d a0, a0, 8
+ sub.d t1, t0, t5
+ andn t2, t6, t0
+
+ and t3, t1, t2
+ beqz t3, L(al_loop)
+
+L(al_end):
+ ctz.d t1, t3
+ srli.d t1, t1, 3
+ addi.d t1, t1, 1 # add 1, since '\0' needs to be copied to dest
+
+ andi a3, t1, 8
+ andi a4, t1, 4
+ andi a5, t1, 2
+ andi a6, t1, 1
+
+L(al_end_8):
+ beqz a3, L(al_end_4)
+ st.d t0, a0, 0
+ addi.d a0, a0, 7
+ jr ra
+L(al_end_4):
+ beqz a4, L(al_end_2)
+ st.w t0, a0, 0
+ addi.d a0, a0, 4
+ srli.d t0, t0, 32
+L(al_end_2):
+ beqz a5, L(al_end_1)
+ st.h t0, a0, 0
+ addi.d a0, a0, 2
+ srli.d t0, t0, 16
+L(al_end_1):
+ beqz a6, L(al_out)
+ st.b t0, a0, 0
+ addi.d a0, a0, 1
+L(al_out):
+ addi.d a0, a0, -1
+ jr ra
+
+L(unalign):
+ slli.d a5, a4, 3
+ li.d t1,
-1 + sub.d a6, zero, a5 + + srl.d a7, t0, a5 + sll.d t7, t1, a6 + + or t0, a7, t7 + sub.d t1, t0, t5 + andn t2, t6, t0 + and t3, t1, t2 + + bnez t3, L(un_end) + + ld.d t4, a1, 8 + addi.d a1, a1, 8 + + sub.d t1, t4, t5 + andn t2, t6, t4 + sll.d t0, t4, a6 + and t3, t1, t2 + + or t0, t0, a7 + bnez t3, L(un_end_with_remaining) + +L(un_loop): + srl.d a7, t4, a5 + + ld.d t4, a1, 8 + addi.d a1, a1, 8 + + st.d t0, a0, 0 + addi.d a0, a0, 8 + + sub.d t1, t4, t5 + andn t2, t6, t4 + sll.d t0, t4, a6 + and t3, t1, t2 + + or t0, t0, a7 + beqz t3, L(un_loop) + +L(un_end_with_remaining): + ctz.d t1, t3 + srli.d t1, t1, 3 + addi.d t1, t1, 1 + sub.d t1, t1, a4 + + blt t1, zero, L(un_end_less_8) + st.d t0, a0, 0 + addi.d a0, a0, 8 + beqz t1, L(un_out) + srl.d t0, t4, a5 # get the remaining part + b L(un_end_less_8) + +L(un_end): + ctz.d t1, t3 + srli.d t1, t1, 3 + addi.d t1, t1, 1 + +L(un_end_less_8): + andi a4, t1, 4 + andi a5, t1, 2 + andi a6, t1, 1 +L(un_end_4): + beqz a4, L(un_end_2) + st.w t0, a0, 0 + addi.d a0, a0, 4 + srli.d t0, t0, 32 +L(un_end_2): + beqz a5, L(un_end_1) + st.h t0, a0, 0 + addi.d a0, a0, 2 + srli.d t0, t0, 16 +L(un_end_1): + beqz a6, L(un_out) + st.b t0, a0, 0 + addi.d a0, a0, 1 +L(un_out): + addi.d a0, a0, -1 + jr ra + +END(STPCPY_NAME) + +#ifdef _LIBC +weak_alias (STPCPY_NAME, stpcpy) +libc_hidden_builtin_def (STPCPY_NAME) +#endif diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-aligned.S b/sysdeps/loongarch/lp64/multiarch/strchr-aligned.S index 92365658..d9bd4587 100644 --- a/sysdeps/loongarch/lp64/multiarch/strchr-aligned.S +++ b/sysdeps/loongarch/lp64/multiarch/strchr-aligned.S @@ -1,10 +1,95 @@ -#if IS_IN (libc) -#define STRCHR_NAME __strchr_aligned +#ifdef _LIBC +#include +#include +#include +#else +#include +#include +#endif +#if IS_IN (libc) +#define STRCHR_NAME __strchr_aligned +#else +#define STRCHR_NAME strchr #endif -#include "../strchr.S" +/* char * strchr (const char *s1, int c); */ + +LEAF(STRCHR_NAME, 6) + slli.d t1, a0, 3 + bstrins.d a0, zero, 2, 0 + lu12i.w a2, 0x01010 + ld.d t2, a0, 0 + + ori a2, a2, 0x101 + andi a1, a1, 0xff + bstrins.d a2, a2, 63, 32 + li.w t0, -1 + + mul.d a1, a1, a2 # "cccccccc" + sll.d t0, t0, t1 + slli.d a3, a2, 7 # 0x8080808080808080 + orn t2, t2, t0 + + sll.d t3, a1, t1 + xor t4, t2, t3 + sub.d a7, t2, a2 + andn a6, a3, t2 + + + sub.d a5, t4, a2 + andn a4, a3, t4 + and a6, a7, a6 + and a5, a5, a4 + + or t0, a6, a5 + bnez t0, L(_mc8_a) + addi.d a0, a0, 8 +L(_aloop): + ld.d t4, a0, 0 + + xor t2, t4, a1 + sub.d a7, t4, a2 + andn a6, a3, t4 + sub.d a5, t2, a2 + + andn a4, a3, t2 + and a6, a7, a6 + and a5, a5, a4 + or a7, a6, a5 + + + bnez a7, L(_mc8_a) + ld.d t4, a0, 8 + addi.d a0, a0, 16 + xor t2, t4, a1 + + sub.d a7, t4, a2 + andn a6, a3, t4 + sub.d a5, t2, a2 + andn a4, a3, t2 + + and a6, a7, a6 + and a5, a5, a4 + or a7, a6, a5 + beqz a7, L(_aloop) + + addi.d a0, a0, -8 + +L(_mc8_a): + ctz.d t0, a5 + ctz.d t2, a6 + srli.w t0, t0, 3 + + + srli.w t2, t2, 3 + sltu t1, t2, t0 + add.d a0, a0, t0 + masknez a0, a0, t1 + + jr ra +END(STRCHR_NAME) weak_alias (STRCHR_NAME, index) diff --git a/sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S b/sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S index 4fa63ecc..f18b01a3 100644 --- a/sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S +++ b/sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S @@ -1,8 +1,96 @@ +#ifdef _LIBC +#include +#include +#include +#else +#include +#include +#endif #if IS_IN (libc) - #define STRCHRNUL_NAME __strchrnul_aligned - +#else +#define STRCHRNUL_NAME __strchrnul 
#endif -#include "../strchrnul.S" +/* char * strchrnul (const char *s1, int c); */ + +LEAF(STRCHRNUL_NAME, 6) + slli.d t1, a0, 3 + bstrins.d a0, zero, 2, 0 + lu12i.w a2, 0x01010 + ld.d t2, a0, 0 + + ori a2, a2, 0x101 + andi a1, a1, 0xff + bstrins.d a2, a2, 63, 32 + li.w t0, -1 + + mul.d a1, a1, a2 # "cccccccc" + sll.d t0, t0, t1 + slli.d a3, a2, 7 # 0x8080808080808080 + orn t2, t2, t0 + + sll.d t3, a1, t1 + xor t4, t2, t3 + sub.d a7, t2, a2 + andn a6, a3, t2 + + + sub.d a5, t4, a2 + andn a4, a3, t4 + and a6, a7, a6 + and a5, a5, a4 + + or t0, a6, a5 + bnez t0, L(_mc8_a) + addi.d a0, a0, 8 +L(_aloop): + ld.d t4, a0, 0 + + xor t2, t4, a1 + sub.d a7, t4, a2 + andn a6, a3, t4 + sub.d a5, t2, a2 + + andn a4, a3, t2 + and a6, a7, a6 + and a5, a5, a4 + or a7, a6, a5 + + + bnez a7, L(_mc8_a) + ld.d t4, a0, 8 + addi.d a0, a0, 16 + xor t2, t4, a1 + + sub.d a7, t4, a2 + andn a6, a3, t4 + sub.d a5, t2, a2 + andn a4, a3, t2 + + and a6, a7, a6 + and a5, a5, a4 + or a7, a6, a5 + beqz a7, L(_aloop) + + addi.d a0, a0, -8 +L(_mc8_a): + ctz.d t0, a5 + ctz.d t2, a6 + srli.w t0, t0, 3 + + srli.w t2, t2, 3 + slt t1, t0, t2 + masknez t3, t2, t1 + maskeqz t4, t0, t1 + + or t0, t3, t4 + add.d a0, a0, t0 + jr ra +END(STRCHRNUL_NAME) + +#ifdef _LIBC +weak_alias(STRCHRNUL_NAME, strchrnul) +libc_hidden_builtin_def (STRCHRNUL_NAME) +#endif diff --git a/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S b/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S index f84f52b8..a9b74b0c 100644 --- a/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S +++ b/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S @@ -1,8 +1,229 @@ +/* 2022\06\15 loongarch64 author: chenxiaolong. */ -#if IS_IN (libc) +#ifdef _LIBC +#include +#include +#include +#else +#include +#include +#endif +#if IS_IN (libc) #define STRCMP_NAME __strcmp_aligned +#else +#define STRCMP_NAME strcmp +#endif + +/* int strcmp (const char *s1, const char *s2); */ + +/* Parameters and Results */ +#define src1 a0 +#define src2 a1 +#define result v0 +LEAF(STRCMP_NAME, 6) + xor a4, src1, src2 + lu12i.w t5, 0x01010 + lu12i.w t6, 0x7f7f7 + andi a2, src1, 0x7 + + ori t5, t5, 0x101 + andi a4, a4, 0x7 + ori t6, t6, 0xf7f + bstrins.d t5, t5, 63, 32 + bstrins.d t6, t6, 63, 32 + + bnez a4, 3f // unaligned + beqz a2, 1f // loop aligned + +// mutual aligned + bstrins.d src1, zero, 2, 0 + bstrins.d src2, zero, 2, 0 + slli.d a4, a2, 0x3 + ld.d t0, src1, 0 + + sub.d a4, zero, a4 + ld.d t1, src2, 0 + addi.d src1, src1, 8 + addi.d src2, src2, 8 + + nor a5, zero, zero + srl.d a5, a5, a4 + or t0, t0, a5 + + or t1, t1, a5 + b 2f //start realigned + +// loop aligned +1: + ld.d t0, src1, 0 + addi.d src1, src1, 8 + ld.d t1, src2, 0 + addi.d src2, src2, 8 + +// start realigned: +2: + sub.d t2, t0, t5 + nor t3, t0, t6 + and t2, t2, t3 + + xor t3, t0, t1 + or t2, t2, t3 + beqz t2, 1b + + ctz.d t7, t2 + bstrins.d t7, zero, 2, 0 + srl.d t0, t0, t7 + srl.d t1, t1, t7 + + andi t0, t0, 0xff + andi t1, t1, 0xff + sub.d v0, t0, t1 + jr ra + +// unaligned +3: + andi a3, src2, 0x7 + slt a5, a2, a3 + masknez t8, a2, a5 + xor a6, src1, src2 + maskeqz a6, a6, t8 + xor src1, src1, a6 + xor src2, src2, a6 + + andi a2, src1, 0x7 + beqz a2, 4f // src1 is aligned + +//strcmp_unaligned: + andi a3, src2, 0x7 + bstrins.d src1, zero, 2, 0 + bstrins.d src2, zero, 2, 0 + nor t3, zero, zero + + ld.d t0, src1, 0 + ld.d t1, src2, 0 + sub.d a2, a3, a2 + addi.d t2, zero, 8 + + sub.d a5, t2, a2 + sub.d a6, t2, a3 + slli.d a5, a5, 0x3 + slli.d a6, a6, 0x3 + + srl.d t4, t3, a6 + srl.d a4, t3, a5 + rotr.d a7, t0, a5 + + addi.d src2, 
src2, 8 + addi.d src1, src1, 8 + or t1, t1, t4 + or t0, a7, t4 + + sub.d t2, t0, t5 + nor t3, t0, t6 + and t2, t2, t3 + xor t3, t0, t1 + or t2, t2, t3 + bnez t2, 7f + + and a7, a7, a4 + slli.d a6, a2, 0x3 + nor a4, zero, a4 + b 5f + +// src1 is aligned +4: + andi a3, src2, 0x7 + ld.d t0, src1, 0 + + bstrins.d src2, zero, 2, 0 + nor t2, zero, zero + ld.d t1, src2, 0 + + addi.d t3, zero, 0x8 + sub.d a5, t3, a3 + slli.d a5, a5, 0x3 + srl.d a4, t2, a5 + rotr.d t4, t0, a5 + + addi.d src2, src2, 8 + addi.d src1, src1, 8 + or t1, t1, a4 + or t0, t4, a4 + + sub.d t2, t0, t5 + nor t3, t0, t6 + and t2, t2, t3 + xor t3, t0, t1 + or t2, t2, t3 + + bnez t2, 7f + + and a7, t4, a4 + slli.d a6, a3, 0x3 + nor a4, zero, a4 + +// unaligned loop +// a7: remaining number +// a6: shift left number +// a5: shift right number +// a4: mask for checking remaining number +5: + or t0, a7, a4 + sub.d t2, t0, t5 + nor t3, t0, t6 + and t2, t2, t3 + bnez t2, 6f + + ld.d t0, src1, 0 + addi.d src1, src1, 8 + ld.d t1, src2, 0 + addi.d src2, src2, 8 + + srl.d t7, t0, a5 + sll.d t0, t0, a6 + or t0, a7, t0 + + sub.d t2, t0, t5 + nor t3, t0, t6 + and t2, t2, t3 + xor t3, t0, t1 + or t2, t2, t3 + bnez t2, 7f + + or a7, t7, zero + b 5b + +6: + ld.bu t1, src2, 0 + andi t0, a7, 0xff + xor t2, t0, t1 + srli.d a7, a7, 0x8 + masknez t2, t0, t2 + addi.d src2, src2, 1 + beqz t2, 8f + b 6b + +7: + ctz.d t7, t2 + bstrins.d t7, zero, 2, 0 + srl.d t0, t0, t7 + srl.d t1, t1, t7 + + andi t0, t0, 0xff + andi t1, t1, 0xff + +8: + sub.d a4, t0, t1 + sub.d a5, t1, t0 + maskeqz a6, a5, t8 + masknez result, a4, t8 + or result, result, a6 + jr ra + +END(STRCMP_NAME) +#ifdef _LIBC +libc_hidden_builtin_def (STRCMP_NAME) #endif -#include "../strcmp.S" diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S b/sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S index 4860398b..80954912 100644 --- a/sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S +++ b/sysdeps/loongarch/lp64/multiarch/strcpy-aligned.S @@ -1,8 +1,175 @@ +#ifdef _LIBC +#include +#include +#include +#else +#include +#include +#endif #if IS_IN (libc) - #define STRCPY __strcpy_aligned - +#else +#define STRCPY strcpy #endif -#include "../strcpy.S" +LEAF(STRCPY, 6) + andi a3, a0, 0x7 + move a2, a0 + beqz a3, L(dest_align) + sub.d a5, a1, a3 + addi.d a5, a5, 8 + +L(make_dest_align): + ld.b t0, a1, 0 + addi.d a1, a1, 1 + st.b t0, a2, 0 + beqz t0, L(al_out) + + addi.d a2, a2, 1 + bne a1, a5, L(make_dest_align) + +L(dest_align): + andi a4, a1, 7 + bstrins.d a1, zero, 2, 0 + + lu12i.w t5, 0x1010 + ld.d t0, a1, 0 + ori t5, t5, 0x101 + bstrins.d t5, t5, 63, 32 + + slli.d t6, t5, 0x7 + bnez a4, L(unalign) + sub.d t1, t0, t5 + andn t2, t6, t0 + + and t3, t1, t2 + bnez t3, L(al_end) + +L(al_loop): + st.d t0, a2, 0 + ld.d t0, a1, 8 + + addi.d a1, a1, 8 + addi.d a2, a2, 8 + sub.d t1, t0, t5 + andn t2, t6, t0 + + and t3, t1, t2 + beqz t3, L(al_loop) + +L(al_end): + ctz.d t1, t3 + srli.d t1, t1, 3 + addi.d t1, t1, 1 # add 1, since '\0' needs to be copied to dest + + andi a3, t1, 8 + andi a4, t1, 4 + andi a5, t1, 2 + andi a6, t1, 1 + +L(al_end_8): + beqz a3, L(al_end_4) + st.d t0, a2, 0 + jr ra +L(al_end_4): + beqz a4, L(al_end_2) + st.w t0, a2, 0 + addi.d a2, a2, 4 + srli.d t0, t0, 32 +L(al_end_2): + beqz a5, L(al_end_1) + st.h t0, a2, 0 + addi.d a2, a2, 2 + srli.d t0, t0, 16 +L(al_end_1): + beqz a6, L(al_out) + st.b t0, a2, 0 +L(al_out): + jr ra + +L(unalign): + slli.d a5, a4, 3 + li.d t1, -1 + sub.d a6, zero, a5 + + srl.d a7, t0, a5 + sll.d t7, t1, a6 + + or t0, a7, t7 + sub.d t1, t0, t5 + andn t2, 
t6, t0 + and t3, t1, t2 + + bnez t3, L(un_end) + + ld.d t4, a1, 8 + + sub.d t1, t4, t5 + andn t2, t6, t4 + sll.d t0, t4, a6 + and t3, t1, t2 + + or t0, t0, a7 + bnez t3, L(un_end_with_remaining) + +L(un_loop): + srl.d a7, t4, a5 + + ld.d t4, a1, 16 + addi.d a1, a1, 8 + + st.d t0, a2, 0 + addi.d a2, a2, 8 + + sub.d t1, t4, t5 + andn t2, t6, t4 + sll.d t0, t4, a6 + and t3, t1, t2 + + or t0, t0, a7 + beqz t3, L(un_loop) + +L(un_end_with_remaining): + ctz.d t1, t3 + srli.d t1, t1, 3 + addi.d t1, t1, 1 + sub.d t1, t1, a4 + + blt t1, zero, L(un_end_less_8) + st.d t0, a2, 0 + addi.d a2, a2, 8 + beqz t1, L(un_out) + srl.d t0, t4, a5 # get the remaining part + b L(un_end_less_8) + +L(un_end): + ctz.d t1, t3 + srli.d t1, t1, 3 + addi.d t1, t1, 1 + +L(un_end_less_8): + andi a4, t1, 4 + andi a5, t1, 2 + andi a6, t1, 1 +L(un_end_4): + beqz a4, L(un_end_2) + st.w t0, a2, 0 + addi.d a2, a2, 4 + srli.d t0, t0, 32 +L(un_end_2): + beqz a5, L(un_end_1) + st.h t0, a2, 0 + addi.d a2, a2, 2 + srli.d t0, t0, 16 +L(un_end_1): + beqz a6, L(un_out) + st.b t0, a2, 0 +L(un_out): + jr ra + +END(STRCPY) + +#ifdef _LIBC +libc_hidden_builtin_def (STRCPY) +#endif diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S b/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S index d31875fd..fcbc4f6a 100644 --- a/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S +++ b/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S @@ -1,8 +1,87 @@ +#ifdef _LIBC +#include +#include +#include +#else +#include +#include +#endif #if IS_IN (libc) - #define STRLEN __strlen_aligned - +#else +#define STRLEN strlen #endif -#include "../strlen.S" +LEAF(STRLEN, 6) + move a1, a0 + bstrins.d a0, zero, 2, 0 + lu12i.w a2, 0x01010 + li.w t0, -1 + + ld.d t2, a0, 0 + andi t1, a1, 0x7 + ori a2, a2, 0x101 + slli.d t1, t1, 3 + + bstrins.d a2, a2, 63, 32 + sll.d t1, t0, t1 + slli.d t3, a2, 7 + nor a3, zero, t3 + + orn t2, t2, t1 + sub.d t0, t2, a2 + nor t1, t2, a3 + and t0, t0, t1 + + + bnez t0, L(count_pos) + addi.d a0, a0, 8 +L(loop_16_7bit): + ld.d t2, a0, 0 + sub.d t1, t2, a2 + + and t0, t1, t3 + bnez t0, L(more_check) + ld.d t2, a0, 8 + addi.d a0, a0, 16 + + sub.d t1, t2, a2 + and t0, t1, t3 + beqz t0, L(loop_16_7bit) + addi.d a0, a0, -8 +L(more_check): + nor t0, t2, a3 + + and t0, t1, t0 + bnez t0, L(count_pos) + addi.d a0, a0, 8 +L(loop_16_8bit): + ld.d t2, a0, 0 + + sub.d t1, t2, a2 + nor t0, t2, a3 + and t0, t0, t1 + bnez t0, L(count_pos) + + ld.d t2, a0, 8 + addi.d a0, a0, 16 + sub.d t1, t2, a2 + nor t0, t2, a3 + + and t0, t0, t1 + beqz t0, L(loop_16_8bit) + addi.d a0, a0, -8 +L(count_pos): + ctz.d t1, t0 + sub.d a0, a0, a1 + + srli.d t1, t1, 3 + add.d a0, a0, t1 + jr ra + +END(STRLEN) + +#ifdef _LIBC +libc_hidden_builtin_def (STRLEN) +#endif diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S b/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S index f371b19e..2cd56c44 100644 --- a/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S +++ b/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S @@ -1,8 +1,258 @@ +#ifdef _LIBC +#include +#include +#include +#else +#include +#include +#endif #if IS_IN (libc) - #define STRNCMP __strncmp_aligned - +#else +#define STRNCMP strncmp #endif -#include "../strncmp.S" +/* int strncmp (const char *s1, const char *s2); */ + +LEAF(STRNCMP, 6) + beqz a2, L(ret0) + xor a4, a0, a1 + lu12i.w t5, 0x01010 + lu12i.w t6, 0x7f7f7 + + andi a3, a0, 0x7 + ori t5, t5, 0x101 + andi a4, a4, 0x7 + ori t6, t6, 0xf7f + + bstrins.d t5, t5, 63, 32 + bstrins.d t6, t6, 63, 32 + + bnez a4, L(unalign) + bnez a3, L(mutual_align) + 
+L(a_loop): + ld.d t0, a0, 0 + ld.d t1, a1, 0 + addi.d a0, a0, 8 + addi.d a1, a1, 8 + + + sltui t7, a2, 9 + +L(start_realign): + sub.d t2, t0, t5 + nor t3, t0, t6 + xor t4, t0, t1 + + and t2, t2, t3 + addi.d a2, a2, -8 + + or t2, t2, t4 + or t3, t2, t7 + beqz t3, L(a_loop) + +L(end): + bge zero, t7, L(out) + andi t4, a2, 7 + li.d t3, -1 + addi.d t4, t4, -1 + slli.d t4, t4, 3 + sll.d t3, t3, t4 + or t2, t2, t3 + + +L(out): + ctz.d t3, t2 + bstrins.d t3, zero, 2, 0 + srl.d t0, t0, t3 + srl.d t1, t1, t3 + + andi t0, t0, 0xff + andi t1, t1, 0xff + sub.d a0, t0, t1 + jr ra + +L(mutual_align): + bstrins.d a0, zero, 2, 0 + bstrins.d a1, zero, 2, 0 + slli.d a5, a3, 0x3 + li.d t2, -1 + + ld.d t0, a0, 0 + ld.d t1, a1, 0 + + li.d t3, 9 + sll.d t2, t2, a5 + + sub.d t3, t3, a3 + addi.d a0, a0, 8 + + sltu t7, a2, t3 + addi.d a1, a1, 8 + + add.d a2, a2, a3 + orn t0, t0, t2 + orn t1, t1, t2 + b L(start_realign) + +L(ret0): + move a0, zero + jr ra + +L(unalign): + li.d t8, 8 + blt a2, t8, L(short_cmp) + + # swap a0 and a1 in case a3 > a4 + andi a4, a1, 0x7 + sltu t8, a4, a3 + xor a6, a0, a1 + maskeqz a6, a6, t8 + xor a0, a0, a6 + xor a1, a1, a6 + + andi a3, a0, 0x7 + andi a4, a1, 0x7 + + bstrins.d a0, zero, 2, 0 + bstrins.d a1, zero, 2, 0 + + li.d t2, -1 + li.d t3, 9 + + ld.d t0, a0, 0 + ld.d t1, a1, 0 + + sub.d t3, t3, a4 + sub.d a3, a4, a3 + + slli.d t4, a4, 3 + slli.d a6, a3, 3 + + sub.d a5, zero, a6 + sltu t7, a2, t3 + + rotr.d a7, t0, a5 + sll.d t4, t2, t4 # mask for first num + + add.d a2, a2, a4 + sll.d a4, t2, a6 # mask for a7 + + orn t0, a7, t4 + orn t1, t1, t4 + + sub.d t2, t0, t5 + nor t4, t0, t6 + and t2, t2, t4 + + xor t3, t0, t1 + or t2, t2, t3 + + or t3, t2, t7 + bnez t3, L(un_end) + + andn a7, a7, a4 + addi.d a3, a3, 1 + +L(un_loop): + addi.d a2, a2, -8 + # in case remaining part has '\0', no more load instructions should be executed on a0 address + or t0, a7, a4 + sltu t7, a2, a3 + + sub.d t2, t0, t5 + nor t3, t0, t6 + and t2, t2, t3 + + or t3, t2, t7 + bnez t3, L(check_remaining) + + ld.d t7, a0, 8 + ld.d t1, a1, 8 + addi.d a0, a0, 8 + addi.d a1, a1, 8 + + sll.d t4, t7, a6 + sub.d t2, t1, t5 + nor t3, t1, t6 + + or t0, t4, a7 + srl.d a7, t7, a5 + + and t2, t2, t3 + xor t3, t0, t1 + + sltui t7, a2, 9 + or t2, t2, t3 + + or t3, t2, t7 + beqz t3, L(un_loop) + b L(un_end) + +L(check_remaining): + ld.d t1, a1, 8 + xor t3, t1, a7 + or t2, t2, t3 + +L(un_end): + bge zero, t7, L(un_out) + andi t4, a2, 7 + li.d t3, -1 + + addi.d t4, t4, -1 + slli.d t4, t4, 3 + sll.d t3, t3, t4 + or t2, t2, t3 + +L(un_out): + ctz.d t3, t2 + bstrins.d t3, zero, 2, 0 + srl.d t0, t0, t3 + srl.d t1, t1, t3 + + andi t0, t0, 0xff + andi t1, t1, 0xff + + sub.d a4, t0, t1 + sub.d a5, t1, t0 + + maskeqz a6, a5, t8 + masknez a0, a4, t8 + + or a0, a0, a6 + jr ra + +L(short_cmp): + ld.bu t0, a0, 0 + ld.bu t1, a1, 0 + addi.d a2, a2, -1 + + xor t2, t0, t1 + masknez t2, t0, t2 + maskeqz t2, a2, t2 + + beqz t2, L(short_out) + + ld.bu t0, a0, 1 + ld.bu t1, a1, 1 + + addi.d a2, a2, -1 + addi.d a0, a0, 2 + + addi.d a1, a1, 2 + xor t2, t0, t1 + masknez t2, t0, t2 + maskeqz t2, a2, t2 + + bnez t2, L(short_cmp) + +L(short_out): + sub.d a0, t0, t1 + jr ra + +END(STRNCMP) +#ifdef _LIBC +libc_hidden_builtin_def (STRNCMP) +#endif diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S b/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S index 503442b3..78c8fd5d 100644 --- a/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S +++ b/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S @@ -1,8 +1,84 @@ +#ifdef _LIBC +#include +#include 
+#include +#else +#include +#include +#endif #if IS_IN (libc) - #define STRNLEN __strnlen_aligned - +#else +#define STRNLEN __strnlen #endif -#include "../strnlen.S" +#. before every load, a1(t5) must > 0; +#. first load with t1 != 0, need to adjust t5; +#. return the less one of both strlen(s) and a1; + +LEAF(STRNLEN, 6) + beqz a1, L(out) + lu12i.w a2, 0x01010 + andi t1, a0, 0x7 + move t4, a0 + + bstrins.d a0, zero, 2, 0 + ori a2, a2, 0x101 + li.w t0, -1 + ld.d t2, a0, 0 + + slli.d t3, t1, 3 + bstrins.d a2, a2, 63, 32 + li.w t5, 8 + slli.d a3, a2, 7 + + sub.w t1, t5, t1 + sll.d t0, t0, t3 + nor a3, zero, a3 + orn t2, t2, t0 + + + sub.d t0, t2, a2 + nor t3, t2, a3 + and t0, t0, t3 + bnez t0, L(count_pos) + + sub.d t5, a1, t1 + bgeu t1, a1, L(out) +L(loop_8bytes): + ld.d t2, a0, 8 + addi.d a0, a0, 8 + + sub.d t0, t2, a2 + nor t1, t2, a3 + sltui t6, t5, 9 + and t0, t0, t1 + + addi.d t5, t5, -8 + or t7, t0, t6 + beqz t7, L(loop_8bytes) +L(count_pos): + ctz.d t1, t0 + + + sub.d a0, a0, t4 + srli.d t1, t1, 3 + add.d a0, t1, a0 + sltu t0, a0, a1 + + masknez t1, a1, t0 + maskeqz a0, a0, t0 + or a0, a0, t1 + jr ra + +L(out): + move a0, a1 + jr ra + +END(STRNLEN) + +#ifdef _LIBC +weak_alias (STRNLEN, strnlen) +libc_hidden_builtin_def (STRNLEN) +#endif diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr-aligned.S b/sysdeps/loongarch/lp64/multiarch/strrchr-aligned.S index a58ddde8..6931045b 100644 --- a/sysdeps/loongarch/lp64/multiarch/strrchr-aligned.S +++ b/sysdeps/loongarch/lp64/multiarch/strrchr-aligned.S @@ -1,11 +1,110 @@ +#ifdef _LIBC +#include +#include +#include +#else +#include +#include +#endif #if IS_IN (libc) - #define STRRCHR_NAME __strrchr_aligned - +#else +#define STRRCHR_NAME strrchr #endif -#include "../strrchr.S" +LEAF(STRRCHR_NAME, 6) + slli.d t1, a0, 3 + bstrins.d a0, zero, 2, 0 + lu12i.w a2, 0x01010 + ld.d t2, a0, 0 // t2 = "5ZZ21abc" + + ori a2, a2, 0x101 + andi a1, a1, 0xff // a1 = "0000000Z" + li.d a5, -1 + bstrins.d a2, a2, 63, 32 // a2 = 0x0101010101010101 + + sll.d t1, a5, t1 // t1 = 0xffffffffff000000 + mul.d a1, a1, a2 // a1 = "ZZZZZZZZ" + orn t2, t2, t1 // t2 = "5ZZ21YYY" + slli.d a3, a2, 7 // a3 = 0x8080808080808080 + + sub.d a4, t2, a2 + andn t0, a3, t2 + move t3, zero + and t0, a4, t0 + + + xor a4, t2, a1 + move t5, zero + orn a4, a4, t1 + bnez t0, L(found_end) + + sub.d t1, a4, a2 + andn t0, a3, a4 + and t1, t1, t0 + +L(loop_8bytes): + masknez t4, t3, t1 + + maskeqz t3, t2, t1 + ld.d t2, a0, 8 + masknez t0, t5, t1 + maskeqz t5, a0, t1 + + or t3, t3, t4 + or t5, t0, t5 + sub.d t0, t2, a2 + andn t1, a3, t2 + + + xor a4, t2, a1 + and t0, t0, t1 //t0 hold diff pattern for '\0' + sub.d t1, a4, a2 + andn t4, a3, a4 + + and t1, t1, t4 //t1 hold diff pattern for 'a1' + addi.d a0, a0, 8 + beqz t0, L(loop_8bytes) //ok, neither \0 nor found +L(found_end): + ctz.d t1, t0 + + xor t3, t3, a1 + orn t1, zero, t1 + revb.d t3, t3 + srl.d t1, a5, t1 // mask for '\0' + + sub.d t4, t3, a2 + orn a4, a4, t1 + andn t3, a3, t3 + revb.d t2, a4 + + sub.d t0, t2, a2 + andn t1, a3, t2 + and t3, t3, t4 + and t1, t0, t1 + + li.d t7, 7 + masknez t4, t3, t1 + maskeqz t3, t1, t1 + masknez t5, t5, t1 + + or t3, t3, t4 + maskeqz t6, a0, t1 + ctz.d t0, t3 + or t5, t6, t5 + + srli.d t0, t0, 3 + sub.d t0, t7, t0 + add.d a0, t5, t0 + maskeqz a0, a0, t3 + + jr ra +END(STRRCHR_NAME) + +#ifdef _LIBC +libc_hidden_builtin_def(STRRCHR_NAME) +#endif #undef rindex weak_alias(STRRCHR_NAME, rindex) diff --git a/sysdeps/loongarch/lp64/stpcpy.S b/sysdeps/loongarch/lp64/stpcpy.S deleted file mode 100644 index 
b6a367dc..00000000 --- a/sysdeps/loongarch/lp64/stpcpy.S +++ /dev/null @@ -1,179 +0,0 @@ -#ifdef _LIBC -#include -#include -#include -#else -#include -#include -#endif - -#ifndef STPCPY_NAME -#define STPCPY_NAME __stpcpy -#endif - -LEAF(STPCPY_NAME, 6) - andi a3, a0, 0x7 - beqz a3, L(dest_align) - sub.d a5, a1, a3 - addi.d a5, a5, 8 - -L(make_dest_align): - ld.b t0, a1, 0 - addi.d a1, a1, 1 - st.b t0, a0, 0 - addi.d a0, a0, 1 - - beqz t0, L(al_out) - bne a1, a5, L(make_dest_align) - -L(dest_align): - andi a4, a1, 7 - bstrins.d a1, zero, 2, 0 - - lu12i.w t5, 0x1010 - ld.d t0, a1, 0 - ori t5, t5, 0x101 - bstrins.d t5, t5, 63, 32 - - slli.d t6, t5, 0x7 - bnez a4, L(unalign) - sub.d t1, t0, t5 - andn t2, t6, t0 - - and t3, t1, t2 - bnez t3, L(al_end) - -L(al_loop): - st.d t0, a0, 0 - ld.d t0, a1, 8 - - addi.d a1, a1, 8 - addi.d a0, a0, 8 - sub.d t1, t0, t5 - andn t2, t6, t0 - - and t3, t1, t2 - beqz t3, L(al_loop) - -L(al_end): - ctz.d t1, t3 - srli.d t1, t1, 3 - addi.d t1, t1, 1 # add 1, since '\0' needs to be copied to dest - - andi a3, t1, 8 - andi a4, t1, 4 - andi a5, t1, 2 - andi a6, t1, 1 - -L(al_end_8): - beqz a3, L(al_end_4) - st.d t0, a0, 0 - addi.d a0, a0, 7 - jr ra -L(al_end_4): - beqz a4, L(al_end_2) - st.w t0, a0, 0 - addi.d a0, a0, 4 - srli.d t0, t0, 32 -L(al_end_2): - beqz a5, L(al_end_1) - st.h t0, a0, 0 - addi.d a0, a0, 2 - srli.d t0, t0, 16 -L(al_end_1): - beqz a6, L(al_out) - st.b t0, a0, 0 - addi.d a0, a0, 1 -L(al_out): - addi.d a0, a0, -1 - jr ra - -L(unalign): - slli.d a5, a4, 3 - li.d t1, -1 - sub.d a6, zero, a5 - - srl.d a7, t0, a5 - sll.d t7, t1, a6 - - or t0, a7, t7 - sub.d t1, t0, t5 - andn t2, t6, t0 - and t3, t1, t2 - - bnez t3, L(un_end) - - ld.d t4, a1, 8 - addi.d a1, a1, 8 - - sub.d t1, t4, t5 - andn t2, t6, t4 - sll.d t0, t4, a6 - and t3, t1, t2 - - or t0, t0, a7 - bnez t3, L(un_end_with_remaining) - -L(un_loop): - srl.d a7, t4, a5 - - ld.d t4, a1, 8 - addi.d a1, a1, 8 - - st.d t0, a0, 0 - addi.d a0, a0, 8 - - sub.d t1, t4, t5 - andn t2, t6, t4 - sll.d t0, t4, a6 - and t3, t1, t2 - - or t0, t0, a7 - beqz t3, L(un_loop) - -L(un_end_with_remaining): - ctz.d t1, t3 - srli.d t1, t1, 3 - addi.d t1, t1, 1 - sub.d t1, t1, a4 - - blt t1, zero, L(un_end_less_8) - st.d t0, a0, 0 - addi.d a0, a0, 8 - beqz t1, L(un_out) - srl.d t0, t4, a5 # get the remaining part - b L(un_end_less_8) - -L(un_end): - ctz.d t1, t3 - srli.d t1, t1, 3 - addi.d t1, t1, 1 - -L(un_end_less_8): - andi a4, t1, 4 - andi a5, t1, 2 - andi a6, t1, 1 -L(un_end_4): - beqz a4, L(un_end_2) - st.w t0, a0, 0 - addi.d a0, a0, 4 - srli.d t0, t0, 32 -L(un_end_2): - beqz a5, L(un_end_1) - st.h t0, a0, 0 - addi.d a0, a0, 2 - srli.d t0, t0, 16 -L(un_end_1): - beqz a6, L(un_out) - st.b t0, a0, 0 - addi.d a0, a0, 1 -L(un_out): - addi.d a0, a0, -1 - jr ra - -END(STPCPY_NAME) - -#ifdef _LIBC -weak_alias (STPCPY_NAME, stpcpy) -libc_hidden_builtin_def (STPCPY_NAME) -#endif diff --git a/sysdeps/loongarch/lp64/strchr.S b/sysdeps/loongarch/lp64/strchr.S deleted file mode 100644 index fde53a30..00000000 --- a/sysdeps/loongarch/lp64/strchr.S +++ /dev/null @@ -1,89 +0,0 @@ -#ifdef _LIBC -#include -#include -#include -#else -#include -#include -#endif - -#ifndef STRCHR_NAME -#define STRCHR_NAME strchr -#endif - -/* char * strchr (const char *s1, int c); */ - -LEAF(STRCHR_NAME, 6) - slli.d t1, a0, 3 - bstrins.d a0, zero, 2, 0 - lu12i.w a2, 0x01010 - ld.d t2, a0, 0 - - ori a2, a2, 0x101 - andi a1, a1, 0xff - bstrins.d a2, a2, 63, 32 - li.w t0, -1 - - mul.d a1, a1, a2 # "cccccccc" - sll.d t0, t0, t1 - slli.d a3, a2, 7 # 
0x8080808080808080 - orn t2, t2, t0 - - sll.d t3, a1, t1 - xor t4, t2, t3 - sub.d a7, t2, a2 - andn a6, a3, t2 - - - sub.d a5, t4, a2 - andn a4, a3, t4 - and a6, a7, a6 - and a5, a5, a4 - - or t0, a6, a5 - bnez t0, L(_mc8_a) - addi.d a0, a0, 8 -L(_aloop): - ld.d t4, a0, 0 - - xor t2, t4, a1 - sub.d a7, t4, a2 - andn a6, a3, t4 - sub.d a5, t2, a2 - - andn a4, a3, t2 - and a6, a7, a6 - and a5, a5, a4 - or a7, a6, a5 - - - bnez a7, L(_mc8_a) - ld.d t4, a0, 8 - addi.d a0, a0, 16 - xor t2, t4, a1 - - sub.d a7, t4, a2 - andn a6, a3, t4 - sub.d a5, t2, a2 - andn a4, a3, t2 - - and a6, a7, a6 - and a5, a5, a4 - or a7, a6, a5 - beqz a7, L(_aloop) - - addi.d a0, a0, -8 - -L(_mc8_a): - ctz.d t0, a5 - ctz.d t2, a6 - srli.w t0, t0, 3 - - - srli.w t2, t2, 3 - sltu t1, t2, t0 - add.d a0, a0, t0 - masknez a0, a0, t1 - - jr ra -END(STRCHR_NAME) diff --git a/sysdeps/loongarch/lp64/strchrnul.S b/sysdeps/loongarch/lp64/strchrnul.S deleted file mode 100644 index a5ee09a3..00000000 --- a/sysdeps/loongarch/lp64/strchrnul.S +++ /dev/null @@ -1,94 +0,0 @@ -#ifdef _LIBC -#include -#include -#include -#else -#include -#include -#endif - -#ifndef STRCHRNUL_NAME -#define STRCHRNUL_NAME __strchrnul -#endif - -/* char * strchrnul (const char *s1, int c); */ - -LEAF(STRCHRNUL_NAME, 6) - slli.d t1, a0, 3 - bstrins.d a0, zero, 2, 0 - lu12i.w a2, 0x01010 - ld.d t2, a0, 0 - - ori a2, a2, 0x101 - andi a1, a1, 0xff - bstrins.d a2, a2, 63, 32 - li.w t0, -1 - - mul.d a1, a1, a2 # "cccccccc" - sll.d t0, t0, t1 - slli.d a3, a2, 7 # 0x8080808080808080 - orn t2, t2, t0 - - sll.d t3, a1, t1 - xor t4, t2, t3 - sub.d a7, t2, a2 - andn a6, a3, t2 - - - sub.d a5, t4, a2 - andn a4, a3, t4 - and a6, a7, a6 - and a5, a5, a4 - - or t0, a6, a5 - bnez t0, L(_mc8_a) - addi.d a0, a0, 8 -L(_aloop): - ld.d t4, a0, 0 - - xor t2, t4, a1 - sub.d a7, t4, a2 - andn a6, a3, t4 - sub.d a5, t2, a2 - - andn a4, a3, t2 - and a6, a7, a6 - and a5, a5, a4 - or a7, a6, a5 - - - bnez a7, L(_mc8_a) - ld.d t4, a0, 8 - addi.d a0, a0, 16 - xor t2, t4, a1 - - sub.d a7, t4, a2 - andn a6, a3, t4 - sub.d a5, t2, a2 - andn a4, a3, t2 - - and a6, a7, a6 - and a5, a5, a4 - or a7, a6, a5 - beqz a7, L(_aloop) - - addi.d a0, a0, -8 -L(_mc8_a): - ctz.d t0, a5 - ctz.d t2, a6 - srli.w t0, t0, 3 - - srli.w t2, t2, 3 - slt t1, t0, t2 - masknez t3, t2, t1 - maskeqz t4, t0, t1 - - or t0, t3, t4 - add.d a0, a0, t0 - jr ra -END(STRCHRNUL_NAME) - -#ifdef _LIBC -weak_alias(STRCHRNUL_NAME, strchrnul) -libc_hidden_builtin_def (STRCHRNUL_NAME) -#endif diff --git a/sysdeps/loongarch/lp64/strcmp.S b/sysdeps/loongarch/lp64/strcmp.S deleted file mode 100644 index 3a863992..00000000 --- a/sysdeps/loongarch/lp64/strcmp.S +++ /dev/null @@ -1,227 +0,0 @@ -/* 2022\06\15 loongarch64 author: chenxiaolong. 
*/ - -#ifdef _LIBC -#include -#include -#include -#else -#include -#include -#endif - -#ifndef STRCMP_NAME -#define STRCMP_NAME strcmp -#endif - -/* int strcmp (const char *s1, const char *s2); */ - -/* Parameters and Results */ -#define src1 a0 -#define src2 a1 -#define result v0 -LEAF(STRCMP_NAME, 6) - xor a4, src1, src2 - lu12i.w t5, 0x01010 - lu12i.w t6, 0x7f7f7 - andi a2, src1, 0x7 - - ori t5, t5, 0x101 - andi a4, a4, 0x7 - ori t6, t6, 0xf7f - bstrins.d t5, t5, 63, 32 - bstrins.d t6, t6, 63, 32 - - bnez a4, 3f // unaligned - beqz a2, 1f // loop aligned - -// mutual aligned - bstrins.d src1, zero, 2, 0 - bstrins.d src2, zero, 2, 0 - slli.d a4, a2, 0x3 - ld.d t0, src1, 0 - - sub.d a4, zero, a4 - ld.d t1, src2, 0 - addi.d src1, src1, 8 - addi.d src2, src2, 8 - - nor a5, zero, zero - srl.d a5, a5, a4 - or t0, t0, a5 - - or t1, t1, a5 - b 2f //start realigned - -// loop aligned -1: - ld.d t0, src1, 0 - addi.d src1, src1, 8 - ld.d t1, src2, 0 - addi.d src2, src2, 8 - -// start realigned: -2: - sub.d t2, t0, t5 - nor t3, t0, t6 - and t2, t2, t3 - - xor t3, t0, t1 - or t2, t2, t3 - beqz t2, 1b - - ctz.d t7, t2 - bstrins.d t7, zero, 2, 0 - srl.d t0, t0, t7 - srl.d t1, t1, t7 - - andi t0, t0, 0xff - andi t1, t1, 0xff - sub.d v0, t0, t1 - jr ra - -// unaligned -3: - andi a3, src2, 0x7 - slt a5, a2, a3 - masknez t8, a2, a5 - xor a6, src1, src2 - maskeqz a6, a6, t8 - xor src1, src1, a6 - xor src2, src2, a6 - - andi a2, src1, 0x7 - beqz a2, 4f // src1 is aligned - -//strcmp_unaligned: - andi a3, src2, 0x7 - bstrins.d src1, zero, 2, 0 - bstrins.d src2, zero, 2, 0 - nor t3, zero, zero - - ld.d t0, src1, 0 - ld.d t1, src2, 0 - sub.d a2, a3, a2 - addi.d t2, zero, 8 - - sub.d a5, t2, a2 - sub.d a6, t2, a3 - slli.d a5, a5, 0x3 - slli.d a6, a6, 0x3 - - srl.d t4, t3, a6 - srl.d a4, t3, a5 - rotr.d a7, t0, a5 - - addi.d src2, src2, 8 - addi.d src1, src1, 8 - or t1, t1, t4 - or t0, a7, t4 - - sub.d t2, t0, t5 - nor t3, t0, t6 - and t2, t2, t3 - xor t3, t0, t1 - or t2, t2, t3 - bnez t2, 7f - - and a7, a7, a4 - slli.d a6, a2, 0x3 - nor a4, zero, a4 - b 5f - -// src1 is aligned -4: - andi a3, src2, 0x7 - ld.d t0, src1, 0 - - bstrins.d src2, zero, 2, 0 - nor t2, zero, zero - ld.d t1, src2, 0 - - addi.d t3, zero, 0x8 - sub.d a5, t3, a3 - slli.d a5, a5, 0x3 - srl.d a4, t2, a5 - rotr.d t4, t0, a5 - - addi.d src2, src2, 8 - addi.d src1, src1, 8 - or t1, t1, a4 - or t0, t4, a4 - - sub.d t2, t0, t5 - nor t3, t0, t6 - and t2, t2, t3 - xor t3, t0, t1 - or t2, t2, t3 - - bnez t2, 7f - - and a7, t4, a4 - slli.d a6, a3, 0x3 - nor a4, zero, a4 - -// unaligned loop -// a7: remaining number -// a6: shift left number -// a5: shift right number -// a4: mask for checking remaining number -5: - or t0, a7, a4 - sub.d t2, t0, t5 - nor t3, t0, t6 - and t2, t2, t3 - bnez t2, 6f - - ld.d t0, src1, 0 - addi.d src1, src1, 8 - ld.d t1, src2, 0 - addi.d src2, src2, 8 - - srl.d t7, t0, a5 - sll.d t0, t0, a6 - or t0, a7, t0 - - sub.d t2, t0, t5 - nor t3, t0, t6 - and t2, t2, t3 - xor t3, t0, t1 - or t2, t2, t3 - bnez t2, 7f - - or a7, t7, zero - b 5b - -6: - ld.bu t1, src2, 0 - andi t0, a7, 0xff - xor t2, t0, t1 - srli.d a7, a7, 0x8 - masknez t2, t0, t2 - addi.d src2, src2, 1 - beqz t2, 8f - b 6b - -7: - ctz.d t7, t2 - bstrins.d t7, zero, 2, 0 - srl.d t0, t0, t7 - srl.d t1, t1, t7 - - andi t0, t0, 0xff - andi t1, t1, 0xff - -8: - sub.d a4, t0, t1 - sub.d a5, t1, t0 - maskeqz a6, a5, t8 - masknez result, a4, t8 - or result, result, a6 - jr ra - -END(STRCMP_NAME) - -#ifdef _LIBC -libc_hidden_builtin_def (STRCMP_NAME) -#endif - diff --git 
a/sysdeps/loongarch/lp64/strcpy.S b/sysdeps/loongarch/lp64/strcpy.S deleted file mode 100644 index 08505192..00000000 --- a/sysdeps/loongarch/lp64/strcpy.S +++ /dev/null @@ -1,173 +0,0 @@ -#ifdef _LIBC -#include -#include -#include -#else -#include -#include -#endif - -#ifndef STRCPY -#define STRCPY strcpy -#endif - -LEAF(STRCPY, 6) - andi a3, a0, 0x7 - move a2, a0 - beqz a3, L(dest_align) - sub.d a5, a1, a3 - addi.d a5, a5, 8 - -L(make_dest_align): - ld.b t0, a1, 0 - addi.d a1, a1, 1 - st.b t0, a2, 0 - beqz t0, L(al_out) - - addi.d a2, a2, 1 - bne a1, a5, L(make_dest_align) - -L(dest_align): - andi a4, a1, 7 - bstrins.d a1, zero, 2, 0 - - lu12i.w t5, 0x1010 - ld.d t0, a1, 0 - ori t5, t5, 0x101 - bstrins.d t5, t5, 63, 32 - - slli.d t6, t5, 0x7 - bnez a4, L(unalign) - sub.d t1, t0, t5 - andn t2, t6, t0 - - and t3, t1, t2 - bnez t3, L(al_end) - -L(al_loop): - st.d t0, a2, 0 - ld.d t0, a1, 8 - - addi.d a1, a1, 8 - addi.d a2, a2, 8 - sub.d t1, t0, t5 - andn t2, t6, t0 - - and t3, t1, t2 - beqz t3, L(al_loop) - -L(al_end): - ctz.d t1, t3 - srli.d t1, t1, 3 - addi.d t1, t1, 1 # add 1, since '\0' needs to be copied to dest - - andi a3, t1, 8 - andi a4, t1, 4 - andi a5, t1, 2 - andi a6, t1, 1 - -L(al_end_8): - beqz a3, L(al_end_4) - st.d t0, a2, 0 - jr ra -L(al_end_4): - beqz a4, L(al_end_2) - st.w t0, a2, 0 - addi.d a2, a2, 4 - srli.d t0, t0, 32 -L(al_end_2): - beqz a5, L(al_end_1) - st.h t0, a2, 0 - addi.d a2, a2, 2 - srli.d t0, t0, 16 -L(al_end_1): - beqz a6, L(al_out) - st.b t0, a2, 0 -L(al_out): - jr ra - -L(unalign): - slli.d a5, a4, 3 - li.d t1, -1 - sub.d a6, zero, a5 - - srl.d a7, t0, a5 - sll.d t7, t1, a6 - - or t0, a7, t7 - sub.d t1, t0, t5 - andn t2, t6, t0 - and t3, t1, t2 - - bnez t3, L(un_end) - - ld.d t4, a1, 8 - - sub.d t1, t4, t5 - andn t2, t6, t4 - sll.d t0, t4, a6 - and t3, t1, t2 - - or t0, t0, a7 - bnez t3, L(un_end_with_remaining) - -L(un_loop): - srl.d a7, t4, a5 - - ld.d t4, a1, 16 - addi.d a1, a1, 8 - - st.d t0, a2, 0 - addi.d a2, a2, 8 - - sub.d t1, t4, t5 - andn t2, t6, t4 - sll.d t0, t4, a6 - and t3, t1, t2 - - or t0, t0, a7 - beqz t3, L(un_loop) - -L(un_end_with_remaining): - ctz.d t1, t3 - srli.d t1, t1, 3 - addi.d t1, t1, 1 - sub.d t1, t1, a4 - - blt t1, zero, L(un_end_less_8) - st.d t0, a2, 0 - addi.d a2, a2, 8 - beqz t1, L(un_out) - srl.d t0, t4, a5 # get the remaining part - b L(un_end_less_8) - -L(un_end): - ctz.d t1, t3 - srli.d t1, t1, 3 - addi.d t1, t1, 1 - -L(un_end_less_8): - andi a4, t1, 4 - andi a5, t1, 2 - andi a6, t1, 1 -L(un_end_4): - beqz a4, L(un_end_2) - st.w t0, a2, 0 - addi.d a2, a2, 4 - srli.d t0, t0, 32 -L(un_end_2): - beqz a5, L(un_end_1) - st.h t0, a2, 0 - addi.d a2, a2, 2 - srli.d t0, t0, 16 -L(un_end_1): - beqz a6, L(un_out) - st.b t0, a2, 0 -L(un_out): - jr ra - -END(STRCPY) - -#ifdef _LIBC -libc_hidden_builtin_def (STRCPY) -#endif diff --git a/sysdeps/loongarch/lp64/strlen.S b/sysdeps/loongarch/lp64/strlen.S deleted file mode 100644 index 71431ce2..00000000 --- a/sysdeps/loongarch/lp64/strlen.S +++ /dev/null @@ -1,85 +0,0 @@ -#ifdef _LIBC -#include -#include -#include -#else -#include -#include -#endif - -#ifndef STRLEN -#define STRLEN strlen -#endif - -LEAF(STRLEN, 6) - move a1, a0 - bstrins.d a0, zero, 2, 0 - lu12i.w a2, 0x01010 - li.w t0, -1 - - ld.d t2, a0, 0 - andi t1, a1, 0x7 - ori a2, a2, 0x101 - slli.d t1, t1, 3 - - bstrins.d a2, a2, 63, 32 - sll.d t1, t0, t1 - slli.d t3, a2, 7 - nor a3, zero, t3 - - orn t2, t2, t1 - sub.d t0, t2, a2 - nor t1, t2, a3 - and t0, t0, t1 - - - bnez t0, L(count_pos) - addi.d a0, a0, 8 -L(loop_16_7bit): 
- ld.d t2, a0, 0 - sub.d t1, t2, a2 - - and t0, t1, t3 - bnez t0, L(more_check) - ld.d t2, a0, 8 - addi.d a0, a0, 16 - - sub.d t1, t2, a2 - and t0, t1, t3 - beqz t0, L(loop_16_7bit) - addi.d a0, a0, -8 -L(more_check): - nor t0, t2, a3 - - and t0, t1, t0 - bnez t0, L(count_pos) - addi.d a0, a0, 8 -L(loop_16_8bit): - ld.d t2, a0, 0 - - sub.d t1, t2, a2 - nor t0, t2, a3 - and t0, t0, t1 - bnez t0, L(count_pos) - - ld.d t2, a0, 8 - addi.d a0, a0, 16 - sub.d t1, t2, a2 - nor t0, t2, a3 - - and t0, t0, t1 - beqz t0, L(loop_16_8bit) - addi.d a0, a0, -8 -L(count_pos): - ctz.d t1, t0 - sub.d a0, a0, a1 - - srli.d t1, t1, 3 - add.d a0, a0, t1 - jr ra - -END(STRLEN) - -#ifdef _LIBC -libc_hidden_builtin_def (STRLEN) -#endif diff --git a/sysdeps/loongarch/lp64/strncmp.S b/sysdeps/loongarch/lp64/strncmp.S deleted file mode 100644 index 55450e55..00000000 --- a/sysdeps/loongarch/lp64/strncmp.S +++ /dev/null @@ -1,256 +0,0 @@ -#ifdef _LIBC -#include -#include -#include -#else -#include -#include -#endif - -#ifndef STRNCMP -#define STRNCMP strncmp -#endif - -/* int strncmp (const char *s1, const char *s2); */ - -LEAF(STRNCMP, 6) - beqz a2, L(ret0) - xor a4, a0, a1 - lu12i.w t5, 0x01010 - lu12i.w t6, 0x7f7f7 - - andi a3, a0, 0x7 - ori t5, t5, 0x101 - andi a4, a4, 0x7 - ori t6, t6, 0xf7f - - bstrins.d t5, t5, 63, 32 - bstrins.d t6, t6, 63, 32 - - bnez a4, L(unalign) - bnez a3, L(mutual_align) - -L(a_loop): - ld.d t0, a0, 0 - ld.d t1, a1, 0 - addi.d a0, a0, 8 - addi.d a1, a1, 8 - - - sltui t7, a2, 9 - -L(start_realign): - sub.d t2, t0, t5 - nor t3, t0, t6 - xor t4, t0, t1 - - and t2, t2, t3 - addi.d a2, a2, -8 - - or t2, t2, t4 - or t3, t2, t7 - beqz t3, L(a_loop) - -L(end): - bge zero, t7, L(out) - andi t4, a2, 7 - li.d t3, -1 - addi.d t4, t4, -1 - slli.d t4, t4, 3 - sll.d t3, t3, t4 - or t2, t2, t3 - - -L(out): - ctz.d t3, t2 - bstrins.d t3, zero, 2, 0 - srl.d t0, t0, t3 - srl.d t1, t1, t3 - - andi t0, t0, 0xff - andi t1, t1, 0xff - sub.d a0, t0, t1 - jr ra - -L(mutual_align): - bstrins.d a0, zero, 2, 0 - bstrins.d a1, zero, 2, 0 - slli.d a5, a3, 0x3 - li.d t2, -1 - - ld.d t0, a0, 0 - ld.d t1, a1, 0 - - li.d t3, 9 - sll.d t2, t2, a5 - - sub.d t3, t3, a3 - addi.d a0, a0, 8 - - sltu t7, a2, t3 - addi.d a1, a1, 8 - - add.d a2, a2, a3 - orn t0, t0, t2 - orn t1, t1, t2 - b L(start_realign) - -L(ret0): - move a0, zero - jr ra - -L(unalign): - li.d t8, 8 - blt a2, t8, L(short_cmp) - - # swap a0 and a1 in case a3 > a4 - andi a4, a1, 0x7 - sltu t8, a4, a3 - xor a6, a0, a1 - maskeqz a6, a6, t8 - xor a0, a0, a6 - xor a1, a1, a6 - - andi a3, a0, 0x7 - andi a4, a1, 0x7 - - bstrins.d a0, zero, 2, 0 - bstrins.d a1, zero, 2, 0 - - li.d t2, -1 - li.d t3, 9 - - ld.d t0, a0, 0 - ld.d t1, a1, 0 - - sub.d t3, t3, a4 - sub.d a3, a4, a3 - - slli.d t4, a4, 3 - slli.d a6, a3, 3 - - sub.d a5, zero, a6 - sltu t7, a2, t3 - - rotr.d a7, t0, a5 - sll.d t4, t2, t4 # mask for first num - - add.d a2, a2, a4 - sll.d a4, t2, a6 # mask for a7 - - orn t0, a7, t4 - orn t1, t1, t4 - - sub.d t2, t0, t5 - nor t4, t0, t6 - and t2, t2, t4 - - xor t3, t0, t1 - or t2, t2, t3 - - or t3, t2, t7 - bnez t3, L(un_end) - - andn a7, a7, a4 - addi.d a3, a3, 1 - -L(un_loop): - addi.d a2, a2, -8 - # in case remaining part has '\0', no more load instructions should be executed on a0 address - or t0, a7, a4 - sltu t7, a2, a3 - - sub.d t2, t0, t5 - nor t3, t0, t6 - and t2, t2, t3 - - or t3, t2, t7 - bnez t3, L(check_remaining) - - ld.d t7, a0, 8 - ld.d t1, a1, 8 - addi.d a0, a0, 8 - addi.d a1, a1, 8 - - sll.d t4, t7, a6 - sub.d t2, t1, t5 - nor t3, t1, t6 - - or 
t0, t4, a7 - srl.d a7, t7, a5 - - and t2, t2, t3 - xor t3, t0, t1 - - sltui t7, a2, 9 - or t2, t2, t3 - - or t3, t2, t7 - beqz t3, L(un_loop) - b L(un_end) - -L(check_remaining): - ld.d t1, a1, 8 - xor t3, t1, a7 - or t2, t2, t3 - -L(un_end): - bge zero, t7, L(un_out) - andi t4, a2, 7 - li.d t3, -1 - - addi.d t4, t4, -1 - slli.d t4, t4, 3 - sll.d t3, t3, t4 - or t2, t2, t3 - -L(un_out): - ctz.d t3, t2 - bstrins.d t3, zero, 2, 0 - srl.d t0, t0, t3 - srl.d t1, t1, t3 - - andi t0, t0, 0xff - andi t1, t1, 0xff - - sub.d a4, t0, t1 - sub.d a5, t1, t0 - - maskeqz a6, a5, t8 - masknez a0, a4, t8 - - or a0, a0, a6 - jr ra - -L(short_cmp): - ld.bu t0, a0, 0 - ld.bu t1, a1, 0 - addi.d a2, a2, -1 - - xor t2, t0, t1 - masknez t2, t0, t2 - maskeqz t2, a2, t2 - - beqz t2, L(short_out) - - ld.bu t0, a0, 1 - ld.bu t1, a1, 1 - - addi.d a2, a2, -1 - addi.d a0, a0, 2 - - addi.d a1, a1, 2 - xor t2, t0, t1 - masknez t2, t0, t2 - maskeqz t2, a2, t2 - - bnez t2, L(short_cmp) - -L(short_out): - sub.d a0, t0, t1 - jr ra - -END(STRNCMP) -#ifdef _LIBC -libc_hidden_builtin_def (STRNCMP) -#endif diff --git a/sysdeps/loongarch/lp64/strnlen.S b/sysdeps/loongarch/lp64/strnlen.S deleted file mode 100644 index 5b5ab585..00000000 --- a/sysdeps/loongarch/lp64/strnlen.S +++ /dev/null @@ -1,82 +0,0 @@ -#ifdef _LIBC -#include -#include -#include -#else -#include -#include -#endif - -#ifndef STRNLEN -#define STRNLEN __strnlen -#endif - -#. before every load, a1(t5) must > 0; -#. first load with t1 != 0, need to adjust t5; -#. return the less one of both strlen(s) and a1; - -LEAF(STRNLEN, 6) - beqz a1, L(out) - lu12i.w a2, 0x01010 - andi t1, a0, 0x7 - move t4, a0 - - bstrins.d a0, zero, 2, 0 - ori a2, a2, 0x101 - li.w t0, -1 - ld.d t2, a0, 0 - - slli.d t3, t1, 3 - bstrins.d a2, a2, 63, 32 - li.w t5, 8 - slli.d a3, a2, 7 - - sub.w t1, t5, t1 - sll.d t0, t0, t3 - nor a3, zero, a3 - orn t2, t2, t0 - - - sub.d t0, t2, a2 - nor t3, t2, a3 - and t0, t0, t3 - bnez t0, L(count_pos) - - sub.d t5, a1, t1 - bgeu t1, a1, L(out) -L(loop_8bytes): - ld.d t2, a0, 8 - addi.d a0, a0, 8 - - sub.d t0, t2, a2 - nor t1, t2, a3 - sltui t6, t5, 9 - and t0, t0, t1 - - addi.d t5, t5, -8 - or t7, t0, t6 - beqz t7, L(loop_8bytes) -L(count_pos): - ctz.d t1, t0 - - - sub.d a0, a0, t4 - srli.d t1, t1, 3 - add.d a0, t1, a0 - sltu t0, a0, a1 - - masknez t1, a1, t0 - maskeqz a0, a0, t0 - or a0, a0, t1 - jr ra - -L(out): - move a0, a1 - jr ra - -END(STRNLEN) - -#ifdef _LIBC -weak_alias (STRNLEN, strnlen) -libc_hidden_builtin_def (STRNLEN) -#endif diff --git a/sysdeps/loongarch/lp64/strrchr.S b/sysdeps/loongarch/lp64/strrchr.S deleted file mode 100644 index df7fcb6b..00000000 --- a/sysdeps/loongarch/lp64/strrchr.S +++ /dev/null @@ -1,105 +0,0 @@ -#ifdef _LIBC -#include -#include -#include -#else -#include -#include -#endif - -#ifndef STRRCHR_NAME -#define STRRCHR_NAME strrchr -#endif - -LEAF(STRRCHR_NAME, 6) - slli.d t1, a0, 3 - bstrins.d a0, zero, 2, 0 - lu12i.w a2, 0x01010 - ld.d t2, a0, 0 // t2 = "5ZZ21abc" - - ori a2, a2, 0x101 - andi a1, a1, 0xff // a1 = "0000000Z" - li.d a5, -1 - bstrins.d a2, a2, 63, 32 // a2 = 0x0101010101010101 - - sll.d t1, a5, t1 // t1 = 0xffffffffff000000 - mul.d a1, a1, a2 // a1 = "ZZZZZZZZ" - orn t2, t2, t1 // t2 = "5ZZ21YYY" - slli.d a3, a2, 7 // a3 = 0x8080808080808080 - - sub.d a4, t2, a2 - andn t0, a3, t2 - move t3, zero - and t0, a4, t0 - - - xor a4, t2, a1 - move t5, zero - orn a4, a4, t1 - bnez t0, L(found_end) - - sub.d t1, a4, a2 - andn t0, a3, a4 - and t1, t1, t0 - -L(loop_8bytes): - masknez t4, t3, t1 - - maskeqz t3, t2, t1 
- ld.d t2, a0, 8 - masknez t0, t5, t1 - maskeqz t5, a0, t1 - - or t3, t3, t4 - or t5, t0, t5 - sub.d t0, t2, a2 - andn t1, a3, t2 - - - xor a4, t2, a1 - and t0, t0, t1 //t0 hold diff pattern for '\0' - sub.d t1, a4, a2 - andn t4, a3, a4 - - and t1, t1, t4 //t1 hold diff pattern for 'a1' - addi.d a0, a0, 8 - beqz t0, L(loop_8bytes) //ok, neither \0 nor found -L(found_end): - ctz.d t1, t0 - - xor t3, t3, a1 - orn t1, zero, t1 - revb.d t3, t3 - srl.d t1, a5, t1 // mask for '\0' - - sub.d t4, t3, a2 - orn a4, a4, t1 - andn t3, a3, t3 - revb.d t2, a4 - - sub.d t0, t2, a2 - andn t1, a3, t2 - and t3, t3, t4 - and t1, t0, t1 - - li.d t7, 7 - masknez t4, t3, t1 - maskeqz t3, t1, t1 - masknez t5, t5, t1 - - or t3, t3, t4 - maskeqz t6, a0, t1 - ctz.d t0, t3 - or t5, t6, t5 - - srli.d t0, t0, 3 - sub.d t0, t7, t0 - add.d a0, t5, t0 - maskeqz a0, a0, t3 - - jr ra -END(STRRCHR_NAME) - -#ifdef _LIBC -libc_hidden_builtin_def(STRRCHR_NAME) -#endif -- 2.33.0
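
Reviewer note (not part of the patch): every routine above relies on the same
word-at-a-time NUL detection. An aligned 8-byte word is loaded and the value
(word - 0x0101010101010101) & ~word & 0x8080808080808080 is computed; it is
non-zero exactly when the word contains a zero byte, which is what the
recurring sub.d/andn/and triples against the 0x01...01 and 0x80...80 constants
(t5/t6 in the copy routines, a2/a3 in strlen) implement. Below is a minimal C
sketch of that idiom for reference only; the names has_zero_byte and
aligned_strlen_sketch are illustrative, and it assumes a little-endian 64-bit
target such as LoongArch lp64.

    #include <stdint.h>
    #include <stddef.h>

    /* Non-zero iff V contains a zero byte; on a little-endian target the
       lowest set bit of the result falls inside the first zero byte.  The
       two constants play the role of t5/t6 (a2/a3) in the assembly.  */
    static inline uint64_t
    has_zero_byte (uint64_t v)
    {
      const uint64_t ones  = 0x0101010101010101ULL;
      const uint64_t highs = 0x8080808080808080ULL;
      return (v - ones) & ~v & highs;
    }

    /* Word-at-a-time strlen over an 8-byte-aligned string, mirroring the
       shape of the aligned loops (e.g. L(al_loop) in stpcpy, L(count_pos)
       in strlen): scan whole words, then convert the bit position of the
       match to a byte index with ctz/8.  */
    static size_t
    aligned_strlen_sketch (const char *s)
    {
      const uint64_t *p = (const uint64_t *) s;  /* assumes 8-byte alignment */
      uint64_t m;

      while ((m = has_zero_byte (*p)) == 0)
        p++;

      return (size_t) ((const char *) p - s) + (__builtin_ctzll (m) >> 3);
    }

The real routines additionally handle a misaligned start by OR-ing 0xff into
the bytes that precede the string (the sll.d/orn sequences on the first load),
so the initial aligned word can never report a spurious NUL before the buffer
begins.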