From 4879bd4e0aff7d884d9b026b6081a0e8cffc491c Mon Sep 17 00:00:00 2001 From: caiyinyu Date: Wed, 21 Jun 2023 09:30:54 +0800 Subject: [PATCH 06/14] glibc-2.28: Refactor code of {raw,}mem* functions. Change-Id: Icafaf6bc8216f48be64cf25a40b9fe28ce127914 Signed-off-by: ticat_fp --- sysdeps/loongarch/lp64/memchr.S | 92 -- sysdeps/loongarch/lp64/memcmp.S | 280 ------ sysdeps/loongarch/lp64/memcpy.S | 804 ------------------ sysdeps/loongarch/lp64/memmove.S | 2 - sysdeps/loongarch/lp64/memset.S | 166 ---- .../loongarch/lp64/multiarch/memchr-aligned.S | 91 +- .../loongarch/lp64/multiarch/memcmp-aligned.S | 282 +++++- .../loongarch/lp64/multiarch/memcpy-aligned.S | 799 ++++++++++++++++- .../loongarch/lp64/multiarch/memset-aligned.S | 166 +++- .../lp64/multiarch/rawmemchr-aligned.S | 110 ++- sysdeps/loongarch/lp64/rawmemchr.S | 113 --- 11 files changed, 1438 insertions(+), 1467 deletions(-) delete mode 100644 sysdeps/loongarch/lp64/memchr.S delete mode 100644 sysdeps/loongarch/lp64/memcmp.S delete mode 100644 sysdeps/loongarch/lp64/memcpy.S delete mode 100644 sysdeps/loongarch/lp64/memmove.S delete mode 100644 sysdeps/loongarch/lp64/memset.S delete mode 100644 sysdeps/loongarch/lp64/rawmemchr.S diff --git a/sysdeps/loongarch/lp64/memchr.S b/sysdeps/loongarch/lp64/memchr.S deleted file mode 100644 index 23f1fd13..00000000 --- a/sysdeps/loongarch/lp64/memchr.S +++ /dev/null @@ -1,92 +0,0 @@ -#ifdef _LIBC -#include -#include -#include -#else -#include -#include -#endif - -#ifndef MEMCHR_NAME -#define MEMCHR_NAME memchr -#endif - -LEAF(MEMCHR_NAME, 6) - beqz a2, L(out) - andi t1, a0, 0x7 - lu12i.w a3, 0x01010 - sub.d a5, a0, t1 - - bstrins.d a1, a1, 15, 8 - ld.d t0, a5, 0 - slli.d t2, t1, 3 - ori a3, a3, 0x101 - - bstrins.d a1, a1, 31, 16 - li.w t7, -1 - li.w t8, 9 - bstrins.d a3, a3, 63, 32 - - srl.d t3, t7, t2 - bstrins.d a1, a1, 63, 32 - sub.d t4, t8, t1 - orn t3, a1, t3 - - srl.d t0, t0, t2 - slli.d a4, a3, 7 # 0x8080808080808080 - sltu t4, a2, t4 - xor t2, t0, t3 - - sub.d 
a6, t2, a3 - andn a7, a4, t2 - and t2, a6, a7 - or t3, t2, t4 - - bnez t3, L(count_pos) - addi.d a2, a2, -8 - addi.d a0, a5, 8 - add.d a2, a2, t1 - -L(loop): - ld.d t0, a0, 0 - sltui t4, a2, 9 - xor t2, t0, a1 - sub.d a6, t2, a3 - - andn a7, a4, t2 - and t2, a6, a7 - or t3, t2, t4 - bnez t3, L(count_pos) - - ld.d t1, a0, 8 - addi.d a0, a0, 16 - sltui t4, a2, 17 - xor t2, t1, a1 - - sub.d a6, t2, a3 - andn a7, a4, t2 - and t2, a6, a7 - addi.d a2, a2, -16 - - or t3, t2, t4 - beqz t3, L(loop) - addi.d a0, a0, -8 - addi.d a2, a2, 8 - -L(count_pos): - ctz.d t0, t2 - srli.d t0, t0, 3 - sltu t1, t0, a2 - add.d a0, a0, t0 - - maskeqz a0, a0, t1 - jr ra - -L(out): - move a0, zero - jr ra -END(MEMCHR_NAME) - -#ifdef _LIBC -libc_hidden_builtin_def (MEMCHR_NAME) -#endif diff --git a/sysdeps/loongarch/lp64/memcmp.S b/sysdeps/loongarch/lp64/memcmp.S deleted file mode 100644 index 457a4dc7..00000000 --- a/sysdeps/loongarch/lp64/memcmp.S +++ /dev/null @@ -1,280 +0,0 @@ -#ifdef _LIBC -#include -#include -#include -#else -#include -#include -#endif - -#ifndef MEMCMP_NAME -#define MEMCMP_NAME memcmp -#endif - -LEAF(MEMCMP_NAME, 6) - beqz a2, L(ret) - andi a4, a1, 0x7 - andi a3, a0, 0x7 - sltu a5, a4, a3 - - xor t0, a0, a1 - li.w t8, 8 - maskeqz t0, t0, a5 - li.w t7, -1 - - xor a0, a0, t0 // a0 hold smaller one - xor a1, a1, t0 // a1 hold larger one - andi a3, a0, 0x7 // a3 hold small offset - andi a4, a1, 0x7 // a4 hold larger offset - - xor a0, a0, a3 - xor a1, a1, a4 - ld.d t2, a0, 0 // t2 = "fedcbaXX" - ld.d t1, a1, 0 // t1 = "54321YYY" - - slli.d t3, a3, 3 - slli.d t4, a4, 3 - sub.d a6, t3, t4 // a6 = 0xfffffffffffffff8 - srl.d t1, t1, t4 // t1 = "00054321" - - srl.d t0, t2, t3 // t0 = "00fedcba" - srl.d t5, t7, t4 // t5 = 0x000000FFFFFFFFFF - sub.d t6, t0, t1 // t6 hold diff - and t6, t6, t5 // t6 = "000xxxxx" - - sub.d t5, t8, a4 // t5 hold margin 8 - 3 = 5 - bnez t6, L(first_out) - bgeu t5, a2, L(ret) - sub.d a2, a2, t5 - - bnez a6, L(unaligned) - blt a2, t8, L(al_less_8bytes) 
- andi t1, a2, 31 - beq t1, a2, L(al_less_32bytes) - - sub.d t2, a2, t1 - add.d a4, a0, t2 - move a2, t1 - -L(al_loop): - ld.d t0, a0, 8 - - ld.d t1, a1, 8 - ld.d t2, a0, 16 - ld.d t3, a1, 16 - ld.d t4, a0, 24 - - ld.d t5, a1, 24 - ld.d t6, a0, 32 - ld.d t7, a1, 32 - addi.d a0, a0, 32 - - addi.d a1, a1, 32 - bne t0, t1, L(out1) - bne t2, t3, L(out2) - bne t4, t5, L(out3) - - bne t6, t7, L(out4) - bne a0, a4, L(al_loop) - -L(al_less_32bytes): - srai.d a4, a2, 4 - beqz a4, L(al_less_16bytes) - - ld.d t0, a0, 8 - ld.d t1, a1, 8 - ld.d t2, a0, 16 - ld.d t3, a1, 16 - - addi.d a0, a0, 16 - addi.d a1, a1, 16 - addi.d a2, a2, -16 - bne t0, t1, L(out1) - - bne t2, t3, L(out2) - -L(al_less_16bytes): - srai.d a4, a2, 3 - beqz a4, L(al_less_8bytes) - ld.d t0, a0, 8 - - ld.d t1, a1, 8 - addi.d a0, a0, 8 - addi.d a1, a1, 8 - addi.d a2, a2, -8 - - bne t0, t1, L(out1) - -L(al_less_8bytes): - beqz a2, L(ret) - ld.d t0, a0, 8 - ld.d t1, a1, 8 - - li.d t7, -1 - slli.d t2, a2, 3 - sll.d t2, t7, t2 - sub.d t3, t0, t1 - - andn t6, t3, t2 - bnez t6, L(count_diff) - -L(ret): - move a0, zero - jr ra - -L(out4): - move t0, t6 - move t1, t7 - sub.d t6, t6, t7 - b L(count_diff) - -L(out3): - move t0, t4 - move t1, t5 - sub.d t6, t4, t5 - b L(count_diff) - -L(out2): - move t0, t2 - move t1, t3 -L(out1): - sub.d t6, t0, t1 - b L(count_diff) - -L(first_out): - slli.d t4, a2, 3 - slt t3, a2, t5 - sll.d t4, t7, t4 - maskeqz t4, t4, t3 - - andn t6, t6, t4 - -L(count_diff): - ctz.d t2, t6 - bstrins.d t2, zero, 2, 0 - srl.d t0, t0, t2 - - srl.d t1, t1, t2 - andi t0, t0, 0xff - andi t1, t1, 0xff - sub.d t2, t0, t1 - - sub.d t3, t1, t0 - masknez t2, t2, a5 - maskeqz t3, t3, a5 - or a0, t2, t3 - - jr ra - -L(unaligned): - sub.d a7, zero, a6 - srl.d t0, t2, a6 - blt a2, t8, L(un_less_8bytes) - - andi t1, a2, 31 - beq t1, a2, L(un_less_32bytes) - sub.d t2, a2, t1 - add.d a4, a0, t2 - - move a2, t1 - -L(un_loop): - ld.d t2, a0, 8 - ld.d t1, a1, 8 - ld.d t4, a0, 16 - - ld.d t3, a1, 16 - ld.d t6, a0, 24 - 
ld.d t5, a1, 24 - ld.d t8, a0, 32 - - ld.d t7, a1, 32 - addi.d a0, a0, 32 - addi.d a1, a1, 32 - sll.d a3, t2, a7 - - or t0, a3, t0 - bne t0, t1, L(out1) - srl.d t0, t2, a6 - sll.d a3, t4, a7 - - or t2, a3, t0 - bne t2, t3, L(out2) - srl.d t0, t4, a6 - sll.d a3, t6, a7 - - or t4, a3, t0 - bne t4, t5, L(out3) - srl.d t0, t6, a6 - sll.d a3, t8, a7 - - or t6, t0, a3 - bne t6, t7, L(out4) - srl.d t0, t8, a6 - bne a0, a4, L(un_loop) - -L(un_less_32bytes): - srai.d a4, a2, 4 - beqz a4, L(un_less_16bytes) - ld.d t2, a0, 8 - ld.d t1, a1, 8 - - ld.d t4, a0, 16 - ld.d t3, a1, 16 - addi.d a0, a0, 16 - addi.d a1, a1, 16 - - addi.d a2, a2, -16 - sll.d a3, t2, a7 - or t0, a3, t0 - bne t0, t1, L(out1) - - srl.d t0, t2, a6 - sll.d a3, t4, a7 - or t2, a3, t0 - bne t2, t3, L(out2) - - srl.d t0, t4, a6 - -L(un_less_16bytes): - srai.d a4, a2, 3 - beqz a4, L(un_less_8bytes) - ld.d t2, a0, 8 - - ld.d t1, a1, 8 - addi.d a0, a0, 8 - addi.d a1, a1, 8 - addi.d a2, a2, -8 - - sll.d a3, t2, a7 - or t0, a3, t0 - bne t0, t1, L(out1) - srl.d t0, t2, a6 - -L(un_less_8bytes): - beqz a2, L(ret) - andi a7, a7, 63 - slli.d a4, a2, 3 - bgeu a7, a4, L(last_cmp) - - ld.d t2, a0, 8 - sll.d a3, t2, a7 - or t0, a3, t0 - -L(last_cmp): - ld.d t1, a1, 8 - - li.d t7, -1 - sll.d t2, t7, a4 - sub.d t3, t0, t1 - andn t6, t3, t2 - - bnez t6, L(count_diff) - move a0, zero - jr ra - -END(MEMCMP_NAME) - -#ifdef _LIBC -libc_hidden_builtin_def (MEMCMP_NAME) -#endif diff --git a/sysdeps/loongarch/lp64/memcpy.S b/sysdeps/loongarch/lp64/memcpy.S deleted file mode 100644 index 4791e1a4..00000000 --- a/sysdeps/loongarch/lp64/memcpy.S +++ /dev/null @@ -1,804 +0,0 @@ -#ifdef _LIBC -#include -#include -#include -#else -#include -#include -#endif - -#ifndef MEMCPY_NAME -#define MEMCPY_NAME memcpy -#endif - -#ifndef MEMMOVE_NAME -#define MEMMOVE_NAME memmove -#endif - -#define LD_64(reg, n) \ - ld.d t0, reg, n; \ - ld.d t1, reg, n+8; \ - ld.d t2, reg, n+16; \ - ld.d t3, reg, n+24; \ - ld.d t4, reg, n+32; \ - ld.d t5, reg, n+40; \ 
- ld.d t6, reg, n+48; \ - ld.d t7, reg, n+56; - -#define ST_64(reg, n) \ - st.d t0, reg, n; \ - st.d t1, reg, n+8; \ - st.d t2, reg, n+16; \ - st.d t3, reg, n+24; \ - st.d t4, reg, n+32; \ - st.d t5, reg, n+40; \ - st.d t6, reg, n+48; \ - st.d t7, reg, n+56; - -LEAF(MEMMOVE_NAME, 6) - sub.d t0, a0, a1 - bltu t0, a2, L(copy_back) - -END(MEMMOVE_NAME) - -#ifdef _LIBC -libc_hidden_builtin_def (MEMMOVE_NAME) -#endif - -LEAF_NO_ALIGN(MEMCPY_NAME) - - srai.d a3, a2, 4 - beqz a3, L(short_data) # less than 16 bytes - - move a4, a0 - andi a5, a0, 0x7 - andi a6, a1, 0x7 - li.d t8, 8 - beqz a5, L(check_align) - - # make dest aligned 8 bytes - sub.d t2, t8, a5 - sub.d a2, a2, t2 - - pcaddi t1, 20 - slli.d t3, t2, 3 - add.d a1, a1, t2 - sub.d t1, t1, t3 - add.d a4, a4, t2 - jr t1 - -L(al7): - ld.b t0, a1, -7 - st.b t0, a4, -7 -L(al6): - ld.b t0, a1, -6 - st.b t0, a4, -6 -L(al5): - ld.b t0, a1, -5 - st.b t0, a4, -5 -L(al4): - ld.b t0, a1, -4 - st.b t0, a4, -4 -L(al3): - ld.b t0, a1, -3 - st.b t0, a4, -3 -L(al2): - ld.b t0, a1, -2 - st.b t0, a4, -2 -L(al1): - ld.b t0, a1, -1 - st.b t0, a4, -1 - -L(check_align): - bne a5, a6, L(unalign) - - srai.d a3, a2, 4 - beqz a3, L(al_less_16bytes) - - andi a3, a2, 0x3f - beq a3, a2, L(al_less_64bytes) - - sub.d t0, a2, a3 - move a2, a3 - add.d a5, a1, t0 - -L(loop_64bytes): - LD_64(a1, 0) - addi.d a1, a1, 64 - ST_64(a4, 0) - - addi.d a4, a4, 64 - bne a1, a5, L(loop_64bytes) - -L(al_less_64bytes): - srai.d a3, a2, 5 - beqz a3, L(al_less_32bytes) - - ld.d t0, a1, 0 - ld.d t1, a1, 8 - ld.d t2, a1, 16 - ld.d t3, a1, 24 - - addi.d a1, a1, 32 - addi.d a2, a2, -32 - - st.d t0, a4, 0 - st.d t1, a4, 8 - st.d t2, a4, 16 - st.d t3, a4, 24 - - addi.d a4, a4, 32 - -L(al_less_32bytes): - srai.d a3, a2, 4 - beqz a3, L(al_less_16bytes) - - ld.d t0, a1, 0 - ld.d t1, a1, 8 - addi.d a1, a1, 16 - addi.d a2, a2, -16 - - st.d t0, a4, 0 - st.d t1, a4, 8 - addi.d a4, a4, 16 - -L(al_less_16bytes): - srai.d a3, a2, 3 - beqz a3, L(al_less_8bytes) - - ld.d t0, a1, 0 - 
addi.d a1, a1, 8 - addi.d a2, a2, -8 - - st.d t0, a4, 0 - addi.d a4, a4, 8 - -L(al_less_8bytes): - srai.d a3, a2, 2 - beqz a3, L(al_less_4bytes) - - ld.w t0, a1, 0 - addi.d a1, a1, 4 - addi.d a2, a2, -4 - - st.w t0, a4, 0 - addi.d a4, a4, 4 - -L(al_less_4bytes): - srai.d a3, a2, 1 - beqz a3, L(al_less_2bytes) - - ld.h t0, a1, 0 - addi.d a1, a1, 2 - addi.d a2, a2, -2 - - st.h t0, a4, 0 - addi.d a4, a4, 2 - -L(al_less_2bytes): - beqz a2, L(al_less_1byte) - - ld.b t0, a1, 0 - st.b t0, a4, 0 - -L(al_less_1byte): - jr ra - -L(unalign): - andi a5, a1, 0x7 - bstrins.d a1, zero, 2, 0 # make src 8 bytes aligned - - sub.d t8, t8, a5 # use t8 to save count of bytes for aligning - slli.d a5, a5, 3 - - ld.d t0, a1, 0 - addi.d a1, a1, 8 - - slli.d a6, t8, 3 - srl.d a7, t0, a5 - - srai.d a3, a2, 4 - beqz a3, L(un_less_16bytes) - - andi a3, a2, 0x3f - beq a3, a2, L(un_less_64bytes) - - sub.d t0, a2, a3 - move a2, a3 - add.d a3, a1, t0 - -# a5 shift right num -# a6 shift left num -# a7 remaining part -L(un_long_bytes): - ld.d t0, a1, 0 - ld.d t1, a1, 8 - ld.d t2, a1, 16 - ld.d t3, a1, 24 - - srl.d t4, t0, a5 - sll.d t0, t0, a6 - - srl.d t5, t1, a5 - sll.d t1, t1, a6 - - srl.d t6, t2, a5 - sll.d t2, t2, a6 - - srl.d t7, t3, a5 - sll.d t3, t3, a6 - - or t0, a7, t0 - or t1, t4, t1 - or t2, t5, t2 - or t3, t6, t3 - - ld.d t4, a1, 32 - ld.d t5, a1, 40 - ld.d t6, a1, 48 - ld.d a7, a1, 56 - - st.d t0, a4, 0 - st.d t1, a4, 8 - st.d t2, a4, 16 - st.d t3, a4, 24 - - addi.d a1, a1, 64 - - srl.d t0, t4, a5 - sll.d t4, t4, a6 - - srl.d t1, t5, a5 - sll.d t5, t5, a6 - - srl.d t2, t6, a5 - sll.d t6, t6, a6 - - sll.d t3, a7, a6 - srl.d a7, a7, a5 - - or t4, t7, t4 - or t5, t0, t5 - or t6, t1, t6 - or t3, t2, t3 - - st.d t4, a4, 32 - st.d t5, a4, 40 - st.d t6, a4, 48 - st.d t3, a4, 56 - - addi.d a4, a4, 64 - bne a3, a1, L(un_long_bytes) - -L(un_less_64bytes): - srai.d a3, a2, 5 - beqz a3, L(un_less_32bytes) - - ld.d t0, a1, 0 - ld.d t1, a1, 8 - ld.d t2, a1, 16 - ld.d t3, a1, 24 - - addi.d a1, a1, 
32 - addi.d a2, a2, -32 - - srl.d t4, t0, a5 - sll.d t0, t0, a6 - - srl.d t5, t1, a5 - sll.d t1, t1, a6 - - srl.d t6, t2, a5 - sll.d t2, t2, a6 - - or t0, a7, t0 - - srl.d a7, t3, a5 - sll.d t3, t3, a6 - - or t1, t4, t1 - or t2, t5, t2 - or t3, t6, t3 - - st.d t0, a4, 0 - st.d t1, a4, 8 - st.d t2, a4, 16 - st.d t3, a4, 24 - - addi.d a4, a4, 32 - -L(un_less_32bytes): - srai.d a3, a2, 4 - beqz a3, L(un_less_16bytes) - - ld.d t0, a1, 0 - ld.d t1, a1, 8 - - addi.d a1, a1, 16 - addi.d a2, a2, -16 - - srl.d t2, t0, a5 - sll.d t3, t0, a6 - - sll.d t4, t1, a6 - or t3, a7, t3 - or t4, t2, t4 - srl.d a7, t1, a5 - - st.d t3, a4, 0 - st.d t4, a4, 8 - - addi.d a4, a4, 16 - -L(un_less_16bytes): - srai.d a3, a2, 3 - beqz a3, L(un_less_8bytes) - - ld.d t0, a1, 0 - - addi.d a1, a1, 8 - addi.d a2, a2, -8 - - sll.d t1, t0, a6 - or t2, a7, t1 - srl.d a7, t0, a5 - - st.d t2, a4, 0 - addi.d a4, a4, 8 - -L(un_less_8bytes): - beqz a2, L(un_less_1byte) - bge t8, a2, 1f # no more data in memory, un_less_8bytes data is stored in a7 - - # combine data in memory and a7(remaining part) - ld.d t0, a1, 0 - sll.d t0, t0, a6 - or a7, a7, t0 - -1: - srai.d a3, a2, 2 - beqz a3, L(un_less_4bytes) - - addi.d a2, a2, -4 - st.w a7, a4, 0 - addi.d a4, a4, 4 - srai.d a7, a7, 32 - -L(un_less_4bytes): - srai.d a3, a2, 1 - beqz a3, L(un_less_2bytes) - - addi.d a2, a2, -2 - st.h a7, a4, 0 - addi.d a4, a4, 2 - srai.d a7, a7, 16 - -L(un_less_2bytes): - beqz a2, L(un_less_1byte) - st.b a7, a4, 0 - -L(un_less_1byte): - jr ra - -# Bytes copying for data less than 16 bytes -L(short_data): - pcaddi t1, 36 - slli.d t2, a2, 3 - add.d a4, a0, a2 - sub.d t1, t1, t2 - add.d a1, a1, a2 - jr t1 - -L(short_15_bytes): - ld.b t0, a1, -15 - st.b t0, a4, -15 -L(short_14_bytes): - ld.b t0, a1, -14 - st.b t0, a4, -14 -L(short_13_bytes): - ld.b t0, a1, -13 - st.b t0, a4, -13 -L(short_12_bytes): - ld.b t0, a1, -12 - st.b t0, a4, -12 -L(short_11_bytes): - ld.b t0, a1, -11 - st.b t0, a4, -11 -L(short_10_bytes): - ld.b t0, a1, -10 - 
st.b t0, a4, -10 -L(short_9_bytes): - ld.b t0, a1, -9 - st.b t0, a4, -9 -L(short_8_bytes): - ld.b t0, a1, -8 - st.b t0, a4, -8 -L(short_7_bytes): - ld.b t0, a1, -7 - st.b t0, a4, -7 -L(short_6_bytes): - ld.b t0, a1, -6 - st.b t0, a4, -6 -L(short_5_bytes): - ld.b t0, a1, -5 - st.b t0, a4, -5 -L(short_4_bytes): - ld.b t0, a1, -4 - st.b t0, a4, -4 -L(short_3_bytes): - ld.b t0, a1, -3 - st.b t0, a4, -3 -L(short_2_bytes): - ld.b t0, a1, -2 - st.b t0, a4, -2 -L(short_1_bytes): - ld.b t0, a1, -1 - st.b t0, a4, -1 - jr ra - -L(copy_back): - srai.d a3, a2, 4 - beqz a3, L(back_short_data) # less than 16 bytes - - add.d a4, a0, a2 # store the tail of dest - add.d a1, a1, a2 # store the tail of src - - andi a5, a4, 0x7 - andi a6, a1, 0x7 - beqz a5, L(back_check_align) - - # make dest aligned 8 bytes - sub.d a2, a2, a5 - sub.d a1, a1, a5 - sub.d a4, a4, a5 - - pcaddi t1, 18 - slli.d t3, a5, 3 - sub.d t1, t1, t3 - jr t1 - - ld.b t0, a1, 6 - st.b t0, a4, 6 - ld.b t0, a1, 5 - st.b t0, a4, 5 - ld.b t0, a1, 4 - st.b t0, a4, 4 - ld.b t0, a1, 3 - st.b t0, a4, 3 - ld.b t0, a1, 2 - st.b t0, a4, 2 - ld.b t0, a1, 1 - st.b t0, a4, 1 - ld.b t0, a1, 0 - st.b t0, a4, 0 - -L(back_check_align): - bne a5, a6, L(back_unalign) - - srai.d a3, a2, 4 - beqz a3, L(back_less_16bytes) - - andi a3, a2, 0x3f - beq a3, a2, L(back_less_64bytes) - - sub.d t0, a2, a3 - move a2, a3 - sub.d a5, a1, t0 - -L(back_loop_64bytes): - LD_64(a1, -64) - addi.d a1, a1, -64 - ST_64(a4, -64) - - addi.d a4, a4, -64 - bne a1, a5, L(back_loop_64bytes) - -L(back_less_64bytes): - srai.d a3, a2, 5 - beqz a3, L(back_less_32bytes) - - ld.d t0, a1, -32 - ld.d t1, a1, -24 - ld.d t2, a1, -16 - ld.d t3, a1, -8 - - addi.d a1, a1, -32 - addi.d a2, a2, -32 - - st.d t0, a4, -32 - st.d t1, a4, -24 - st.d t2, a4, -16 - st.d t3, a4, -8 - - addi.d a4, a4, -32 - -L(back_less_32bytes): - srai.d a3, a2, 4 - beqz a3, L(back_less_16bytes) - - ld.d t0, a1, -16 - ld.d t1, a1, -8 - - addi.d a2, a2, -16 - addi.d a1, a1, -16 - - st.d t0, a4, -16 - st.d 
t1, a4, -8 - addi.d a4, a4, -16 - -L(back_less_16bytes): - srai.d a3, a2, 3 - beqz a3, L(back_less_8bytes) - - ld.d t0, a1, -8 - addi.d a2, a2, -8 - addi.d a1, a1, -8 - - st.d t0, a4, -8 - addi.d a4, a4, -8 - -L(back_less_8bytes): - srai.d a3, a2, 2 - beqz a3, L(back_less_4bytes) - - ld.w t0, a1, -4 - addi.d a2, a2, -4 - addi.d a1, a1, -4 - - st.w t0, a4, -4 - addi.d a4, a4, -4 - -L(back_less_4bytes): - srai.d a3, a2, 1 - beqz a3, L(back_less_2bytes) - - ld.h t0, a1, -2 - addi.d a2, a2, -2 - addi.d a1, a1, -2 - - st.h t0, a4, -2 - addi.d a4, a4, -2 - -L(back_less_2bytes): - beqz a2, L(back_less_1byte) - - ld.b t0, a1, -1 - st.b t0, a4, -1 - -L(back_less_1byte): - jr ra - -L(back_unalign): - andi t8, a1, 0x7 - bstrins.d a1, zero, 2, 0 # make src 8 bytes aligned - - sub.d a6, zero, t8 - - ld.d t0, a1, 0 - slli.d a6, a6, 3 - slli.d a5, t8, 3 - sll.d a7, t0, a6 - - srai.d a3, a2, 4 - beqz a3, L(back_un_less_16bytes) - - andi a3, a2, 0x3f - beq a3, a2, L(back_un_less_64bytes) - - sub.d t0, a2, a3 - move a2, a3 - sub.d a3, a1, t0 - -L(back_un_long_bytes): - ld.d t0, a1, -8 - ld.d t1, a1, -16 - ld.d t2, a1, -24 - ld.d t3, a1, -32 - - sll.d t4, t0, a6 - srl.d t0, t0, a5 - - sll.d t5, t1, a6 - srl.d t1, t1, a5 - - sll.d t6, t2, a6 - srl.d t2, t2, a5 - - sll.d t7, t3, a6 - srl.d t3, t3, a5 - - or t0, t0, a7 - or t1, t1, t4 - or t2, t2, t5 - or t3, t3, t6 - - ld.d t4, a1, -40 - ld.d t5, a1, -48 - ld.d t6, a1, -56 - ld.d a7, a1, -64 - st.d t0, a4, -8 - st.d t1, a4, -16 - st.d t2, a4, -24 - st.d t3, a4, -32 - - addi.d a1, a1, -64 - - sll.d t0, t4, a6 - srl.d t4, t4, a5 - - sll.d t1, t5, a6 - srl.d t5, t5, a5 - - sll.d t2, t6, a6 - srl.d t6, t6, a5 - - srl.d t3, a7, a5 - sll.d a7, a7, a6 - - or t4, t7, t4 - or t5, t0, t5 - or t6, t1, t6 - or t3, t2, t3 - - st.d t4, a4, -40 - st.d t5, a4, -48 - st.d t6, a4, -56 - st.d t3, a4, -64 - - addi.d a4, a4, -64 - bne a3, a1, L(back_un_long_bytes) - -L(back_un_less_64bytes): - srai.d a3, a2, 5 - beqz a3, L(back_un_less_32bytes) - - ld.d 
t0, a1, -8 - ld.d t1, a1, -16 - ld.d t2, a1, -24 - ld.d t3, a1, -32 - - addi.d a1, a1, -32 - addi.d a2, a2, -32 - - sll.d t4, t0, a6 - srl.d t0, t0, a5 - - sll.d t5, t1, a6 - srl.d t1, t1, a5 - - sll.d t6, t2, a6 - srl.d t2, t2, a5 - - or t0, a7, t0 - - sll.d a7, t3, a6 - srl.d t3, t3, a5 - - or t1, t4, t1 - or t2, t5, t2 - or t3, t6, t3 - - st.d t0, a4, -8 - st.d t1, a4, -16 - st.d t2, a4, -24 - st.d t3, a4, -32 - - addi.d a4, a4, -32 - -L(back_un_less_32bytes): - srai.d a3, a2, 4 - beqz a3, L(back_un_less_16bytes) - - ld.d t0, a1, -8 - ld.d t1, a1, -16 - - addi.d a1, a1, -16 - addi.d a2, a2, -16 - - sll.d t2, t0, a6 - srl.d t3, t0, a5 - - srl.d t4, t1, a5 - or t3, a7, t3 - or t4, t2, t4 - sll.d a7, t1, a6 - - st.d t3, a4, -8 - st.d t4, a4, -16 - - addi.d a4, a4, -16 - -L(back_un_less_16bytes): - srai.d a3, a2, 3 - beqz a3, L(back_un_less_8bytes) - - ld.d t0, a1, -8 - - addi.d a1, a1, -8 - addi.d a2, a2, -8 - - srl.d t1, t0, a5 - or t2, a7, t1 - sll.d a7, t0, a6 - - st.d t2, a4, -8 - addi.d a4, a4, -8 - -L(back_un_less_8bytes): - beqz a2, L(back_end) - bge t8, a2, 1f # no more data in memory, un_less_8bytes data is stored in a7 - - # combine data in memory and a7(remaining part) - ld.d t0, a1, -8 - srl.d t0, t0, a5 - or a7, a7, t0 - -1: - srai.d a3, a2, 2 - beqz a3, L(back_un_less_4bytes) - - srai.d t0, a7, 32 - addi.d a2, a2, -4 - st.w t0, a4, -4 - addi.d a4, a4, -4 - slli.d a7, a7, 32 - -L(back_un_less_4bytes): - srai.d a3, a2, 1 - beqz a3, L(back_un_less_2bytes) - srai.d t0, a7, 48 - addi.d a2, a2, -2 - st.h t0, a4, -2 - addi.d a4, a4, -2 - slli.d a7, a7, 16 -L(back_un_less_2bytes): - beqz a2, L(back_un_less_1byte) - srai.d t0, a7, 56 - st.b t0, a4, -1 -L(back_un_less_1byte): - jr ra - -L(back_short_data): - pcaddi t1, 34 - slli.d t2, a2, 3 - sub.d t1, t1, t2 - jr t1 - - ld.b t0, a1, 14 - st.b t0, a0, 14 - ld.b t0, a1, 13 - st.b t0, a0, 13 - ld.b t0, a1, 12 - st.b t0, a0, 12 - ld.b t0, a1, 11 - st.b t0, a0, 11 - ld.b t0, a1, 10 - st.b t0, a0, 10 - ld.b t0, a1, 
9 - st.b t0, a0, 9 - ld.b t0, a1, 8 - st.b t0, a0, 8 - ld.b t0, a1, 7 - st.b t0, a0, 7 - ld.b t0, a1, 6 - st.b t0, a0, 6 - ld.b t0, a1, 5 - st.b t0, a0, 5 - ld.b t0, a1, 4 - st.b t0, a0, 4 - ld.b t0, a1, 3 - st.b t0, a0, 3 - ld.b t0, a1, 2 - st.b t0, a0, 2 - ld.b t0, a1, 1 - st.b t0, a0, 1 - ld.b t0, a1, 0 - st.b t0, a0, 0 -L(back_end): - jr ra - -END(MEMCPY_NAME) - -#ifdef _LIBC -libc_hidden_builtin_def (MEMCPY_NAME) -#endif diff --git a/sysdeps/loongarch/lp64/memmove.S b/sysdeps/loongarch/lp64/memmove.S deleted file mode 100644 index 6d1922c4..00000000 --- a/sysdeps/loongarch/lp64/memmove.S +++ /dev/null @@ -1,2 +0,0 @@ -/* DONT DELETE THIS FILE, OTHERWIES MEMCPY.C WILL BE COMPILED. */ -/* There are too many common code in memcpy and memmove. See memcpy.S */ diff --git a/sysdeps/loongarch/lp64/memset.S b/sysdeps/loongarch/lp64/memset.S deleted file mode 100644 index eabd7d23..00000000 --- a/sysdeps/loongarch/lp64/memset.S +++ /dev/null @@ -1,166 +0,0 @@ -#ifdef _LIBC -#include -#include -#include -#else -#include -#include -#endif - -#ifndef MEMSET_NAME -#define MEMSET_NAME memset -#endif - -#define ST_64(n) \ - st.d a1, a0, n; \ - st.d a1, a0, n+8; \ - st.d a1, a0, n+16; \ - st.d a1, a0, n+24; \ - st.d a1, a0, n+32; \ - st.d a1, a0, n+40; \ - st.d a1, a0, n+48; \ - st.d a1, a0, n+56; - -LEAF(MEMSET_NAME, 6) - move t0, a0 - andi a3, a0, 0x7 - li.w t6, 16 - beqz a3, L(align) - blt a2, t6, L(short_data) - -L(make_align): - li.w t8, 8 - sub.d t2, t8, a3 - pcaddi t1, 11 - slli.d t3, t2, 2 - sub.d t1, t1, t3 - jirl zero, t1, 0 - -L(al7): - st.b a1, t0, 6 -L(al6): - st.b a1, t0, 5 -L(al5): - st.b a1, t0, 4 -L(al4): - st.b a1, t0, 3 -L(al3): - st.b a1, t0, 2 -L(al2): - st.b a1, t0, 1 -L(al1): - st.b a1, t0, 0 -L(al0): - add.d t0, t0, t2 - sub.d a2, a2, t2 - -L(align): - bstrins.d a1, a1, 15, 8 - bstrins.d a1, a1, 31, 16 - bstrins.d a1, a1, 63, 32 - - blt a2, t6, L(less_16bytes) - - andi a4, a2, 0x3f - beq a4, a2, L(less_64bytes) - - sub.d t1, a2, a4 - move a2, a4 - 
add.d a5, t0, t1 - -L(loop_64bytes): - addi.d t0, t0, 64 - st.d a1, t0, -64 - st.d a1, t0, -56 - st.d a1, t0, -48 - st.d a1, t0, -40 - st.d a1, t0, -32 - st.d a1, t0, -24 - st.d a1, t0, -16 - st.d a1, t0, -8 - bne t0, a5, L(loop_64bytes) - -L(less_64bytes): - srai.d a4, a2, 5 - beqz a4, L(less_32bytes) - addi.d a2, a2, -32 - st.d a1, t0, 0 - st.d a1, t0, 8 - st.d a1, t0, 16 - st.d a1, t0, 24 - addi.d t0, t0, 32 -L(less_32bytes): - blt a2, t6, L(less_16bytes) - addi.d a2, a2, -16 - st.d a1, t0, 0 - st.d a1, t0, 8 - addi.d t0, t0, 16 -L(less_16bytes): - srai.d a4, a2, 3 - beqz a4, L(less_8bytes) - addi.d a2, a2, -8 - st.d a1, t0, 0 - addi.d t0, t0, 8 -L(less_8bytes): - beqz a2, L(less_1byte) - srai.d a4, a2, 2 - beqz a4, L(less_4bytes) - addi.d a2, a2, -4 - st.w a1, t0, 0 - addi.d t0, t0, 4 -L(less_4bytes): - srai.d a3, a2, 1 - beqz a3, L(less_2bytes) - addi.d a2, a2, -2 - st.h a1, t0, 0 - addi.d t0, t0, 2 -L(less_2bytes): - beqz a2, L(less_1byte) - st.b a1, t0, 0 -L(less_1byte): - jr ra - -L(short_data): - pcaddi t1, 19 - slli.d t3, a2, 2 - sub.d t1, t1, t3 - jirl zero, t1, 0 -L(short_15): - st.b a1, a0, 14 - -L(short_14): - st.b a1, a0, 13 -L(short_13): - st.b a1, a0, 12 -L(short_12): - st.b a1, a0, 11 -L(short_11): - st.b a1, a0, 10 -L(short_10): - st.b a1, a0, 9 -L(short_9): - st.b a1, a0, 8 -L(short_8): - st.b a1, a0, 7 -L(short_7): - st.b a1, a0, 6 -L(short_6): - st.b a1, a0, 5 -L(short_5): - st.b a1, a0, 4 -L(short_4): - st.b a1, a0, 3 -L(short_3): - st.b a1, a0, 2 -L(short_2): - st.b a1, a0, 1 -L(short_1): - st.b a1, a0, 0 -L(short_0): - jr ra - -END(MEMSET_NAME) - -#ifdef _LIBC -libc_hidden_builtin_def (MEMSET_NAME) -#endif diff --git a/sysdeps/loongarch/lp64/multiarch/memchr-aligned.S b/sysdeps/loongarch/lp64/multiarch/memchr-aligned.S index 4677c912..7dfa3ade 100644 --- a/sysdeps/loongarch/lp64/multiarch/memchr-aligned.S +++ b/sysdeps/loongarch/lp64/multiarch/memchr-aligned.S @@ -1,7 +1,96 @@ +#ifdef _LIBC +#include +#include +#include +#else +#include 
+#include +#endif + #if IS_IN (libc) #define MEMCHR_NAME __memchr_aligned +#else +#define MEMCHR_NAME memchr #endif -#include "../memchr.S" +LEAF(MEMCHR_NAME, 6) + beqz a2, L(out) + andi t1, a0, 0x7 + lu12i.w a3, 0x01010 + sub.d a5, a0, t1 + + bstrins.d a1, a1, 15, 8 + ld.d t0, a5, 0 + slli.d t2, t1, 3 + ori a3, a3, 0x101 + + bstrins.d a1, a1, 31, 16 + li.w t7, -1 + li.w t8, 9 + bstrins.d a3, a3, 63, 32 + + srl.d t3, t7, t2 + bstrins.d a1, a1, 63, 32 + sub.d t4, t8, t1 + orn t3, a1, t3 + + srl.d t0, t0, t2 + slli.d a4, a3, 7 # 0x8080808080808080 + sltu t4, a2, t4 + xor t2, t0, t3 + + sub.d a6, t2, a3 + andn a7, a4, t2 + and t2, a6, a7 + or t3, t2, t4 + + bnez t3, L(count_pos) + addi.d a2, a2, -8 + addi.d a0, a5, 8 + add.d a2, a2, t1 + +L(loop): + ld.d t0, a0, 0 + sltui t4, a2, 9 + xor t2, t0, a1 + sub.d a6, t2, a3 + + andn a7, a4, t2 + and t2, a6, a7 + or t3, t2, t4 + bnez t3, L(count_pos) + + ld.d t1, a0, 8 + addi.d a0, a0, 16 + sltui t4, a2, 17 + xor t2, t1, a1 + + sub.d a6, t2, a3 + andn a7, a4, t2 + and t2, a6, a7 + addi.d a2, a2, -16 + + or t3, t2, t4 + beqz t3, L(loop) + addi.d a0, a0, -8 + addi.d a2, a2, 8 + +L(count_pos): + ctz.d t0, t2 + srli.d t0, t0, 3 + sltu t1, t0, a2 + add.d a0, a0, t0 + + maskeqz a0, a0, t1 + jr ra + +L(out): + move a0, zero + jr ra +END(MEMCHR_NAME) + +#ifdef _LIBC +libc_hidden_builtin_def (MEMCHR_NAME) +#endif diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S b/sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S index 512eabca..9505dfce 100644 --- a/sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S +++ b/sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S @@ -1,11 +1,289 @@ -#if IS_IN (libc) + +#ifdef _LIBC +#include +#include +#include +#else +#include +#include +#endif + +#if IS_IN (libc) #define MEMCMP_NAME __memcmp_aligned +#else +#define MEMCMP_NAME memcmp +#endif + +LEAF(MEMCMP_NAME, 6) + beqz a2, L(ret) + andi a4, a1, 0x7 + andi a3, a0, 0x7 + sltu a5, a4, a3 + + xor t0, a0, a1 + li.w t8, 8 + maskeqz t0, t0, a5 + 
li.w t7, -1 + + xor a0, a0, t0 // a0 hold smaller one + xor a1, a1, t0 // a1 hold larger one + andi a3, a0, 0x7 // a3 hold small offset + andi a4, a1, 0x7 // a4 hold larger offset + + xor a0, a0, a3 + xor a1, a1, a4 + ld.d t2, a0, 0 // t2 = "fedcbaXX" + ld.d t1, a1, 0 // t1 = "54321YYY" + + slli.d t3, a3, 3 + slli.d t4, a4, 3 + sub.d a6, t3, t4 // a6 = 0xfffffffffffffff8 + srl.d t1, t1, t4 // t1 = "00054321" + + srl.d t0, t2, t3 // t0 = "00fedcba" + srl.d t5, t7, t4 // t5 = 0x000000FFFFFFFFFF + sub.d t6, t0, t1 // t6 hold diff + and t6, t6, t5 // t6 = "000xxxxx" + + sub.d t5, t8, a4 // t5 hold margin 8 - 3 = 5 + bnez t6, L(first_out) + bgeu t5, a2, L(ret) + sub.d a2, a2, t5 + + bnez a6, L(unaligned) + blt a2, t8, L(al_less_8bytes) + andi t1, a2, 31 + beq t1, a2, L(al_less_32bytes) + + sub.d t2, a2, t1 + add.d a4, a0, t2 + move a2, t1 + +L(al_loop): + ld.d t0, a0, 8 + + ld.d t1, a1, 8 + ld.d t2, a0, 16 + ld.d t3, a1, 16 + ld.d t4, a0, 24 + + ld.d t5, a1, 24 + ld.d t6, a0, 32 + ld.d t7, a1, 32 + addi.d a0, a0, 32 + + addi.d a1, a1, 32 + bne t0, t1, L(out1) + bne t2, t3, L(out2) + bne t4, t5, L(out3) + + bne t6, t7, L(out4) + bne a0, a4, L(al_loop) + +L(al_less_32bytes): + srai.d a4, a2, 4 + beqz a4, L(al_less_16bytes) + + ld.d t0, a0, 8 + ld.d t1, a1, 8 + ld.d t2, a0, 16 + ld.d t3, a1, 16 + + addi.d a0, a0, 16 + addi.d a1, a1, 16 + addi.d a2, a2, -16 + bne t0, t1, L(out1) + + bne t2, t3, L(out2) + +L(al_less_16bytes): + srai.d a4, a2, 3 + beqz a4, L(al_less_8bytes) + ld.d t0, a0, 8 + + ld.d t1, a1, 8 + addi.d a0, a0, 8 + addi.d a1, a1, 8 + addi.d a2, a2, -8 + + bne t0, t1, L(out1) + +L(al_less_8bytes): + beqz a2, L(ret) + ld.d t0, a0, 8 + ld.d t1, a1, 8 + + li.d t7, -1 + slli.d t2, a2, 3 + sll.d t2, t7, t2 + sub.d t3, t0, t1 + + andn t6, t3, t2 + bnez t6, L(count_diff) + +L(ret): + move a0, zero + jr ra + +L(out4): + move t0, t6 + move t1, t7 + sub.d t6, t6, t7 + b L(count_diff) + +L(out3): + move t0, t4 + move t1, t5 + sub.d t6, t4, t5 + b L(count_diff) + +L(out2): 
+ move t0, t2 + move t1, t3 +L(out1): + sub.d t6, t0, t1 + b L(count_diff) + +L(first_out): + slli.d t4, a2, 3 + slt t3, a2, t5 + sll.d t4, t7, t4 + maskeqz t4, t4, t3 + + andn t6, t6, t4 + +L(count_diff): + ctz.d t2, t6 + bstrins.d t2, zero, 2, 0 + srl.d t0, t0, t2 + + srl.d t1, t1, t2 + andi t0, t0, 0xff + andi t1, t1, 0xff + sub.d t2, t0, t1 + + sub.d t3, t1, t0 + masknez t2, t2, a5 + maskeqz t3, t3, a5 + or a0, t2, t3 + + jr ra + +L(unaligned): + sub.d a7, zero, a6 + srl.d t0, t2, a6 + blt a2, t8, L(un_less_8bytes) + + andi t1, a2, 31 + beq t1, a2, L(un_less_32bytes) + sub.d t2, a2, t1 + add.d a4, a0, t2 + + move a2, t1 + +L(un_loop): + ld.d t2, a0, 8 + ld.d t1, a1, 8 + ld.d t4, a0, 16 + + ld.d t3, a1, 16 + ld.d t6, a0, 24 + ld.d t5, a1, 24 + ld.d t8, a0, 32 + + ld.d t7, a1, 32 + addi.d a0, a0, 32 + addi.d a1, a1, 32 + sll.d a3, t2, a7 + + or t0, a3, t0 + bne t0, t1, L(out1) + srl.d t0, t2, a6 + sll.d a3, t4, a7 + + or t2, a3, t0 + bne t2, t3, L(out2) + srl.d t0, t4, a6 + sll.d a3, t6, a7 + + or t4, a3, t0 + bne t4, t5, L(out3) + srl.d t0, t6, a6 + sll.d a3, t8, a7 + + or t6, t0, a3 + bne t6, t7, L(out4) + srl.d t0, t8, a6 + bne a0, a4, L(un_loop) + +L(un_less_32bytes): + srai.d a4, a2, 4 + beqz a4, L(un_less_16bytes) + ld.d t2, a0, 8 + ld.d t1, a1, 8 + + ld.d t4, a0, 16 + ld.d t3, a1, 16 + addi.d a0, a0, 16 + addi.d a1, a1, 16 + + addi.d a2, a2, -16 + sll.d a3, t2, a7 + or t0, a3, t0 + bne t0, t1, L(out1) + + srl.d t0, t2, a6 + sll.d a3, t4, a7 + or t2, a3, t0 + bne t2, t3, L(out2) + + srl.d t0, t4, a6 + +L(un_less_16bytes): + srai.d a4, a2, 3 + beqz a4, L(un_less_8bytes) + ld.d t2, a0, 8 + + ld.d t1, a1, 8 + addi.d a0, a0, 8 + addi.d a1, a1, 8 + addi.d a2, a2, -8 + + sll.d a3, t2, a7 + or t0, a3, t0 + bne t0, t1, L(out1) + srl.d t0, t2, a6 + +L(un_less_8bytes): + beqz a2, L(ret) + andi a7, a7, 63 + slli.d a4, a2, 3 + bgeu a7, a4, L(last_cmp) + + ld.d t2, a0, 8 + sll.d a3, t2, a7 + or t0, a3, t0 + +L(last_cmp): + ld.d t1, a1, 8 + + li.d t7, -1 + sll.d t2, t7, 
a4
+ sub.d t3, t0, t1
+ andn t6, t3, t2
+
+ bnez t6, L(count_diff)
+ move a0, zero
+ jr ra
+
+END(MEMCMP_NAME)
+#ifdef _LIBC
+libc_hidden_builtin_def (MEMCMP_NAME)
 #endif
-#include "../memcmp.S"
 # undef bcmp
 weak_alias (MEMCMP_NAME, bcmp)
diff --git a/sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S b/sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S
index 5ff8b4e6..3fc86a7f 100644
--- a/sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S
+++ b/sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S
@@ -1,11 +1,804 @@
-
+#ifdef _LIBC
+#include
+#include
+#include
+#else
+#include
+#include
+#endif
 #if IS_IN (libc)
-
 #define MEMCPY_NAME __memcpy_aligned
 #define MEMMOVE_NAME __memmove_aligned
+#else
+#define MEMCPY_NAME memcpy
+#define MEMMOVE_NAME memmove
+#endif
+
+#define LD_64(reg, n) \
+ ld.d t0, reg, n; \
+ ld.d t1, reg, n+8; \
+ ld.d t2, reg, n+16; \
+ ld.d t3, reg, n+24; \
+ ld.d t4, reg, n+32; \
+ ld.d t5, reg, n+40; \
+ ld.d t6, reg, n+48; \
+ ld.d t7, reg, n+56;
+
+#define ST_64(reg, n) \
+ st.d t0, reg, n; \
+ st.d t1, reg, n+8; \
+ st.d t2, reg, n+16; \
+ st.d t3, reg, n+24; \
+ st.d t4, reg, n+32; \
+ st.d t5, reg, n+40; \
+ st.d t6, reg, n+48; \
+ st.d t7, reg, n+56;
+LEAF(MEMMOVE_NAME, 6)
+ sub.d t0, a0, a1  # t0 = dest - src (a0 = dest, a1 = src, a2 = byte count)
+ bltu t0, a2, L(copy_back)  # unsigned (dest - src) < count: copy from the tail downwards
+
+END(MEMMOVE_NAME)
+
+#ifdef _LIBC
+libc_hidden_builtin_def (MEMMOVE_NAME)
 #endif
-#include "../memcpy.S"
+LEAF_NO_ALIGN(MEMCPY_NAME)  # NOTE(review): presumably also reached by fall-through from MEMMOVE_NAME - confirm LEAF_NO_ALIGN emits no padding
+
+ srai.d a3, a2, 4
+ beqz a3, L(short_data) # less than 16 bytes
+
+ move a4, a0  # a4 = running dest pointer; a0 itself is never written in this file
+ andi a5, a0, 0x7  # a5 = dest & 7
+ andi a6, a1, 0x7  # a6 = src & 7
+ li.d t8, 8
+ beqz a5, L(check_align)
+
+ # make dest aligned 8 bytes
+ sub.d t2, t8, a5  # t2 = 8 - (dest & 7) = head bytes to copy one at a time
+ sub.d a2, a2, t2
+
+ pcaddi t1, 20  # t1 = pc + 20 instructions = L(check_align)
+ slli.d t3, t2, 3  # each ld.b/st.b pair below is 8 bytes of code
+ add.d a1, a1, t2
+ sub.d t1, t1, t3  # back up t2 pairs so exactly t2 bytes get copied
+ add.d a4, a4, t2
+ jr t1
+
+L(al7):
+ ld.b t0, a1, -7
+ st.b t0, a4, -7
+L(al6):
+ ld.b t0, a1, -6
+ st.b t0, a4, -6
+L(al5):
+ ld.b t0, a1, -5
+ st.b t0, a4, -5
+L(al4):
+ ld.b t0, a1, -4
+ st.b t0, a4, -4
+L(al3):
+ ld.b t0, a1, -3
+ st.b t0, a4, -3
+L(al2):
+ ld.b t0, a1, -2
+ st.b t0, a4, -2
+L(al1):
+ ld.b t0, a1, -1
+ st.b t0, a4, -1
+
+L(check_align):
+ bne a5, a6, L(unalign)  # entry-time low 3 bits differ: src is not 8-byte aligned here
+
+ srai.d a3, a2, 4
+ beqz a3, L(al_less_16bytes)
+
+ andi a3, a2, 0x3f  # a3 = count % 64
+ beq a3, a2, L(al_less_64bytes)  # count < 64: skip the 64-byte loop
+
+ sub.d t0, a2, a3  # t0 = bytes handled by the loop (multiple of 64)
+ move a2, a3  # a2 = tail left for the cascade below
+ add.d a5, a1, t0  # a5 = src address at which the loop stops
+
+L(loop_64bytes):
+ LD_64(a1, 0)
+ addi.d a1, a1, 64
+ ST_64(a4, 0)
+
+ addi.d a4, a4, 64
+ bne a1, a5, L(loop_64bytes)
+
+L(al_less_64bytes):
+ srai.d a3, a2, 5
+ beqz a3, L(al_less_32bytes)
+
+ ld.d t0, a1, 0
+ ld.d t1, a1, 8
+ ld.d t2, a1, 16
+ ld.d t3, a1, 24
+
+ addi.d a1, a1, 32
+ addi.d a2, a2, -32
+
+ st.d t0, a4, 0
+ st.d t1, a4, 8
+ st.d t2, a4, 16
+ st.d t3, a4, 24
+
+ addi.d a4, a4, 32
+
+L(al_less_32bytes):
+ srai.d a3, a2, 4
+ beqz a3, L(al_less_16bytes)
+
+ ld.d t0, a1, 0
+ ld.d t1, a1, 8
+ addi.d a1, a1, 16
+ addi.d a2, a2, -16
+
+ st.d t0, a4, 0
+ st.d t1, a4, 8
+ addi.d a4, a4, 16
+
+L(al_less_16bytes):
+ srai.d a3, a2, 3
+ beqz a3, L(al_less_8bytes)
+
+ ld.d t0, a1, 0
+ addi.d a1, a1, 8
+ addi.d a2, a2, -8
+
+ st.d t0, a4, 0
+ addi.d a4, a4, 8
+
+L(al_less_8bytes):
+ srai.d a3, a2, 2
+ beqz a3, L(al_less_4bytes)
+
+ ld.w t0, a1, 0
+ addi.d a1, a1, 4
+ addi.d a2, a2, -4
+
+ st.w t0, a4, 0
+ addi.d a4, a4, 4
+
+L(al_less_4bytes):
+ srai.d a3, a2, 1
+ beqz a3, L(al_less_2bytes)
+
+ ld.h t0, a1, 0
+ addi.d a1, a1, 2
+ addi.d a2, a2, -2
+
+ st.h t0, a4, 0
+ addi.d a4, a4, 2
+
+L(al_less_2bytes):
+ beqz a2, L(al_less_1byte)
+
+ ld.b t0, a1, 0
+ st.b t0, a4, 0
+
+L(al_less_1byte):
+ jr ra
+
+L(unalign):
+ andi a5, a1, 0x7  # a5 = current src & 7, in bytes
+ bstrins.d a1, zero, 2, 0 # make src 8 bytes aligned
+
+ sub.d t8, t8, a5 # use t8 to save count of bytes for aligning
+ slli.d a5, a5, 3  # a5 = src misalignment in bits
+
+ ld.d t0, a1, 0
+ addi.d a1, a1, 8
+
+ slli.d a6, t8, 3  # a6 = (8 - misalignment) in bits
+ srl.d a7, t0, a5  # a7 = bytes of the first dword at and above the src address, moved to the low end
+
+ srai.d a3, a2, 4
+ beqz a3, L(un_less_16bytes)
+
+ andi a3, a2, 0x3f
+ beq a3, a2, L(un_less_64bytes)
+
+ sub.d t0, a2, a3
+ move a2, a3
+ add.d a3, a1, t0  # a3 = aligned src address at which the loop stops
+
+# a5 shift right num
+# a6 shift left num
+# a7 remaining part
+L(un_long_bytes):
+ ld.d t0, a1, 0
+ ld.d t1, a1, 8
+ ld.d t2, a1, 16
+ ld.d t3, a1, 24
+
+ srl.d t4, t0, a5
+ sll.d t0, t0, a6
+
+ srl.d t5, t1, a5
+ sll.d t1, t1, a6
+
+ srl.d t6, t2, a5
+ sll.d t2, t2, a6
+
+ srl.d t7, t3, a5
+ sll.d t3, t3, a6
+
+ or t0, a7, t0  # dest dword = carried part of previous dword | low part of this dword shifted up
+ or t1, t4, t1
+ or t2, t5, t2
+ or t3, t6, t3
+
+ ld.d t4, a1, 32
+ ld.d t5, a1, 40
+ ld.d t6, a1, 48
+ ld.d a7, a1, 56
+
+ st.d t0, a4, 0
+ st.d t1, a4, 8
+ st.d t2, a4, 16
+ st.d t3, a4, 24
+
+ addi.d a1, a1, 64
+
+ srl.d t0, t4, a5
+ sll.d t4, t4, a6
+
+ srl.d t1, t5, a5
+ sll.d t5, t5, a6
+
+ srl.d t2, t6, a5
+ sll.d t6, t6, a6
+
+ sll.d t3, a7, a6
+ srl.d a7, a7, a5  # carry the unused part of the eighth dword into the next round
+
+ or t4, t7, t4
+ or t5, t0, t5
+ or t6, t1, t6
+ or t3, t2, t3
+
+ st.d t4, a4, 32
+ st.d t5, a4, 40
+ st.d t6, a4, 48
+ st.d t3, a4, 56
+
+ addi.d a4, a4, 64
+ bne a3, a1, L(un_long_bytes)
+
+L(un_less_64bytes):
+ srai.d a3, a2, 5
+ beqz a3, L(un_less_32bytes)
+
+ ld.d t0, a1, 0
+ ld.d t1, a1, 8
+ ld.d t2, a1, 16
+ ld.d t3, a1, 24
+
+ addi.d a1, a1, 32
+ addi.d a2, a2, -32
+
+ srl.d t4, t0, a5
+ sll.d t0, t0, a6
+
+ srl.d t5, t1, a5
+ sll.d t1, t1, a6
+
+ srl.d t6, t2, a5
+ sll.d t2, t2, a6
+
+ or t0, a7, t0
+
+ srl.d a7, t3, a5
+ sll.d t3, t3, a6
+
+ or t1, t4, t1
+ or t2, t5, t2
+ or t3, t6, t3
+
+ st.d t0, a4, 0
+ st.d t1, a4, 8
+ st.d t2, a4, 16
+ st.d t3, a4, 24
+
+ addi.d a4, a4, 32
+
+L(un_less_32bytes):
+ srai.d a3, a2, 4
+ beqz a3, L(un_less_16bytes)
+
+ ld.d t0, a1, 0
+ ld.d t1, a1, 8
+
+ addi.d a1, a1, 16
+ addi.d a2, a2, -16
+
+ srl.d t2, t0, a5
+ sll.d t3, t0, a6
+
+ sll.d t4, t1, a6
+ or t3, a7, t3
+ or t4, t2, t4
+ srl.d a7, t1, a5
+
+ st.d t3, a4, 0
+ st.d t4, a4, 8
+
+ addi.d a4, a4, 16
+
+L(un_less_16bytes):
+ srai.d a3, a2, 3
+ beqz a3, L(un_less_8bytes)
+
+ ld.d t0, a1, 0
+
+ addi.d a1, a1, 8
+ addi.d a2, a2, -8
+
+ sll.d t1, t0, a6
+ or t2, a7, t1
+ srl.d a7, t0, a5
+
+ st.d t2, a4, 0
+ addi.d a4, a4, 8
+
+L(un_less_8bytes):
+ beqz a2, L(un_less_1byte)
+ bge t8, a2, 1f # no more data in memory, un_less_8bytes data is stored in a7
+
+ # combine data in memory and a7(remaining part)
+ ld.d t0, a1, 0
+ sll.d t0, t0, a6
+ or a7, a7, t0
+
+1:
+ srai.d a3, a2, 2
+ beqz a3, L(un_less_4bytes)
+
+ addi.d a2, a2, -4
+ st.w a7, a4, 0
+ addi.d a4, a4, 4
+ srai.d a7, a7, 32  # drop the 4 bytes just stored
+
+L(un_less_4bytes):
+ srai.d a3, a2, 1
+ beqz a3, L(un_less_2bytes)
+
+ addi.d a2, a2, -2
+ st.h a7, a4, 0
+ addi.d a4, a4, 2
+ srai.d a7, a7, 16  # drop the 2 bytes just stored
+L(un_less_2bytes):
+ beqz a2, L(un_less_1byte)
+ st.b a7, a4, 0
+
+L(un_less_1byte):
+ jr ra
+
+# Bytes copying for data less than 16 bytes
+L(short_data):
+ pcaddi t1, 36  # t1 = pc + 36 instructions = the jr ra that ends the table below
+ slli.d t2, a2, 3  # 8 bytes of code (one ld.b/st.b pair) per byte to copy
+ add.d a4, a0, a2  # a4 = dest + count
+ sub.d t1, t1, t2  # count == 0 lands directly on jr ra
+ add.d a1, a1, a2  # a1 = src + count
+ jr t1
+
+L(short_15_bytes):
+ ld.b t0, a1, -15
+ st.b t0, a4, -15
+L(short_14_bytes):
+ ld.b t0, a1, -14
+ st.b t0, a4, -14
+L(short_13_bytes):
+ ld.b t0, a1, -13
+ st.b t0, a4, -13
+L(short_12_bytes):
+ ld.b t0, a1, -12
+ st.b t0, a4, -12
+L(short_11_bytes):
+ ld.b t0, a1, -11
+ st.b t0, a4, -11
+L(short_10_bytes):
+ ld.b t0, a1, -10
+ st.b t0, a4, -10
+L(short_9_bytes):
+ ld.b t0, a1, -9
+ st.b t0, a4, -9
+L(short_8_bytes):
+ ld.b t0, a1, -8
+ st.b t0, a4, -8
+L(short_7_bytes):
+ ld.b t0, a1, -7
+ st.b t0, a4, -7
+L(short_6_bytes):
+ ld.b t0, a1, -6
+ st.b t0, a4, -6
+L(short_5_bytes):
+ ld.b t0, a1, -5
+ st.b t0, a4, -5
+L(short_4_bytes):
+ ld.b t0, a1, -4
+ st.b t0, a4, -4
+L(short_3_bytes):
+ ld.b t0, a1, -3
+ st.b t0, a4, -3
+L(short_2_bytes):
+ ld.b t0, a1, -2
+ st.b t0, a4, -2
+L(short_1_bytes):
+ ld.b t0, a1, -1
+ st.b t0, a4, -1
+ jr ra
+
+L(copy_back):
+ srai.d a3, a2, 4
+ beqz a3, L(back_short_data) # less than 16 bytes
+
+ add.d a4, a0, a2 # store the tail of dest
+ add.d a1, a1, a2 # store the tail of src
+
+ andi a5, a4, 0x7  # a5 = dest tail & 7 = bytes above the last 8-byte boundary
+ andi a6, a1, 0x7  # a6 = src tail & 7
+ beqz a5, L(back_check_align)
+
+ # make dest aligned 8 bytes
+ sub.d a2, a2, a5
+ sub.d a1, a1, a5
+ sub.d a4, a4, a5
+
+ pcaddi t1, 18  # t1 = pc + 18 instructions = L(back_check_align)
+ slli.d t3, a5, 3  # 8 bytes of code per byte to copy
+ sub.d t1, t1, t3
+ jr t1
+
+ ld.b t0, a1, 6
+ st.b t0, a4, 6
+ ld.b t0, a1, 5
+ st.b t0, a4, 5
+ ld.b t0, a1, 4
+ st.b t0, a4, 4
+ ld.b t0, a1, 3
+ st.b t0, a4, 3
+ ld.b t0, a1, 2
+ st.b t0, a4, 2
+ ld.b t0, a1, 1
+ st.b t0, a4, 1
+ ld.b t0, a1, 0
+ st.b t0, a4, 0
+
+L(back_check_align):
+ bne a5, a6, L(back_unalign)  # tail low 3 bits differ: src tail is not 8-byte aligned here
+
+ srai.d a3, a2, 4
+ beqz a3, L(back_less_16bytes)
+
+ andi a3, a2, 0x3f
+ beq a3, a2, L(back_less_64bytes)
+
+ sub.d t0, a2, a3
+ move a2, a3
+ sub.d a5, a1, t0  # a5 = src address at which the descending loop stops
+
+L(back_loop_64bytes):
+ LD_64(a1, -64)
+ addi.d a1, a1, -64
+ ST_64(a4, -64)
+
+ addi.d a4, a4, -64
+ bne a1, a5, L(back_loop_64bytes)
+
+L(back_less_64bytes):
+ srai.d a3, a2, 5
+ beqz a3, L(back_less_32bytes)
+
+ ld.d t0, a1, -32
+ ld.d t1, a1, -24
+ ld.d t2, a1, -16
+ ld.d t3, a1, -8
+
+ addi.d a1, a1, -32
+ addi.d a2, a2, -32
+
+ st.d t0, a4, -32
+ st.d t1, a4, -24
+ st.d t2, a4, -16
+ st.d t3, a4, -8
+
+ addi.d a4, a4, -32
+
+L(back_less_32bytes):
+ srai.d a3, a2, 4
+ beqz a3, L(back_less_16bytes)
+
+ ld.d t0, a1, -16
+ ld.d t1, a1, -8
+
+ addi.d a2, a2, -16
+ addi.d a1, a1, -16
+
+ st.d t0, a4, -16
+ st.d t1, a4, -8
+ addi.d a4, a4, -16
+
+L(back_less_16bytes):
+ srai.d a3, a2, 3
+ beqz a3, L(back_less_8bytes)
+
+ ld.d t0, a1, -8
+ addi.d a2, a2, -8
+ addi.d a1, a1, -8
+
+ st.d t0, a4, -8
+ addi.d a4, a4, -8
+
+L(back_less_8bytes):
+ srai.d a3, a2, 2
+ beqz a3, L(back_less_4bytes)
+
+ ld.w t0, a1, -4
+ addi.d a2, a2, -4
+ addi.d a1, a1, -4
+
+ st.w t0, a4, -4
+ addi.d a4, a4, -4
+
+L(back_less_4bytes):
+ srai.d a3, a2, 1
+ beqz a3, L(back_less_2bytes)
+
+ ld.h t0, a1, -2
+ addi.d a2, a2, -2
+ addi.d a1, a1, -2
+
+ st.h t0, a4, -2
+ addi.d a4, a4, -2
+
+L(back_less_2bytes):
+ beqz a2, L(back_less_1byte)
+
+ ld.b t0, a1, -1
+ st.b t0, a4, -1
+
+L(back_less_1byte):
+ jr ra
+
+L(back_unalign):
+ andi t8, a1, 0x7  # t8 = src tail & 7 = bytes of the tail dword that belong to the copy
+ bstrins.d a1, zero, 2, 0 # make src 8 bytes aligned
+
+ sub.d a6, zero, t8
+
+ ld.d t0, a1, 0
+ slli.d a6, a6, 3  # a6 = -(8 * t8); sll.d uses the low 6 bits, i.e. 64 - 8 * t8
+ slli.d a5, t8, 3  # a5 = 8 * t8
+ sll.d a7, t0, a6  # a7 = bytes below the src tail, moved to the high end
+
+ srai.d a3, a2, 4
+ beqz a3, L(back_un_less_16bytes)
+
+ andi a3, a2, 0x3f
+ beq a3, a2, L(back_un_less_64bytes)
+
+ sub.d t0, a2, a3
+ move a2, a3
+ sub.d a3, a1, t0  # a3 = aligned src address at which the descending loop stops
+
+L(back_un_long_bytes):
+ ld.d t0, a1, -8  # walk src downwards, 8 aligned dwords (64 bytes) per iteration
+ ld.d t1, a1, -16
+ ld.d t2, a1, -24
+ ld.d t3, a1, -32
+
+ sll.d t4, t0, a6  # part of this dword that belongs to the next lower dest dword
+ srl.d t0, t0, a5  # part of this dword that completes the current dest dword
+
+ sll.d t5, t1, a6
+ srl.d t1, t1, a5
+
+ sll.d t6, t2, a6
+ srl.d t2, t2, a5
+
+ sll.d t7, t3, a6
+ srl.d t3, t3, a5
+
+ or t0, t0, a7  # a7 = bits carried from the previously loaded higher-address dword
+ or t1, t1, t4
+ or t2, t2, t5
+ or t3, t3, t6
+
+ ld.d t4, a1, -40
+ ld.d t5, a1, -48
+ ld.d t6, a1, -56
+ ld.d a7, a1, -64
+ st.d t0, a4, -8
+ st.d t1, a4, -16
+ st.d t2, a4, -24
+ st.d t3, a4, -32
+
+ addi.d a1, a1, -64
+
+ sll.d t0, t4, a6
+ srl.d t4, t4, a5
+
+ sll.d t1, t5, a6
+ srl.d t5, t5, a5
+
+ sll.d t2, t6, a6
+ srl.d t6, t6, a5
+
+ srl.d t3, a7, a5
+ sll.d a7, a7, a6  # carry the unused part of the lowest dword into the next round
+
+ or t4, t7, t4
+ or t5, t0, t5
+ or t6, t1, t6
+ or t3, t2, t3
+
+ st.d t4, a4, -40
+ st.d t5, a4, -48
+ st.d t6, a4, -56
+ st.d t3, a4, -64
+
+ addi.d a4, a4, -64
+ bne a3, a1, L(back_un_long_bytes)  # loop until src reaches the stop address held in a3
+
+L(back_un_less_64bytes):
+ srai.d a3, a2, 5
+ beqz a3, L(back_un_less_32bytes)  # a2 >> 5 == 0: fewer than 32 bytes left
+
+ ld.d t0, a1, -8
+ ld.d t1, a1, -16
+ ld.d t2, a1, -24
+ ld.d t3, a1, -32
+
+ addi.d a1, a1, -32
+ addi.d a2, a2, -32
+
+ sll.d t4, t0, a6
+ srl.d t0, t0, a5
+
+ sll.d t5, t1, a6
+ srl.d t1, t1, a5
+
+ sll.d t6, t2, a6
+ srl.d t2, t2, a5
+
+ or t0, a7, t0
+
+ sll.d a7, t3, a6  # new carry for the steps that follow
+ srl.d t3, t3, a5
+
+ or t1, t4, t1
+ or t2, t5, t2
+ or t3, t6, t3
+
+ st.d t0, a4, -8
+ st.d t1, a4, -16
+ st.d t2, a4, -24
+ st.d t3, a4, -32
+
+ addi.d a4, a4, -32
+
+L(back_un_less_32bytes):
+ srai.d a3, a2, 4
+ beqz a3, L(back_un_less_16bytes)  # a2 >> 4 == 0: fewer than 16 bytes left
+
+ ld.d t0, a1, -8
+ ld.d t1, a1, -16
+
+ addi.d a1, a1, -16
+ addi.d a2, a2, -16
+
+ sll.d t2, t0, a6
+ srl.d t3, t0, a5
+
+ srl.d t4, t1, a5
+ or t3, a7, t3
+ or t4, t2, t4
+ sll.d a7, t1, a6
+
+ st.d t3, a4, -8
+ st.d t4, a4, -16
+
+ addi.d a4, a4, -16
+
+L(back_un_less_16bytes):
+ srai.d a3, a2, 3
+ beqz a3, L(back_un_less_8bytes)  # a2 >> 3 == 0: fewer than 8 bytes left
+
+ ld.d t0, a1, -8
+
+ addi.d a1, a1, -8
+ addi.d a2, a2, -8
+
+ srl.d t1, t0, a5
+ or t2, a7, t1
+ sll.d a7, t0, a6
+
+ st.d t2, a4, -8
+ addi.d a4, a4, -8
+
+L(back_un_less_8bytes):
+ beqz a2, L(back_end)
+ bge t8, a2, 1f # no more data in memory, un_less_8bytes data is stored in a7
+
+ # combine data in memory and a7(remaining part)
+ ld.d t0, a1, -8
+ srl.d t0, t0, a5
+ or a7, a7, t0
+
+1:
+ srai.d a3, a2, 2
+ beqz a3, L(back_un_less_4bytes)
+
+ srai.d t0, a7, 32  # the top 4 bytes of a7 are stored next
+ addi.d a2, a2, -4
+ st.w t0, a4, -4
+ addi.d a4, a4, -4
+ slli.d a7, a7, 32  # drop the 4 bytes just stored
+
+L(back_un_less_4bytes):
+ srai.d a3, a2, 1
+ beqz a3, L(back_un_less_2bytes)
+ srai.d t0, a7, 48  # the top 2 bytes of a7 are stored next
+ addi.d a2, a2, -2
+ st.h t0, a4, -2
+ addi.d a4, a4, -2
+ slli.d a7, a7, 16  # drop the 2 bytes just stored
+L(back_un_less_2bytes):
+ beqz a2, L(back_un_less_1byte)
+ srai.d t0, a7, 56  # the top byte of a7 is the last byte to store
+ st.b t0, a4, -1
+L(back_un_less_1byte):
+ jr ra
+
+L(back_short_data):
+ pcaddi t1, 34  # t1 = pc + 34 instructions = L(back_end)
+ slli.d t2, a2, 3  # 8 bytes of code (one ld.b/st.b pair) per byte to copy
+ sub.d t1, t1, t2  # a2 == 0 lands directly on L(back_end)
+ jr t1
+
+ ld.b t0, a1, 14
+ st.b t0, a0, 14
+ ld.b t0, a1, 13
+ st.b t0, a0, 13
+ ld.b t0, a1, 12
+ st.b t0, a0, 12
+ ld.b t0, a1, 11
+ st.b t0, a0, 11
+ ld.b t0, a1, 10
+ st.b t0, a0, 10
+ ld.b t0, a1, 9
+ st.b t0, a0, 9
+ ld.b t0, a1, 8
+ st.b t0, a0, 8
+ ld.b t0, a1, 7
+ st.b t0, a0, 7
+ ld.b t0, a1, 6
+ st.b t0, a0, 6
+ ld.b t0, a1, 5
+ st.b t0, a0, 5
+ ld.b t0, a1, 4
+ st.b t0, a0, 4
+ ld.b t0, a1, 3
+ st.b t0, a0, 3
+ ld.b t0, a1, 2
+ st.b t0, a0, 2
+ ld.b t0, a1, 1
+ st.b t0, a0, 1
+ ld.b t0, a1, 0
+ st.b t0, a0, 0
+L(back_end):
+ jr ra
+
+END(MEMCPY_NAME)
+
+#ifdef _LIBC
+libc_hidden_builtin_def (MEMCPY_NAME)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/memset-aligned.S b/sysdeps/loongarch/lp64/multiarch/memset-aligned.S
index da2f5ada..412ee849 100644
--- a/sysdeps/loongarch/lp64/multiarch/memset-aligned.S
+++ b/sysdeps/loongarch/lp64/multiarch/memset-aligned.S
@@ -1,9 +1,169 @@
+#ifdef _LIBC
+#include
+#include
+#include
+#else
+#include
+#include
+#endif
 #if IS_IN (libc)
-
 #define MEMSET_NAME __memset_aligned
-
+#else
+#define MEMSET_NAME memset
 #endif
-#include "../memset.S"
+#define ST_64(n) \
+ st.d a1, a0, n; \
+ st.d a1, a0, n+8; \
+ st.d a1, a0, n+16; \
+ st.d a1, a0, n+24; \
+ st.d a1, a0, n+32; \
+ st.d a1, a0, n+40; \
+ st.d a1, a0, n+48; \
+ st.d a1, a0, n+56;
+
+LEAF(MEMSET_NAME, 6)
+ move t0, a0  # t0 = running store pointer; a0 itself is never written in this routine
+ andi a3, a0, 0x7  # a3 = start address & 7
+ li.w t6, 16
+ beqz a3, L(align)
+ blt a2, t6, L(short_data)  # unaligned start and a2 < 16: byte-store table
+
+L(make_align):
+ li.w t8, 8
+ sub.d t2, t8, a3  # t2 = 8 - (start & 7) = head bytes to store one at a time
+ pcaddi t1, 11  # t1 = pc + 11 instructions = L(al0)
+ slli.d t3, t2, 2  # each st.b below is 4 bytes of code
+ sub.d t1, t1, t3  # back up t2 stores from L(al0)
+ jirl zero, t1, 0
+
+L(al7):
+ st.b a1, t0, 6
+L(al6):
+ st.b a1, t0, 5
+L(al5):
+ st.b a1, t0, 4
+L(al4):
+ st.b a1, t0, 3
+L(al3):
+ st.b a1, t0, 2
+L(al2):
+ st.b a1, t0, 1
+L(al1):
+ st.b a1, t0, 0
+L(al0):
+ add.d t0, t0, t2
+ sub.d a2, a2, t2
+
+L(align):
+ bstrins.d a1, a1, 15, 8
+ bstrins.d a1, a1, 31, 16
+ bstrins.d a1, a1, 63, 32  # a1 = low byte of a1 replicated into all 8 bytes
+
+ blt a2, t6, L(less_16bytes)
+
+ andi a4, a2, 0x3f  # a4 = count % 64
+ beq a4, a2, L(less_64bytes)  # count < 64: skip the 64-byte loop
+
+ sub.d t1, a2, a4  # t1 = bytes handled by the loop (multiple of 64)
+ move a2, a4
+ add.d a5, t0, t1  # a5 = address at which the loop stops
+
+L(loop_64bytes):
+ addi.d t0, t0, 64
+ st.d a1, t0, -64
+ st.d a1, t0, -56
+ st.d a1, t0, -48
+ st.d a1, t0, -40
+ st.d a1, t0, -32
+ st.d a1, t0, -24
+ st.d a1, t0, -16
+ st.d a1, t0, -8
+ bne t0, a5, L(loop_64bytes)
+
+L(less_64bytes):
+ srai.d a4, a2, 5
+ beqz a4, L(less_32bytes)
+ addi.d a2, a2, -32
+ st.d a1, t0, 0
+ st.d a1, t0, 8
+ st.d a1, t0, 16
+ st.d a1, t0, 24
+ addi.d t0, t0, 32
+L(less_32bytes):
+ blt a2, t6, L(less_16bytes)
+ addi.d a2, a2, -16
+ st.d a1, t0, 0
+ st.d a1, t0, 8
+ addi.d t0, t0, 16
+L(less_16bytes):
+ srai.d a4, a2, 3
+ beqz a4, L(less_8bytes)
+ addi.d a2, a2, -8
+ st.d a1, t0, 0
+ addi.d t0, t0, 8
+L(less_8bytes):
+ beqz a2, L(less_1byte)
+ srai.d a4, a2, 2
+ beqz a4, L(less_4bytes)
+ addi.d a2, a2, -4
+ st.w a1, t0, 0
+ addi.d t0, t0, 4
+L(less_4bytes):
+ srai.d a3, a2, 1
+ beqz a3, L(less_2bytes)
+ addi.d a2, a2, -2
+ st.h a1, t0, 0
+ addi.d t0, t0, 2
+L(less_2bytes):
+ beqz a2, L(less_1byte)
+ st.b a1, t0, 0
+L(less_1byte):
+ jr ra
+
+L(short_data):
+ pcaddi t1, 19  # t1 = pc + 19 instructions = L(short_0)
+ slli.d t3, a2, 2  # 4 bytes of code (one st.b) per byte to store
+ sub.d t1, t1, t3  # a2 == 0 lands directly on L(short_0)
+ jirl zero, t1, 0
+L(short_15):
+ st.b a1, a0, 14
+
+L(short_14):
+ st.b a1, a0, 13
+L(short_13):
+ st.b a1, a0, 12
+L(short_12):
+ st.b a1, a0, 11
+L(short_11):
+ st.b a1, a0, 10
+L(short_10):
+ st.b a1, a0, 9
+L(short_9):
+ st.b a1, a0, 8
+L(short_8):
+ st.b a1, a0, 7
+L(short_7):
+ st.b a1, a0, 6
+L(short_6):
+ st.b a1, a0, 5
+L(short_5):
+ st.b a1, a0, 4
+L(short_4):
+ st.b a1, a0, 3
+L(short_3):
+ st.b a1, a0, 2
+L(short_2):
+ st.b a1, a0, 1
+L(short_1):
+ st.b a1, a0, 0
+L(short_0):
+ jr ra
+
+END(MEMSET_NAME)
+
+#ifdef _LIBC
+libc_hidden_builtin_def (MEMSET_NAME)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/rawmemchr-aligned.S b/sysdeps/loongarch/lp64/multiarch/rawmemchr-aligned.S
index 0b46b4ca..a13e293f 100644
--- a/sysdeps/loongarch/lp64/multiarch/rawmemchr-aligned.S
+++ b/sysdeps/loongarch/lp64/multiarch/rawmemchr-aligned.S
@@ -1,7 +1,115 @@
+#ifdef _LIBC
+#include
+#include
+#include
+#else
+#include
+#include
+#endif
 #if IS_IN (libc)
 #define RAWMEMCHR_NAME __rawmemchr_aligned
+#else
+#define RAWMEMCHR_NAME __rawmemchr
 #endif
-#include "../rawmemchr.S"
+LEAF(RAWMEMCHR_NAME, 6)
+ andi t1, a0, 0x7  # t1 = start address & 7
+ bstrins.d a0, zero, 2, 0  # round a0 down to an 8-byte boundary
+ lu12i.w a2, 0x01010
+ bstrins.d a1, a1, 15, 8
+
+ ld.d t0, a0, 0
+ slli.d t1, t1, 3  # t1 = start misalignment in bits
+ ori a2, a2, 0x101  # a2 = 0x01010101
+ bstrins.d a1, a1, 31, 16
+
+ li.w t8, -1
+ bstrins.d a1, a1, 63, 32  # a1 = low byte of a1 replicated into all 8 bytes
+ bstrins.d a2, a2, 63, 32  # a2 = 0x0101010101010101
+ sll.d t2, t8, t1
+
+ sll.d t3, a1, t1
+ orn t0, t0, t2  # set every bit below the start offset so those bytes cannot match
+ slli.d a3, a2, 7  # a3 = 0x8080808080808080
+ beqz a1, L(find_zero)  # replicated pattern is zero: use the zero-byte search
+
+ xor t0, t0, t3  # a matching byte becomes 0x00
+ sub.d t1, t0, a2
+ andn t2, a3, t0
+ and t3, t1, t2  # t3 = (x - 0x01..01) & ~x & 0x80..80: nonzero iff x has a zero byte
+
+ bnez t3, L(count_pos)
+ addi.d a0, a0, 8
+
+L(loop):
+ ld.d t0, a0, 0
+ xor t0, t0, a1
+
+ sub.d t1, t0, a2
+ andn t2, a3, t0
+ and t3, t1, t2
+ bnez t3, L(count_pos)
+
+ ld.d t0, a0, 8
+ addi.d a0, a0, 16
+ xor t0, t0, a1
+ sub.d t1, t0, a2
+
+ andn t2, a3, t0
+ and t3, t1, t2
+ beqz t3, L(loop)
+ addi.d a0, a0, -8  # the hit is in the second dword: point a0 back at it
+L(count_pos):
+ ctz.d t0, t3
+ srli.d t0, t0, 3  # bit index of the lowest set flag -> byte index
+ add.d a0, a0, t0
+ jr ra
+
+L(loop_7bit):
+ ld.d t0, a0, 0
+L(find_zero):
+ sub.d t1, t0, a2
+ and t2, t1, a3  # cheaper test without the andn term; can also fire on non-zero bytes, L(more_check) redoes it exactly
+ bnez t2, L(more_check)
+
+ ld.d t0, a0, 8
+ addi.d a0, a0, 16
+ sub.d t1, t0, a2
+ and t2, t1, a3
+
+ beqz t2, L(loop_7bit)
+ addi.d a0, a0, -8
+
+L(more_check):
+ andn t2, a3, t0  # t2 = a3 & ~t0
+ and t3, t1, t2  # t3 = t1 & a3 & ~t0; t1 is whatever the code before this label left there
+ bnez t3, L(count_pos)
+ addi.d a0, a0, 8  # nothing found in this dword: step to the next one
+
+L(loop_8bit):
+ ld.d t0, a0, 0
+
+ sub.d t1, t0, a2  # t1 = t0 - a2
+ andn t2, a3, t0
+ and t3, t1, t2  # t3 = (t0 - a2) & a3 & ~t0
+ bnez t3, L(count_pos)
+
+ ld.d t0, a0, 8
+ addi.d a0, a0, 16
+ sub.d t1, t0, a2
+
+ andn t2, a3, t0
+ and t3, t1, t2
+ beqz t3, L(loop_8bit)
+
+ addi.d a0, a0, -8  # the hit is in the second dword of the pair: point a0 back at it
+ b L(count_pos)
+
+END(RAWMEMCHR_NAME)
+
+#ifdef _LIBC
+weak_alias (__rawmemchr, rawmemchr)  # NOTE(review): names __rawmemchr rather than RAWMEMCHR_NAME - confirm this is intended when IS_IN (libc)
+libc_hidden_builtin_def (__rawmemchr)
+#endif
diff --git a/sysdeps/loongarch/lp64/rawmemchr.S b/sysdeps/loongarch/lp64/rawmemchr.S
deleted file mode 100644
index ef1db7ed..00000000
--- a/sysdeps/loongarch/lp64/rawmemchr.S
+++ /dev/null
@@ -1,113 +0,0 @@
-#ifdef _LIBC
-#include
-#include
-#include
-#else
-#include
-#include
-#endif
-
-#ifndef RAWMEMCHR_NAME
-# define RAWMEMCHR_NAME __rawmemchr
-#endif
-
-
-LEAF(RAWMEMCHR_NAME, 6)
- andi t1, a0, 0x7
- bstrins.d a0, zero, 2, 0
- lu12i.w a2, 0x01010
- bstrins.d a1, a1, 15, 8
-
- ld.d t0, a0, 0
- slli.d t1, t1, 3
- ori a2, a2, 0x101
- bstrins.d a1, a1, 31, 16
-
- li.w t8, -1
- bstrins.d a1, a1, 63, 32
- bstrins.d a2, a2, 63, 32
- sll.d t2, t8, t1
-
- sll.d t3, a1, t1
- orn t0, t0, t2
- slli.d a3, a2, 7
- beqz a1, L(find_zero)
-
- xor t0, t0, t3
- sub.d t1, t0, a2
- andn t2, a3, t0
- and t3, t1, t2
-
- bnez t3, L(count_pos)
- addi.d a0, a0, 8
-
-L(loop):
- ld.d t0, a0, 0
- xor t0, t0, a1
-
- sub.d t1, t0, a2
- andn t2, a3, t0
- and t3, t1, t2
- bnez t3, L(count_pos)
-
- ld.d t0, a0, 8
- addi.d a0, a0, 16
- xor t0, t0, a1
- sub.d t1, t0, a2
-
- andn t2, a3, t0
- and t3, t1, t2
- beqz t3, L(loop)
- addi.d a0, a0, -8
-L(count_pos):
- ctz.d t0, t3
- srli.d t0, t0, 3
- add.d a0, a0, t0
- jr ra
-
-L(loop_7bit):
- ld.d t0, a0, 0
-L(find_zero):
- sub.d t1, t0, a2
- and t2, t1, a3
- bnez t2, L(more_check)
-
- ld.d t0, a0, 8
- addi.d a0, a0, 16
- sub.d t1, t0, a2
- and t2, t1, a3
-
- beqz t2, L(loop_7bit)
- addi.d a0, a0, -8
-
-L(more_check):
- andn t2, a3, t0
- and t3, t1, t2
- bnez t3, L(count_pos)
- addi.d a0, a0, 8
-
-L(loop_8bit):
- ld.d t0, a0, 0
-
- sub.d t1, t0, a2
- andn t2, a3, t0
- and t3, t1, t2
- bnez t3, L(count_pos)
-
- ld.d t0, a0, 8
- addi.d a0, a0, 16
- sub.d t1, t0, a2
-
- andn t2, a3, t0
- and t3, t1, t2
- beqz t3, L(loop_8bit)
-
- addi.d a0, a0, -8
- b L(count_pos)
-
-END(RAWMEMCHR_NAME)
-
-#ifdef _LIBC
-weak_alias (__rawmemchr, rawmemchr)
-libc_hidden_builtin_def (__rawmemchr)
-#endif
-- 
2.33.0