Compare commits
No commits in common. "glibc-2.28-251.0.1.an8.2" and "a23.1" have entirely different histories.
glibc-2.28
...
a23.1
1060 changed files with 15194 additions and 268687 deletions
|
@ -1,36 +1,40 @@
|
|||
commit bd77dd7e73e3530203be1c52c8a29d08270cb25d
|
||||
Author: Florian Weimer <fweimer@redhat.com>
|
||||
Date: Wed Sep 13 14:10:56 2023 +0200
|
||||
From 4ea972b7edd7e36610e8cde18bf7a8149d7bac4f Mon Sep 17 00:00:00 2001
|
||||
From: Florian Weimer <fweimer@redhat.com>
|
||||
Date: Wed, 13 Sep 2023 14:10:56 +0200
|
||||
Subject: [PATCH] CVE-2023-4527: Stack read overflow with large TCP responses
|
||||
in no-aaaa mode
|
||||
|
||||
CVE-2023-4527: Stack read overflow with large TCP responses in no-aaaa mode
|
||||
Without passing alt_dns_packet_buffer, __res_context_search can only
|
||||
store 2048 bytes (what fits into dns_packet_buffer). However,
|
||||
the function returns the total packet size, and the subsequent
|
||||
DNS parsing code in _nss_dns_gethostbyname4_r reads beyond the end
|
||||
of the stack-allocated buffer.
|
||||
|
||||
Without passing alt_dns_packet_buffer, __res_context_search can only
|
||||
store 2048 bytes (what fits into dns_packet_buffer). However,
|
||||
the function returns the total packet size, and the subsequent
|
||||
DNS parsing code in _nss_dns_gethostbyname4_r reads beyond the end
|
||||
of the stack-allocated buffer.
|
||||
Fixes commit f282cdbe7f436c75864e5640a4 ("resolv: Implement no-aaaa
|
||||
stub resolver option") and bug 30842.
|
||||
|
||||
Fixes commit f282cdbe7f436c75864e5640a4 ("resolv: Implement no-aaaa
|
||||
stub resolver option") and bug 30842.
|
||||
|
||||
Conflicts:
|
||||
resolv/nss_dns/dns-host.c
|
||||
(missing dns_packet_buffer cleanup downstream)
|
||||
(cherry picked from commit bd77dd7e73e3530203be1c52c8a29d08270cb25d)
|
||||
---
|
||||
resolv/Makefile | 2 +
|
||||
resolv/nss_dns/dns-host.c | 2 +-
|
||||
resolv/tst-resolv-noaaaa-vc.c | 129 ++++++++++++++++++++++++++++++++++
|
||||
4 files changed, 139 insertions(+), 1 deletion(-)
|
||||
create mode 100644 resolv/tst-resolv-noaaaa-vc.c
|
||||
|
||||
diff --git a/resolv/Makefile b/resolv/Makefile
|
||||
index ab8ad49b5318ad41..4f4eaf060443c128 100644
|
||||
index f8a92c6cff..28cedf49ee 100644
|
||||
--- a/resolv/Makefile
|
||||
+++ b/resolv/Makefile
|
||||
@@ -58,6 +58,7 @@ tests += \
|
||||
tst-resolv-edns \
|
||||
@@ -101,6 +101,7 @@ tests += \
|
||||
tst-resolv-invalid-cname \
|
||||
tst-resolv-network \
|
||||
tst-resolv-noaaaa \
|
||||
+ tst-resolv-noaaaa-vc \
|
||||
tst-resolv-nondecimal \
|
||||
tst-resolv-res_init-multi \
|
||||
tst-resolv-search \
|
||||
@@ -202,6 +203,7 @@ $(objpfx)tst-resolv-res_init-multi: $(objpfx)libresolv.so \
|
||||
$(objpfx)tst-resolv-res_init-thread: $(libdl) $(objpfx)libresolv.so \
|
||||
@@ -291,6 +292,7 @@ $(objpfx)tst-resolv-res_init-thread: $(objpfx)libresolv.so \
|
||||
$(objpfx)tst-resolv-invalid-cname: $(objpfx)libresolv.so \
|
||||
$(shared-thread-library)
|
||||
$(objpfx)tst-resolv-noaaaa: $(objpfx)libresolv.so $(shared-thread-library)
|
||||
+$(objpfx)tst-resolv-noaaaa-vc: $(objpfx)libresolv.so $(shared-thread-library)
|
||||
|
@ -38,21 +42,21 @@ index ab8ad49b5318ad41..4f4eaf060443c128 100644
|
|||
$(objpfx)tst-resolv-qtypes: $(objpfx)libresolv.so $(shared-thread-library)
|
||||
$(objpfx)tst-resolv-rotate: $(objpfx)libresolv.so $(shared-thread-library)
|
||||
diff --git a/resolv/nss_dns/dns-host.c b/resolv/nss_dns/dns-host.c
|
||||
index ff0a0b6f7f1f4703..f678c7d7caa3a026 100644
|
||||
index 9fa81f23c8..227734da5c 100644
|
||||
--- a/resolv/nss_dns/dns-host.c
|
||||
+++ b/resolv/nss_dns/dns-host.c
|
||||
@@ -392,7 +392,7 @@ _nss_dns_gethostbyname4_r (const char *name, struct gaih_addrtuple **pat,
|
||||
else
|
||||
@@ -427,7 +427,7 @@ _nss_dns_gethostbyname4_r (const char *name, struct gaih_addrtuple **pat,
|
||||
{
|
||||
n = __res_context_search (ctx, name, C_IN, T_A,
|
||||
- host_buffer.buf->buf, 2048, NULL,
|
||||
+ host_buffer.buf->buf, 2048, &host_buffer.ptr,
|
||||
NULL, NULL, NULL, NULL);
|
||||
dns_packet_buffer, sizeof (dns_packet_buffer),
|
||||
- NULL, NULL, NULL, NULL, NULL);
|
||||
+ &alt_dns_packet_buffer, NULL, NULL, NULL, NULL);
|
||||
if (n >= 0)
|
||||
status = gaih_getanswer_noaaaa (host_buffer.buf, n,
|
||||
status = gaih_getanswer_noaaaa (alt_dns_packet_buffer, n,
|
||||
&abuf, pat, errnop, herrnop, ttlp);
|
||||
diff --git a/resolv/tst-resolv-noaaaa-vc.c b/resolv/tst-resolv-noaaaa-vc.c
|
||||
new file mode 100644
|
||||
index 0000000000000000..9f5aebd99f2d74a2
|
||||
index 0000000000..9f5aebd99f
|
||||
--- /dev/null
|
||||
+++ b/resolv/tst-resolv-noaaaa-vc.c
|
||||
@@ -0,0 +1,129 @@
|
||||
|
@ -185,3 +189,6 @@ index 0000000000000000..9f5aebd99f2d74a2
|
|||
+}
|
||||
+
|
||||
+#include <support/test-driver.c>
|
||||
--
|
||||
2.39.3
|
||||
|
|
@ -1,4 +1,8 @@
|
|||
Avoid UAF in getcanonname (CVE-2023-4806)
|
||||
From a9728f798ec7f05454c95637ee6581afaa9b487d Mon Sep 17 00:00:00 2001
|
||||
From: Siddhesh Poyarekar <siddhesh@sourceware.org>
|
||||
Date: Fri, 15 Sep 2023 13:51:12 -0400
|
||||
Subject: [PATCH] getaddrinfo: Fix use after free in getcanonname
|
||||
(CVE-2023-4806)
|
||||
|
||||
When an NSS plugin only implements the _gethostbyname2_r and
|
||||
_getcanonname_r callbacks, getaddrinfo could use memory that was freed
|
||||
|
@ -17,60 +21,39 @@ reference in res->at->name. This then gets dereferenced in the
|
|||
getcanonname_r plugin call, resulting in the use after free.
|
||||
|
||||
Fix this by copying h_name over and freeing it at the end. This
|
||||
resolves BZ #30843, which is assigned CVE-2023-4806. This is a minimal
|
||||
RHEL-8-specific fix. Test case differences from upstream:
|
||||
resolves BZ #30843, which is assigned CVE-2023-4806.
|
||||
|
||||
- The test module needs to explicitly link against libnss_files on
|
||||
RHEL-8; upstream libnss_files is built into libc.so.
|
||||
|
||||
- Test module code was adapted to not use the upstream NSS module
|
||||
convenience macros.
|
||||
|
||||
This change is adapted from the following commit from upstream:
|
||||
|
||||
commit 973fe93a5675c42798b2161c6f29c01b0e243994
|
||||
Author: Siddhesh Poyarekar <siddhesh@sourceware.org>
|
||||
Date: Fri Sep 15 13:51:12 2023 -0400
|
||||
|
||||
getaddrinfo: Fix use after free in getcanonname (CVE-2023-4806)
|
||||
|
||||
When an NSS plugin only implements the _gethostbyname2_r and
|
||||
_getcanonname_r callbacks, getaddrinfo could use memory that was freed
|
||||
during tmpbuf resizing, through h_name in a previous query response.
|
||||
|
||||
The backing store for res->at->name when doing a query with
|
||||
gethostbyname3_r or gethostbyname2_r is tmpbuf, which is reallocated in
|
||||
gethosts during the query. For AF_INET6 lookup with AI_ALL |
|
||||
AI_V4MAPPED, gethosts gets called twice, once for a v6 lookup and second
|
||||
for a v4 lookup. In this case, if the first call reallocates tmpbuf
|
||||
enough number of times, resulting in a malloc, th->h_name (that
|
||||
res->at->name refers to) ends up on a heap allocated storage in tmpbuf.
|
||||
Now if the second call to gethosts also causes the plugin callback to
|
||||
return NSS_STATUS_TRYAGAIN, tmpbuf will get freed, resulting in a UAF
|
||||
reference in res->at->name. This then gets dereferenced in the
|
||||
getcanonname_r plugin call, resulting in the use after free.
|
||||
|
||||
Fix this by copying h_name over and freeing it at the end. This
|
||||
resolves BZ #30843, which is assigned CVE-2023-4806.
|
||||
|
||||
Signed-off-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
|
||||
Signed-off-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
|
||||
(cherry picked from commit 973fe93a5675c42798b2161c6f29c01b0e243994)
|
||||
---
|
||||
nss/Makefile | 15 ++++-
|
||||
nss/nss_test_gai_hv2_canonname.c | 56 +++++++++++++++++
|
||||
nss/tst-nss-gai-hv2-canonname.c | 63 +++++++++++++++++++
|
||||
nss/tst-nss-gai-hv2-canonname.h | 1 +
|
||||
.../postclean.req | 0
|
||||
.../tst-nss-gai-hv2-canonname.script | 2 +
|
||||
sysdeps/posix/getaddrinfo.c | 25 +++++---
|
||||
7 files changed, 152 insertions(+), 10 deletions(-)
|
||||
create mode 100644 nss/nss_test_gai_hv2_canonname.c
|
||||
create mode 100644 nss/tst-nss-gai-hv2-canonname.c
|
||||
create mode 100644 nss/tst-nss-gai-hv2-canonname.h
|
||||
create mode 100644 nss/tst-nss-gai-hv2-canonname.root/postclean.req
|
||||
create mode 100644 nss/tst-nss-gai-hv2-canonname.root/tst-nss-gai-hv2-canonname.script
|
||||
|
||||
diff --git a/nss/Makefile b/nss/Makefile
|
||||
index cfb255c6e7a3a4de..5829a2539306ddb5 100644
|
||||
index a978e3927a..f0af87e6f1 100644
|
||||
--- a/nss/Makefile
|
||||
+++ b/nss/Makefile
|
||||
@@ -66,7 +66,8 @@ xtests = bug-erange
|
||||
tests-container = \
|
||||
tst-nss-db-endpwent \
|
||||
tst-nss-db-endgrent \
|
||||
- tst-nss-gai-actions
|
||||
+ tst-nss-gai-actions \
|
||||
+ tst-nss-gai-hv2-canonname
|
||||
@@ -81,6 +81,7 @@ tests-container := \
|
||||
tst-nss-test3 \
|
||||
tst-reload1 \
|
||||
tst-reload2 \
|
||||
+ tst-nss-gai-hv2-canonname \
|
||||
# tests-container
|
||||
|
||||
# Tests which need libdl
|
||||
ifeq (yes,$(build-shared))
|
||||
@@ -132,7 +133,8 @@ routines += $(libnss_files-routines)
|
||||
static-only-routines += $(libnss_files-routines)
|
||||
@@ -144,7 +145,8 @@ libnss_compat-inhibit-o = $(filter-out .os,$(object-suffixes))
|
||||
ifeq ($(build-static-nss),yes)
|
||||
tests-static += tst-nss-static
|
||||
endif
|
||||
-extra-test-objs += nss_test1.os nss_test2.os nss_test_errno.os
|
||||
|
@ -79,7 +62,7 @@ index cfb255c6e7a3a4de..5829a2539306ddb5 100644
|
|||
|
||||
include ../Rules
|
||||
|
||||
@@ -169,12 +171,17 @@ rtld-tests-LDFLAGS += -Wl,--dynamic-list=nss_test.ver
|
||||
@@ -179,12 +181,16 @@ rtld-tests-LDFLAGS += -Wl,--dynamic-list=nss_test.ver
|
||||
libof-nss_test1 = extramodules
|
||||
libof-nss_test2 = extramodules
|
||||
libof-nss_test_errno = extramodules
|
||||
|
@ -91,34 +74,38 @@ index cfb255c6e7a3a4de..5829a2539306ddb5 100644
|
|||
$(objpfx)/libnss_test_errno.so: $(objpfx)nss_test_errno.os $(link-libc-deps)
|
||||
$(build-module)
|
||||
+$(objpfx)/libnss_test_gai_hv2_canonname.so: \
|
||||
+ $(objpfx)nss_test_gai_hv2_canonname.os $(link-libc-deps) \
|
||||
+ $(objpfx)/libnss_files.so
|
||||
+ $(objpfx)nss_test_gai_hv2_canonname.os $(link-libc-deps)
|
||||
+ $(build-module)
|
||||
$(objpfx)nss_test2.os : nss_test1.c
|
||||
ifdef libnss_test1.so-version
|
||||
$(objpfx)/libnss_test1.so$(libnss_test1.so-version): $(objpfx)/libnss_test1.so
|
||||
@@ -187,10 +194,14 @@ endif
|
||||
# Use the nss_files suffix for these objects as well.
|
||||
$(objpfx)/libnss_test1.so$(libnss_files.so-version): $(objpfx)/libnss_test1.so
|
||||
@@ -194,10 +200,14 @@ $(objpfx)/libnss_test2.so$(libnss_files.so-version): $(objpfx)/libnss_test2.so
|
||||
$(objpfx)/libnss_test_errno.so$(libnss_files.so-version): \
|
||||
$(objpfx)/libnss_test_errno.so
|
||||
$(make-link)
|
||||
+$(objpfx)/libnss_test_gai_hv2_canonname.so$(libnss_files.so-version): \
|
||||
+ $(objpfx)/libnss_test_gai_hv2_canonname.so
|
||||
+ $(make-link)
|
||||
$(patsubst %,$(objpfx)%.out,$(tests)) : \
|
||||
$(objpfx)/libnss_test1.so$(libnss_test1.so-version) \
|
||||
$(objpfx)/libnss_test2.so$(libnss_test2.so-version) \
|
||||
$(patsubst %,$(objpfx)%.out,$(tests) $(tests-container)) : \
|
||||
$(objpfx)/libnss_test1.so$(libnss_files.so-version) \
|
||||
$(objpfx)/libnss_test2.so$(libnss_files.so-version) \
|
||||
- $(objpfx)/libnss_test_errno.so$(libnss_files.so-version)
|
||||
+ $(objpfx)/libnss_test_errno.so$(libnss_files.so-version) \
|
||||
+ $(objpfx)/libnss_test_gai_hv2_canonname.so$(libnss_files.so-version)
|
||||
|
||||
ifeq (yes,$(have-thread-library))
|
||||
$(objpfx)tst-cancel-getpwuid_r: $(shared-thread-library)
|
||||
@@ -214,3 +224,4 @@ LDFLAGS-tst-nss-test3 = -Wl,--disable-new-dtags
|
||||
LDFLAGS-tst-nss-test4 = -Wl,--disable-new-dtags
|
||||
LDFLAGS-tst-nss-test5 = -Wl,--disable-new-dtags
|
||||
LDFLAGS-tst-nss-test_errno = -Wl,--disable-new-dtags
|
||||
+LDFLAGS-tst-nss-test_gai_hv2_canonname = -Wl,--disable-new-dtags
|
||||
diff --git a/nss/nss_test_gai_hv2_canonname.c b/nss/nss_test_gai_hv2_canonname.c
|
||||
new file mode 100644
|
||||
index 0000000000000000..4195d7d24fdd5f6d
|
||||
index 0000000000..4439c83c9f
|
||||
--- /dev/null
|
||||
+++ b/nss/nss_test_gai_hv2_canonname.c
|
||||
@@ -0,0 +1,64 @@
|
||||
@@ -0,0 +1,56 @@
|
||||
+/* NSS service provider that only provides gethostbyname2_r.
|
||||
+ Copyright The GNU Toolchain Authors.
|
||||
+ This file is part of the GNU C Library.
|
||||
|
@ -137,7 +124,6 @@ index 0000000000000000..4195d7d24fdd5f6d
|
|||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <netdb.h>
|
||||
+#include <nss.h>
|
||||
+#include <stdlib.h>
|
||||
+#include <string.h>
|
||||
|
@ -145,20 +131,13 @@ index 0000000000000000..4195d7d24fdd5f6d
|
|||
+
|
||||
+/* Catch misnamed and functions. */
|
||||
+#pragma GCC diagnostic error "-Wmissing-prototypes"
|
||||
+NSS_DECLARE_MODULE_FUNCTIONS (test_gai_hv2_canonname)
|
||||
+
|
||||
+extern enum nss_status _nss_files_gethostbyname2_r (const char *, int,
|
||||
+ struct hostent *, char *,
|
||||
+ size_t, int *, int *);
|
||||
+
|
||||
+enum nss_status
|
||||
+_nss_test_gai_hv2_canonname_gethostbyname2_r (const char *, int, struct hostent
|
||||
+ *, char *, size_t, int *, int *);
|
||||
+
|
||||
+enum nss_status
|
||||
+_nss_test_gai_hv2_canonname_getcanonname_r (const char *, char *, size_t, char
|
||||
+ **, int *, int *);
|
||||
+
|
||||
+enum nss_status
|
||||
+_nss_test_gai_hv2_canonname_gethostbyname2_r (const char *name, int af,
|
||||
+ struct hostent *result,
|
||||
+ char *buffer, size_t buflen,
|
||||
|
@ -185,7 +164,7 @@ index 0000000000000000..4195d7d24fdd5f6d
|
|||
+}
|
||||
diff --git a/nss/tst-nss-gai-hv2-canonname.c b/nss/tst-nss-gai-hv2-canonname.c
|
||||
new file mode 100644
|
||||
index 0000000000000000..d5f10c07d6a90773
|
||||
index 0000000000..d5f10c07d6
|
||||
--- /dev/null
|
||||
+++ b/nss/tst-nss-gai-hv2-canonname.c
|
||||
@@ -0,0 +1,63 @@
|
||||
|
@ -254,61 +233,86 @@ index 0000000000000000..d5f10c07d6a90773
|
|||
+#include <support/test-driver.c>
|
||||
diff --git a/nss/tst-nss-gai-hv2-canonname.h b/nss/tst-nss-gai-hv2-canonname.h
|
||||
new file mode 100644
|
||||
index 0000000000000000..14f2a9cb0867dff9
|
||||
index 0000000000..14f2a9cb08
|
||||
--- /dev/null
|
||||
+++ b/nss/tst-nss-gai-hv2-canonname.h
|
||||
@@ -0,0 +1 @@
|
||||
+#define QUERYNAME "test.example.com"
|
||||
diff --git a/nss/tst-nss-gai-hv2-canonname.root/postclean.req b/nss/tst-nss-gai-hv2-canonname.root/postclean.req
|
||||
new file mode 100644
|
||||
index 0000000000000000..e69de29bb2d1d643
|
||||
index 0000000000..e69de29bb2
|
||||
diff --git a/nss/tst-nss-gai-hv2-canonname.root/tst-nss-gai-hv2-canonname.script b/nss/tst-nss-gai-hv2-canonname.root/tst-nss-gai-hv2-canonname.script
|
||||
new file mode 100644
|
||||
index 0000000000000000..31848b4a28524af6
|
||||
index 0000000000..31848b4a28
|
||||
--- /dev/null
|
||||
+++ b/nss/tst-nss-gai-hv2-canonname.root/tst-nss-gai-hv2-canonname.script
|
||||
@@ -0,0 +1,2 @@
|
||||
+cp $B/nss/libnss_test_gai_hv2_canonname.so $L/libnss_test_gai_hv2_canonname.so.2
|
||||
+su
|
||||
diff --git a/sysdeps/posix/getaddrinfo.c b/sysdeps/posix/getaddrinfo.c
|
||||
index 4fa963644af8b7d5..46046504a6858f2e 100644
|
||||
index 5cda9bb072..7a43a3bf4c 100644
|
||||
--- a/sysdeps/posix/getaddrinfo.c
|
||||
+++ b/sysdeps/posix/getaddrinfo.c
|
||||
@@ -233,7 +233,6 @@ convert_hostent_to_gaih_addrtuple (const struct addrinfo *req,
|
||||
@@ -120,6 +120,7 @@ struct gaih_result
|
||||
{
|
||||
struct gaih_addrtuple *at;
|
||||
char *canon;
|
||||
+ char *h_name;
|
||||
bool free_at;
|
||||
bool got_ipv6;
|
||||
};
|
||||
@@ -165,6 +166,7 @@ gaih_result_reset (struct gaih_result *res)
|
||||
if (res->free_at)
|
||||
free (res->at);
|
||||
free (res->canon);
|
||||
+ free (res->h_name);
|
||||
memset (res, 0, sizeof (*res));
|
||||
}
|
||||
|
||||
@@ -203,9 +205,8 @@ gaih_inet_serv (const char *servicename, const struct gaih_typeproto *tp,
|
||||
return 0;
|
||||
}
|
||||
|
||||
-/* Convert struct hostent to a list of struct gaih_addrtuple objects. h_name
|
||||
- is not copied, and the struct hostent object must not be deallocated
|
||||
- prematurely. The new addresses are appended to the tuple array in RES. */
|
||||
+/* Convert struct hostent to a list of struct gaih_addrtuple objects. The new
|
||||
+ addresses are appended to the tuple array in RES. */
|
||||
static bool
|
||||
convert_hostent_to_gaih_addrtuple (const struct addrinfo *req, int family,
|
||||
struct hostent *h, struct gaih_result *res)
|
||||
@@ -238,6 +239,15 @@ convert_hostent_to_gaih_addrtuple (const struct addrinfo *req, int family,
|
||||
res->at = array;
|
||||
res->free_at = true;
|
||||
|
||||
+ /* Duplicate h_name because it may get reclaimed when the underlying storage
|
||||
+ is freed. */
|
||||
+ if (res->h_name == NULL)
|
||||
+ {
|
||||
+ res->h_name = __strdup (h->h_name);
|
||||
+ if (res->h_name == NULL)
|
||||
+ return false;
|
||||
+ }
|
||||
+
|
||||
/* Update the next pointers on reallocation. */
|
||||
for (size_t i = 0; i < old; i++)
|
||||
array[i].next = array + i + 1;
|
||||
@@ -262,7 +272,6 @@ convert_hostent_to_gaih_addrtuple (const struct addrinfo *req, int family,
|
||||
}
|
||||
array[i].next = array + i + 1;
|
||||
}
|
||||
- array[0].name = h->h_name;
|
||||
array[count - 1].next = NULL;
|
||||
|
||||
*result = array;
|
||||
@@ -287,6 +286,18 @@ convert_hostent_to_gaih_addrtuple (const struct addrinfo *req,
|
||||
} \
|
||||
*pat = addrmem; \
|
||||
\
|
||||
+ /* Store h_name so that it survives accidental deallocation when \
|
||||
+ gethosts is called again and tmpbuf gets reallocated. */ \
|
||||
+ if (h_name == NULL && th.h_name != NULL) \
|
||||
+ { \
|
||||
+ h_name = __strdup (th.h_name); \
|
||||
+ if (h_name == NULL) \
|
||||
+ { \
|
||||
+ __resolv_context_put (res_ctx); \
|
||||
+ result = -EAI_SYSTEM; \
|
||||
+ goto free_and_return; \
|
||||
+ } \
|
||||
+ } \
|
||||
if (localcanon != NULL && canon == NULL) \
|
||||
{ \
|
||||
canonbuf = __strdup (localcanon); \
|
||||
@@ -323,15 +334,15 @@ typedef enum nss_status (*nss_getcanonname_r)
|
||||
return true;
|
||||
@@ -324,15 +333,15 @@ gethosts (nss_gethostbyname3_r fct, int family, const char *name,
|
||||
memory allocation failure. The returned string is allocated on the
|
||||
heap; the caller has to free it. */
|
||||
static char *
|
||||
-getcanonname (service_user *nip, struct gaih_addrtuple *at, const char *name)
|
||||
+getcanonname (service_user *nip, const char *hname, const char *name)
|
||||
-getcanonname (nss_action_list nip, struct gaih_addrtuple *at, const char *name)
|
||||
+getcanonname (nss_action_list nip, const char *hname, const char *name)
|
||||
{
|
||||
nss_getcanonname_r cfct = __nss_lookup_function (nip, "getcanonname_r");
|
||||
nss_getcanonname_r *cfct = __nss_lookup_function (nip, "getcanonname_r");
|
||||
char *s = (char *) name;
|
||||
if (cfct != NULL)
|
||||
{
|
||||
|
@ -320,28 +324,15 @@ index 4fa963644af8b7d5..46046504a6858f2e 100644
|
|||
/* If the canonical name cannot be determined, use the passed
|
||||
string. */
|
||||
s = (char *) name;
|
||||
@@ -349,6 +360,7 @@ gaih_inet (const char *name, const struct gaih_service *service,
|
||||
struct gaih_addrtuple *at = NULL;
|
||||
bool got_ipv6 = false;
|
||||
const char *canon = NULL;
|
||||
+ char *h_name = NULL;
|
||||
const char *orig_name = name;
|
||||
|
||||
/* Reserve stack memory for the scratch buffer in the getaddrinfo
|
||||
@@ -919,7 +931,7 @@ gaih_inet (const char *name, const struct gaih_service *service,
|
||||
if ((req->ai_flags & AI_CANONNAME) != 0
|
||||
&& canon == NULL)
|
||||
{
|
||||
- canonbuf = getcanonname (nip, at, name);
|
||||
+ canonbuf = getcanonname (nip, h_name, name);
|
||||
if (canonbuf == NULL)
|
||||
{
|
||||
__resolv_context_enable_inet6
|
||||
@@ -1169,6 +1181,7 @@ gaih_inet (const char *name, const struct gaih_service *service,
|
||||
free ((char *) name);
|
||||
free (addrmem);
|
||||
free (canonbuf);
|
||||
+ free (h_name);
|
||||
|
||||
return result;
|
||||
}
|
||||
@@ -771,7 +780,7 @@ get_nss_addresses (const char *name, const struct addrinfo *req,
|
||||
if ((req->ai_flags & AI_CANONNAME) != 0
|
||||
&& res->canon == NULL)
|
||||
{
|
||||
- char *canonbuf = getcanonname (nip, res->at, name);
|
||||
+ char *canonbuf = getcanonname (nip, res->h_name, name);
|
||||
if (canonbuf == NULL)
|
||||
{
|
||||
__resolv_context_put (res_ctx);
|
||||
--
|
||||
2.39.3
|
||||
|
98
0085-CVE-2023-5156.patch
Normal file
98
0085-CVE-2023-5156.patch
Normal file
|
@ -0,0 +1,98 @@
|
|||
From 856bac55f98dc840e7c27cfa82262b933385de90 Mon Sep 17 00:00:00 2001
|
||||
From: Romain Geissler <romain.geissler@amadeus.com>
|
||||
Date: Mon, 25 Sep 2023 01:21:51 +0100
|
||||
Subject: [PATCH] Fix leak in getaddrinfo introduced by the fix for
|
||||
CVE-2023-4806 [BZ #30843]
|
||||
|
||||
This patch fixes a very recently added leak in getaddrinfo.
|
||||
|
||||
This was assigned CVE-2023-5156.
|
||||
|
||||
Resolves: BZ #30884
|
||||
Related: BZ #30842
|
||||
|
||||
Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
|
||||
(cherry picked from commit ec6b95c3303c700eb89eebeda2d7264cc184a796)
|
||||
---
|
||||
nss/Makefile | 20 ++++++++++++++++++++
|
||||
nss/tst-nss-gai-hv2-canonname.c | 3 +++
|
||||
sysdeps/posix/getaddrinfo.c | 4 +---
|
||||
3 files changed, 24 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/nss/Makefile b/nss/Makefile
|
||||
index f0af87e6f1..7a52c68791 100644
|
||||
--- a/nss/Makefile
|
||||
+++ b/nss/Makefile
|
||||
@@ -148,6 +148,15 @@ endif
|
||||
extra-test-objs += nss_test1.os nss_test2.os nss_test_errno.os \
|
||||
nss_test_gai_hv2_canonname.os
|
||||
|
||||
+ifeq ($(run-built-tests),yes)
|
||||
+ifneq (no,$(PERL))
|
||||
+tests-special += $(objpfx)mtrace-tst-nss-gai-hv2-canonname.out
|
||||
+endif
|
||||
+endif
|
||||
+
|
||||
+generated += mtrace-tst-nss-gai-hv2-canonname.out \
|
||||
+ tst-nss-gai-hv2-canonname.mtrace
|
||||
+
|
||||
include ../Rules
|
||||
|
||||
ifeq (yes,$(have-selinux))
|
||||
@@ -216,6 +225,17 @@ endif
|
||||
$(objpfx)tst-nss-files-alias-leak.out: $(objpfx)/libnss_files.so
|
||||
$(objpfx)tst-nss-files-alias-truncated.out: $(objpfx)/libnss_files.so
|
||||
|
||||
+tst-nss-gai-hv2-canonname-ENV = \
|
||||
+ MALLOC_TRACE=$(objpfx)tst-nss-gai-hv2-canonname.mtrace \
|
||||
+ LD_PRELOAD=$(common-objpfx)/malloc/libc_malloc_debug.so
|
||||
+$(objpfx)mtrace-tst-nss-gai-hv2-canonname.out: \
|
||||
+ $(objpfx)tst-nss-gai-hv2-canonname.out
|
||||
+ { test -r $(objpfx)tst-nss-gai-hv2-canonname.mtrace \
|
||||
+ || ( echo "tst-nss-gai-hv2-canonname.mtrace does not exist"; exit 77; ) \
|
||||
+ && $(common-objpfx)malloc/mtrace \
|
||||
+ $(objpfx)tst-nss-gai-hv2-canonname.mtrace; } > $@; \
|
||||
+ $(evaluate-test)
|
||||
+
|
||||
# Disable DT_RUNPATH on NSS tests so that the glibc internal NSS
|
||||
# functions can load testing NSS modules via DT_RPATH.
|
||||
LDFLAGS-tst-nss-test1 = -Wl,--disable-new-dtags
|
||||
diff --git a/nss/tst-nss-gai-hv2-canonname.c b/nss/tst-nss-gai-hv2-canonname.c
|
||||
index d5f10c07d6..7db53cf09d 100644
|
||||
--- a/nss/tst-nss-gai-hv2-canonname.c
|
||||
+++ b/nss/tst-nss-gai-hv2-canonname.c
|
||||
@@ -21,6 +21,7 @@
|
||||
#include <netdb.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
+#include <mcheck.h>
|
||||
#include <support/check.h>
|
||||
#include <support/xstdio.h>
|
||||
#include "nss/tst-nss-gai-hv2-canonname.h"
|
||||
@@ -41,6 +42,8 @@ static void do_prepare (int a, char **av)
|
||||
static int
|
||||
do_test (void)
|
||||
{
|
||||
+ mtrace ();
|
||||
+
|
||||
__nss_configure_lookup ("hosts", "test_gai_hv2_canonname");
|
||||
|
||||
struct addrinfo hints = {};
|
||||
diff --git a/sysdeps/posix/getaddrinfo.c b/sysdeps/posix/getaddrinfo.c
|
||||
index 7a43a3bf4c..f975dcd2bc 100644
|
||||
--- a/sysdeps/posix/getaddrinfo.c
|
||||
+++ b/sysdeps/posix/getaddrinfo.c
|
||||
@@ -1196,9 +1196,7 @@ free_and_return:
|
||||
if (malloc_name)
|
||||
free ((char *) name);
|
||||
free (addrmem);
|
||||
- if (res.free_at)
|
||||
- free (res.at);
|
||||
- free (res.canon);
|
||||
+ gaih_result_reset (&res);
|
||||
|
||||
return result;
|
||||
}
|
||||
--
|
||||
2.39.3
|
||||
|
181
0087-CVE-2023-6246.patch
Normal file
181
0087-CVE-2023-6246.patch
Normal file
|
@ -0,0 +1,181 @@
|
|||
From d1a83b6767f68b3cb5b4b4ea2617254acd040c82 Mon Sep 17 00:00:00 2001
|
||||
From: Arjun Shankar <arjun@redhat.com>
|
||||
Date: Mon, 15 Jan 2024 17:44:43 +0100
|
||||
Subject: [PATCH] syslog: Fix heap buffer overflow in __vsyslog_internal
|
||||
(CVE-2023-6246)
|
||||
|
||||
__vsyslog_internal did not handle a case where printing a SYSLOG_HEADER
|
||||
containing a long program name failed to update the required buffer
|
||||
size, leading to the allocation and overflow of a too-small buffer on
|
||||
the heap. This commit fixes that. It also adds a new regression test
|
||||
that uses glibc.malloc.check.
|
||||
|
||||
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
|
||||
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
|
||||
Tested-by: Carlos O'Donell <carlos@redhat.com>
|
||||
(cherry picked from commit 6bd0e4efcc78f3c0115e5ea9739a1642807450da)
|
||||
---
|
||||
misc/Makefile | 8 ++-
|
||||
misc/syslog.c | 50 +++++++++++++------
|
||||
misc/tst-syslog-long-progname.c | 39 +++++++++++++++
|
||||
.../postclean.req | 0
|
||||
4 files changed, 82 insertions(+), 15 deletions(-)
|
||||
create mode 100644 misc/tst-syslog-long-progname.c
|
||||
create mode 100644 misc/tst-syslog-long-progname.root/postclean.req
|
||||
|
||||
diff --git a/misc/Makefile b/misc/Makefile
|
||||
index ba8232a0e9..66e9ded8f9 100644
|
||||
--- a/misc/Makefile
|
||||
+++ b/misc/Makefile
|
||||
@@ -115,7 +115,10 @@ tests-special += $(objpfx)tst-error1-mem.out \
|
||||
$(objpfx)tst-allocate_once-mem.out
|
||||
endif
|
||||
|
||||
-tests-container := tst-syslog
|
||||
+tests-container := \
|
||||
+ tst-syslog \
|
||||
+ tst-syslog-long-progname \
|
||||
+ # tests-container
|
||||
|
||||
CFLAGS-select.c += -fexceptions -fasynchronous-unwind-tables
|
||||
CFLAGS-tsearch.c += $(uses-callbacks)
|
||||
@@ -175,6 +178,9 @@ $(objpfx)tst-allocate_once-mem.out: $(objpfx)tst-allocate_once.out
|
||||
$(common-objpfx)malloc/mtrace $(objpfx)tst-allocate_once.mtrace > $@; \
|
||||
$(evaluate-test)
|
||||
|
||||
+tst-syslog-long-progname-ENV = GLIBC_TUNABLES=glibc.malloc.check=3 \
|
||||
+ LD_PRELOAD=libc_malloc_debug.so.0
|
||||
+
|
||||
$(objpfx)tst-select: $(librt)
|
||||
$(objpfx)tst-select-time64: $(librt)
|
||||
$(objpfx)tst-pselect: $(librt)
|
||||
diff --git a/misc/syslog.c b/misc/syslog.c
|
||||
index f67d4b58a4..fe1daf988b 100644
|
||||
--- a/misc/syslog.c
|
||||
+++ b/misc/syslog.c
|
||||
@@ -122,8 +122,9 @@ __vsyslog_internal (int pri, const char *fmt, va_list ap,
|
||||
{
|
||||
/* Try to use a static buffer as an optimization. */
|
||||
char bufs[1024];
|
||||
- char *buf = NULL;
|
||||
- size_t bufsize = 0;
|
||||
+ char *buf = bufs;
|
||||
+ size_t bufsize;
|
||||
+
|
||||
int msgoff;
|
||||
int saved_errno = errno;
|
||||
|
||||
@@ -175,29 +176,50 @@ __vsyslog_internal (int pri, const char *fmt, va_list ap,
|
||||
#define SYSLOG_HEADER_WITHOUT_TS(__pri, __msgoff) \
|
||||
"<%d>: %n", __pri, __msgoff
|
||||
|
||||
- int l;
|
||||
+ int l, vl;
|
||||
if (has_ts)
|
||||
l = __snprintf (bufs, sizeof bufs,
|
||||
SYSLOG_HEADER (pri, timestamp, &msgoff, pid));
|
||||
else
|
||||
l = __snprintf (bufs, sizeof bufs,
|
||||
SYSLOG_HEADER_WITHOUT_TS (pri, &msgoff));
|
||||
+
|
||||
+ char *pos;
|
||||
+ size_t len;
|
||||
+
|
||||
if (0 <= l && l < sizeof bufs)
|
||||
{
|
||||
- va_list apc;
|
||||
- va_copy (apc, ap);
|
||||
+ /* At this point, there is still a chance that we can print the
|
||||
+ remaining part of the log into bufs and use that. */
|
||||
+ pos = bufs + l;
|
||||
+ len = sizeof (bufs) - l;
|
||||
+ }
|
||||
+ else
|
||||
+ {
|
||||
+ buf = NULL;
|
||||
+ /* We already know that bufs is too small to use for this log message.
|
||||
+ The next vsnprintf into bufs is used only to calculate the total
|
||||
+ required buffer length. We will discard bufs contents and allocate
|
||||
+ an appropriately sized buffer later instead. */
|
||||
+ pos = bufs;
|
||||
+ len = sizeof (bufs);
|
||||
+ }
|
||||
|
||||
- /* Restore errno for %m format. */
|
||||
- __set_errno (saved_errno);
|
||||
+ {
|
||||
+ va_list apc;
|
||||
+ va_copy (apc, ap);
|
||||
|
||||
- int vl = __vsnprintf_internal (bufs + l, sizeof bufs - l, fmt, apc,
|
||||
- mode_flags);
|
||||
- if (0 <= vl && vl < sizeof bufs - l)
|
||||
- buf = bufs;
|
||||
- bufsize = l + vl;
|
||||
+ /* Restore errno for %m format. */
|
||||
+ __set_errno (saved_errno);
|
||||
|
||||
- va_end (apc);
|
||||
- }
|
||||
+ vl = __vsnprintf_internal (pos, len, fmt, apc, mode_flags);
|
||||
+
|
||||
+ if (!(0 <= vl && vl < len))
|
||||
+ buf = NULL;
|
||||
+
|
||||
+ bufsize = l + vl;
|
||||
+ va_end (apc);
|
||||
+ }
|
||||
|
||||
if (buf == NULL)
|
||||
{
|
||||
diff --git a/misc/tst-syslog-long-progname.c b/misc/tst-syslog-long-progname.c
|
||||
new file mode 100644
|
||||
index 0000000000..88f37a8a00
|
||||
--- /dev/null
|
||||
+++ b/misc/tst-syslog-long-progname.c
|
||||
@@ -0,0 +1,39 @@
|
||||
+/* Test heap buffer overflow in syslog with long __progname (CVE-2023-6246)
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <syslog.h>
|
||||
+#include <string.h>
|
||||
+
|
||||
+extern char * __progname;
|
||||
+
|
||||
+static int
|
||||
+do_test (void)
|
||||
+{
|
||||
+ char long_progname[2048];
|
||||
+
|
||||
+ memset (long_progname, 'X', sizeof (long_progname) - 1);
|
||||
+ long_progname[sizeof (long_progname) - 1] = '\0';
|
||||
+
|
||||
+ __progname = long_progname;
|
||||
+
|
||||
+ syslog (LOG_INFO, "Hello, World!");
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+#include <support/test-driver.c>
|
||||
diff --git a/misc/tst-syslog-long-progname.root/postclean.req b/misc/tst-syslog-long-progname.root/postclean.req
|
||||
new file mode 100644
|
||||
index 0000000000..e69de29bb2
|
||||
--
|
||||
2.39.3
|
||||
|
106
0088-CVE-2023-6779.patch
Normal file
106
0088-CVE-2023-6779.patch
Normal file
|
@ -0,0 +1,106 @@
|
|||
From 2bc9d7c002bdac38b5c2a3f11b78e309d7765b83 Mon Sep 17 00:00:00 2001
|
||||
From: Arjun Shankar <arjun@redhat.com>
|
||||
Date: Mon, 15 Jan 2024 17:44:44 +0100
|
||||
Subject: [PATCH] syslog: Fix heap buffer overflow in __vsyslog_internal
|
||||
(CVE-2023-6779)
|
||||
|
||||
__vsyslog_internal used the return value of snprintf/vsnprintf to
|
||||
calculate buffer sizes for memory allocation. If these functions (for
|
||||
any reason) failed and returned -1, the resulting buffer would be too
|
||||
small to hold output. This commit fixes that.
|
||||
|
||||
All snprintf/vsnprintf calls are checked for negative return values and
|
||||
the function silently returns upon encountering them.
|
||||
|
||||
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
|
||||
(cherry picked from commit 7e5a0c286da33159d47d0122007aac016f3e02cd)
|
||||
---
|
||||
misc/syslog.c | 39 ++++++++++++++++++++++++++++-----------
|
||||
1 file changed, 28 insertions(+), 11 deletions(-)
|
||||
|
||||
diff --git a/misc/syslog.c b/misc/syslog.c
|
||||
index fe1daf988b..3108ae9134 100644
|
||||
--- a/misc/syslog.c
|
||||
+++ b/misc/syslog.c
|
||||
@@ -183,11 +183,13 @@ __vsyslog_internal (int pri, const char *fmt, va_list ap,
|
||||
else
|
||||
l = __snprintf (bufs, sizeof bufs,
|
||||
SYSLOG_HEADER_WITHOUT_TS (pri, &msgoff));
|
||||
+ if (l < 0)
|
||||
+ goto out;
|
||||
|
||||
char *pos;
|
||||
size_t len;
|
||||
|
||||
- if (0 <= l && l < sizeof bufs)
|
||||
+ if (l < sizeof bufs)
|
||||
{
|
||||
/* At this point, there is still a chance that we can print the
|
||||
remaining part of the log into bufs and use that. */
|
||||
@@ -213,12 +215,15 @@ __vsyslog_internal (int pri, const char *fmt, va_list ap,
|
||||
__set_errno (saved_errno);
|
||||
|
||||
vl = __vsnprintf_internal (pos, len, fmt, apc, mode_flags);
|
||||
+ va_end (apc);
|
||||
+
|
||||
+ if (vl < 0)
|
||||
+ goto out;
|
||||
|
||||
- if (!(0 <= vl && vl < len))
|
||||
+ if (vl >= len)
|
||||
buf = NULL;
|
||||
|
||||
bufsize = l + vl;
|
||||
- va_end (apc);
|
||||
}
|
||||
|
||||
if (buf == NULL)
|
||||
@@ -229,25 +234,37 @@ __vsyslog_internal (int pri, const char *fmt, va_list ap,
|
||||
/* Tell the cancellation handler to free this buffer. */
|
||||
clarg.buf = buf;
|
||||
|
||||
+ int cl;
|
||||
if (has_ts)
|
||||
- __snprintf (buf, l + 1,
|
||||
- SYSLOG_HEADER (pri, timestamp, &msgoff, pid));
|
||||
+ cl = __snprintf (buf, l + 1,
|
||||
+ SYSLOG_HEADER (pri, timestamp, &msgoff, pid));
|
||||
else
|
||||
- __snprintf (buf, l + 1,
|
||||
- SYSLOG_HEADER_WITHOUT_TS (pri, &msgoff));
|
||||
+ cl = __snprintf (buf, l + 1,
|
||||
+ SYSLOG_HEADER_WITHOUT_TS (pri, &msgoff));
|
||||
+ if (cl != l)
|
||||
+ goto out;
|
||||
|
||||
va_list apc;
|
||||
va_copy (apc, ap);
|
||||
- __vsnprintf_internal (buf + l, bufsize - l + 1, fmt, apc,
|
||||
- mode_flags);
|
||||
+ cl = __vsnprintf_internal (buf + l, bufsize - l + 1, fmt, apc,
|
||||
+ mode_flags);
|
||||
va_end (apc);
|
||||
+
|
||||
+ if (cl != vl)
|
||||
+ goto out;
|
||||
}
|
||||
else
|
||||
{
|
||||
+ int bl;
|
||||
/* Nothing much to do but emit an error message. */
|
||||
- bufsize = __snprintf (bufs, sizeof bufs,
|
||||
- "out of memory[%d]", __getpid ());
|
||||
+ bl = __snprintf (bufs, sizeof bufs,
|
||||
+ "out of memory[%d]", __getpid ());
|
||||
+ if (bl < 0 || bl >= sizeof bufs)
|
||||
+ goto out;
|
||||
+
|
||||
+ bufsize = bl;
|
||||
buf = bufs;
|
||||
+ msgoff = 0;
|
||||
}
|
||||
}
|
||||
|
||||
--
|
||||
2.39.3
|
||||
|
41
0089-CVE-2023-6780.patch
Normal file
41
0089-CVE-2023-6780.patch
Normal file
|
@ -0,0 +1,41 @@
|
|||
From b9b7d6a27aa0632f334352fa400771115b3c69b7 Mon Sep 17 00:00:00 2001
|
||||
From: Arjun Shankar <arjun@redhat.com>
|
||||
Date: Mon, 15 Jan 2024 17:44:45 +0100
|
||||
Subject: [PATCH] syslog: Fix integer overflow in __vsyslog_internal
|
||||
(CVE-2023-6780)
|
||||
|
||||
__vsyslog_internal calculated a buffer size by adding two integers, but
|
||||
did not first check if the addition would overflow. This commit fixes
|
||||
that.
|
||||
|
||||
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
|
||||
Tested-by: Carlos O'Donell <carlos@redhat.com>
|
||||
(cherry picked from commit ddf542da94caf97ff43cc2875c88749880b7259b)
|
||||
---
|
||||
misc/syslog.c | 3 ++-
|
||||
1 file changed, 2 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/misc/syslog.c b/misc/syslog.c
|
||||
index 3108ae9134..9336036666 100644
|
||||
--- a/misc/syslog.c
|
||||
+++ b/misc/syslog.c
|
||||
@@ -41,6 +41,7 @@ static char sccsid[] = "@(#)syslog.c 8.4 (Berkeley) 3/18/94";
|
||||
#include <sys/uio.h>
|
||||
#include <sys/un.h>
|
||||
#include <syslog.h>
|
||||
+#include <limits.h>
|
||||
|
||||
static int LogType = SOCK_DGRAM; /* type of socket connection */
|
||||
static int LogFile = -1; /* fd for log */
|
||||
@@ -217,7 +218,7 @@ __vsyslog_internal (int pri, const char *fmt, va_list ap,
|
||||
vl = __vsnprintf_internal (pos, len, fmt, apc, mode_flags);
|
||||
va_end (apc);
|
||||
|
||||
- if (vl < 0)
|
||||
+ if (vl < 0 || vl >= INT_MAX - l)
|
||||
goto out;
|
||||
|
||||
if (vl >= len)
|
||||
--
|
||||
2.39.3
|
||||
|
|
@ -1,34 +1,42 @@
|
|||
Author: Charles Fol <folcharles@gmail.com>
|
||||
Date: Thu Mar 28 12:25:38 2024 -0300
|
||||
From f9dc609e06b1136bb0408be9605ce7973a767ada Mon Sep 17 00:00:00 2001
|
||||
From: Charles Fol <folcharles@gmail.com>
|
||||
Date: Thu, 28 Mar 2024 12:25:38 -0300
|
||||
Subject: [PATCH] iconv: ISO-2022-CN-EXT: fix out-of-bound writes when writing
|
||||
escape sequence (CVE-2024-2961)
|
||||
|
||||
iconv: ISO-2022-CN-EXT: fix out-of-bound writes when writing escape sequence (CVE-2024-2961)
|
||||
ISO-2022-CN-EXT uses escape sequences to indicate character set changes
|
||||
(as specified by RFC 1922). While the SOdesignation has the expected
|
||||
bounds checks, neither SS2designation nor SS3designation have its;
|
||||
allowing a write overflow of 1, 2, or 3 bytes with fixed values:
|
||||
'$+I', '$+J', '$+K', '$+L', '$+M', or '$*H'.
|
||||
|
||||
ISO-2022-CN-EXT uses escape sequences to indicate character set changes
|
||||
(as specified by RFC 1922). While the SOdesignation has the expected
|
||||
bounds checks, neither SS2designation nor SS3designation have its;
|
||||
allowing a write overflow of 1, 2, or 3 bytes with fixed values:
|
||||
'$+I', '$+J', '$+K', '$+L', '$+M', or '$*H'.
|
||||
Checked on aarch64-linux-gnu.
|
||||
|
||||
Checked on aarch64-linux-gnu.
|
||||
|
||||
Co-authored-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
|
||||
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
|
||||
Tested-by: Carlos O'Donell <carlos@redhat.com>
|
||||
Co-authored-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
|
||||
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
|
||||
Tested-by: Carlos O'Donell <carlos@redhat.com>
|
||||
---
|
||||
iconvdata/Makefile | 5 +-
|
||||
iconvdata/iso-2022-cn-ext.c | 12 +++
|
||||
iconvdata/tst-iconv-iso-2022-cn-ext.c | 128 ++++++++++++++++++++++++++
|
||||
3 files changed, 144 insertions(+), 1 deletion(-)
|
||||
create mode 100644 iconvdata/tst-iconv-iso-2022-cn-ext.c
|
||||
|
||||
diff --git a/iconvdata/Makefile b/iconvdata/Makefile
|
||||
index 646e2ccd11478646..c959758a90ed954f 100644
|
||||
index ea019ce5c0..7196a8744b 100644
|
||||
--- a/iconvdata/Makefile
|
||||
+++ b/iconvdata/Makefile
|
||||
@@ -75,7 +75,7 @@ ifeq (yes,$(build-shared))
|
||||
@@ -75,7 +75,8 @@ ifeq (yes,$(build-shared))
|
||||
tests = bug-iconv1 bug-iconv2 tst-loading tst-e2big tst-iconv4 bug-iconv4 \
|
||||
tst-iconv6 bug-iconv5 bug-iconv6 tst-iconv7 bug-iconv8 bug-iconv9 \
|
||||
bug-iconv10 bug-iconv11 bug-iconv12 bug-iconv13 bug-iconv14 \
|
||||
- bug-iconv15
|
||||
+ bug-iconv15 tst-iconv-iso-2022-cn-ext
|
||||
bug-iconv10 bug-iconv11 bug-iconv12 tst-iconv-big5-hkscs-to-2ucs4 \
|
||||
- bug-iconv13 bug-iconv14 bug-iconv15
|
||||
+ bug-iconv13 bug-iconv14 bug-iconv15 \
|
||||
+ tst-iconv-iso-2022-cn-ext
|
||||
ifeq ($(have-thread-library),yes)
|
||||
tests += bug-iconv3
|
||||
endif
|
||||
@@ -325,6 +325,8 @@ $(objpfx)bug-iconv14.out: $(addprefix $(objpfx), $(gconv-modules)) \
|
||||
@@ -330,6 +331,8 @@ $(objpfx)bug-iconv14.out: $(addprefix $(objpfx), $(gconv-modules)) \
|
||||
$(addprefix $(objpfx),$(modules.so))
|
||||
$(objpfx)bug-iconv15.out: $(addprefix $(objpfx), $(gconv-modules)) \
|
||||
$(addprefix $(objpfx),$(modules.so))
|
||||
|
@ -38,10 +46,10 @@ index 646e2ccd11478646..c959758a90ed954f 100644
|
|||
$(objpfx)iconv-test.out: run-iconv-test.sh \
|
||||
$(addprefix $(objpfx), $(gconv-modules)) \
|
||||
diff --git a/iconvdata/iso-2022-cn-ext.c b/iconvdata/iso-2022-cn-ext.c
|
||||
index c21a7187b4d7808e..bd9493c12d95070b 100644
|
||||
index b34c8a36f4..cce29b1969 100644
|
||||
--- a/iconvdata/iso-2022-cn-ext.c
|
||||
+++ b/iconvdata/iso-2022-cn-ext.c
|
||||
@@ -575,6 +575,12 @@ DIAG_IGNORE_Os_NEEDS_COMMENT (5, "-Wmaybe-uninitialized");
|
||||
@@ -574,6 +574,12 @@ DIAG_IGNORE_Os_NEEDS_COMMENT (5, "-Wmaybe-uninitialized");
|
||||
{ \
|
||||
const char *escseq; \
|
||||
\
|
||||
|
@ -54,7 +62,7 @@ index c21a7187b4d7808e..bd9493c12d95070b 100644
|
|||
assert (used == CNS11643_2_set); /* XXX */ \
|
||||
escseq = "*H"; \
|
||||
*outptr++ = ESC; \
|
||||
@@ -588,6 +594,12 @@ DIAG_IGNORE_Os_NEEDS_COMMENT (5, "-Wmaybe-uninitialized");
|
||||
@@ -587,6 +593,12 @@ DIAG_IGNORE_Os_NEEDS_COMMENT (5, "-Wmaybe-uninitialized");
|
||||
{ \
|
||||
const char *escseq; \
|
||||
\
|
||||
|
@ -69,7 +77,7 @@ index c21a7187b4d7808e..bd9493c12d95070b 100644
|
|||
*outptr++ = ESC; \
|
||||
diff --git a/iconvdata/tst-iconv-iso-2022-cn-ext.c b/iconvdata/tst-iconv-iso-2022-cn-ext.c
|
||||
new file mode 100644
|
||||
index 0000000000000000..96a8765fd5369681
|
||||
index 0000000000..96a8765fd5
|
||||
--- /dev/null
|
||||
+++ b/iconvdata/tst-iconv-iso-2022-cn-ext.c
|
||||
@@ -0,0 +1,128 @@
|
||||
|
@ -155,7 +163,7 @@ index 0000000000000000..96a8765fd5369681
|
|||
+
|
||||
+ /* Same as before for SS2designation. */
|
||||
+ {
|
||||
+ char inbuf[] = "㴽 \xe3\xb4\xbd";
|
||||
+ char inbuf[] = "ã´½ \xe3\xb4\xbd";
|
||||
+
|
||||
+ for (int i = 0; i < 14; i++)
|
||||
+ {
|
||||
|
@ -175,7 +183,7 @@ index 0000000000000000..96a8765fd5369681
|
|||
+
|
||||
+ /* Same as before for SS3designation. */
|
||||
+ {
|
||||
+ char inbuf[] = "劄 \xe5\x8a\x84";
|
||||
+ char inbuf[] = "å \xe5\x8a\x84";
|
||||
+
|
||||
+ for (int i = 0; i < 14; i++)
|
||||
+ {
|
||||
|
@ -201,3 +209,5 @@ index 0000000000000000..96a8765fd5369681
|
|||
+}
|
||||
+
|
||||
+#include <support/test-driver.c>
|
||||
--
|
||||
2.39.3
|
|
@ -1,38 +1,18 @@
|
|||
This patch was developed under embargo and cannot reference an upstream
|
||||
commit. To find the associated commit please review the upstream git
|
||||
log for CVE-2023-4911 to identify the relevant commits.
|
||||
From 27e06a423cf06845a0515ab767a109b31b34724a Mon Sep 17 00:00:00 2001
|
||||
From: Chunmei Xu <xuchunmei@linux.alibaba.com>
|
||||
Date: Tue, 5 Mar 2024 14:12:15 +0800
|
||||
Subject: [PATCH 1/1] fix CVE-2023-4911
|
||||
|
||||
Author: Siddhesh Poyarekar <siddhesh@sourceware.org>
|
||||
Date: Tue Sep 19 18:39:32 2023 -0400
|
||||
|
||||
tunables: Terminate if end of input is reached (CVE-2023-4911)
|
||||
|
||||
The string parsing routine may end up writing beyond bounds of tunestr
|
||||
if the input tunable string is malformed, of the form name=name=val.
|
||||
This gets processed twice, first as name=name=val and next as name=val,
|
||||
resulting in tunestr being name=name=val:name=val, thus overflowing
|
||||
tunestr.
|
||||
|
||||
Terminate the parsing loop at the first instance itself so that tunestr
|
||||
does not overflow.
|
||||
|
||||
This also fixes up tst-env-setuid-tunables to actually handle failures
|
||||
correct and add new tests to validate the fix for this CVE.
|
||||
|
||||
Signed-off-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
|
||||
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
|
||||
|
||||
Conflicts:
|
||||
NEWS
|
||||
(Dropped)
|
||||
elf/tst-env-setuid-tunables.c
|
||||
(Trivial conflict at HAVE_TUNABLES)
|
||||
---
|
||||
elf/dl-tunables.c | 16 ++++++++-------
|
||||
elf/tst-env-setuid-tunables.c | 37 +++++++++++++++++++++++++++--------
|
||||
2 files changed, 38 insertions(+), 15 deletions(-)
|
||||
|
||||
diff --git a/elf/dl-tunables.c b/elf/dl-tunables.c
|
||||
index 3c84809d44381241..2c878e08ea197b29 100644
|
||||
index 62b7332d..0edfade8 100644
|
||||
--- a/elf/dl-tunables.c
|
||||
+++ b/elf/dl-tunables.c
|
||||
@@ -193,11 +193,7 @@ parse_tunables (char *tunestr, char *valstring)
|
||||
@@ -180,11 +180,7 @@ parse_tunables (char *tunestr, char *valstring)
|
||||
/* If we reach the end of the string before getting a valid name-value
|
||||
pair, bail out. */
|
||||
if (p[len] == '\0')
|
||||
|
@ -45,7 +25,7 @@ index 3c84809d44381241..2c878e08ea197b29 100644
|
|||
|
||||
/* We did not find a valid name-value pair before encountering the
|
||||
colon. */
|
||||
@@ -257,9 +253,16 @@ parse_tunables (char *tunestr, char *valstring)
|
||||
@@ -244,9 +240,15 @@ parse_tunables (char *tunestr, char *valstring)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -54,7 +34,6 @@ index 3c84809d44381241..2c878e08ea197b29 100644
|
|||
+ /* We reached the end while processing the tunable string. */
|
||||
+ if (p[len] == '\0')
|
||||
+ break;
|
||||
+
|
||||
+ p += len + 1;
|
||||
}
|
||||
+
|
||||
|
@ -62,13 +41,13 @@ index 3c84809d44381241..2c878e08ea197b29 100644
|
|||
+ if (__libc_enable_secure)
|
||||
+ tunestr[off] = '\0';
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Enable the glibc.malloc.check tunable in SETUID/SETGID programs only when
|
||||
diff --git a/elf/tst-env-setuid-tunables.c b/elf/tst-env-setuid-tunables.c
|
||||
index 0b9b075c40598c6f..8b0861c4ad853040 100644
|
||||
index 7dfb0e07..2364d162 100644
|
||||
--- a/elf/tst-env-setuid-tunables.c
|
||||
+++ b/elf/tst-env-setuid-tunables.c
|
||||
@@ -52,6 +52,8 @@ const char *teststrings[] =
|
||||
@@ -50,6 +50,8 @@ const char *teststrings[] =
|
||||
"glibc.malloc.perturb=0x800:not_valid.malloc.check=2:glibc.malloc.mmap_threshold=4096",
|
||||
"glibc.not_valid.check=2:glibc.malloc.mmap_threshold=4096",
|
||||
"not_valid.malloc.check=2:glibc.malloc.mmap_threshold=4096",
|
||||
|
@ -77,7 +56,7 @@ index 0b9b075c40598c6f..8b0861c4ad853040 100644
|
|||
"glibc.malloc.garbage=2:glibc.maoc.mmap_threshold=4096:glibc.malloc.check=2",
|
||||
"glibc.malloc.check=4:glibc.malloc.garbage=2:glibc.maoc.mmap_threshold=4096",
|
||||
":glibc.malloc.garbage=2:glibc.malloc.check=1",
|
||||
@@ -70,6 +72,8 @@ const char *resultstrings[] =
|
||||
@@ -68,6 +70,8 @@ const char *resultstrings[] =
|
||||
"glibc.malloc.perturb=0x800:glibc.malloc.mmap_threshold=4096",
|
||||
"glibc.malloc.mmap_threshold=4096",
|
||||
"glibc.malloc.mmap_threshold=4096",
|
||||
|
@ -86,10 +65,10 @@ index 0b9b075c40598c6f..8b0861c4ad853040 100644
|
|||
"",
|
||||
"",
|
||||
"",
|
||||
@@ -84,11 +88,18 @@ test_child (int off)
|
||||
@@ -81,11 +85,17 @@ test_child (int off)
|
||||
{
|
||||
const char *val = getenv ("GLIBC_TUNABLES");
|
||||
|
||||
#if HAVE_TUNABLES
|
||||
+ printf (" [%d] GLIBC_TUNABLES is %s\n", off, val);
|
||||
+ fflush (stdout);
|
||||
if (val != NULL && strcmp (val, resultstrings[off]) == 0)
|
||||
|
@ -98,27 +77,26 @@ index 0b9b075c40598c6f..8b0861c4ad853040 100644
|
|||
if (val != NULL)
|
||||
- printf ("[%d] Unexpected GLIBC_TUNABLES VALUE %s\n", off, val);
|
||||
+ printf (" [%d] Unexpected GLIBC_TUNABLES VALUE %s, expected %s\n",
|
||||
+ off, val, resultstrings[off]);
|
||||
+ off, val, resultstrings[off]);
|
||||
+ else
|
||||
+ printf (" [%d] GLIBC_TUNABLES environment variable absent\n", off);
|
||||
+
|
||||
+ fflush (stdout);
|
||||
|
||||
return 1;
|
||||
#else
|
||||
@@ -117,21 +128,26 @@ do_test (int argc, char **argv)
|
||||
}
|
||||
@@ -106,31 +116,42 @@ do_test (int argc, char **argv)
|
||||
if (ret != 0)
|
||||
exit (1);
|
||||
|
||||
- exit (EXIT_SUCCESS);
|
||||
+ /* Special return code to make sure that the child executed all the way
|
||||
+ through. */
|
||||
+ through. */
|
||||
+ exit (42);
|
||||
}
|
||||
else
|
||||
{
|
||||
- int ret = 0;
|
||||
-
|
||||
|
||||
/* Spawn tests. */
|
||||
for (int i = 0; i < array_length (teststrings); i++)
|
||||
{
|
||||
|
@ -130,28 +108,32 @@ index 0b9b075c40598c6f..8b0861c4ad853040 100644
|
|||
+ fflush (stdout);
|
||||
if (setenv ("GLIBC_TUNABLES", teststrings[i], 1) != 0)
|
||||
- exit (1);
|
||||
+ {
|
||||
-
|
||||
+ {
|
||||
+ printf (" [%d] Failed to set GLIBC_TUNABLES: %m", i);
|
||||
+ support_record_failure ();
|
||||
+ continue;
|
||||
+ }
|
||||
|
||||
+ continue;
|
||||
+ }
|
||||
+
|
||||
int status = support_capture_subprogram_self_sgid (buf);
|
||||
|
||||
@@ -139,9 +155,14 @@ do_test (int argc, char **argv)
|
||||
/* Bail out early if unsupported. */
|
||||
if (WEXITSTATUS (status) == EXIT_UNSUPPORTED)
|
||||
return EXIT_UNSUPPORTED;
|
||||
|
||||
- ret |= status;
|
||||
+ if (WEXITSTATUS (status) != 42)
|
||||
+ {
|
||||
+ printf (" [%d] child failed with status %d\n", i,
|
||||
+ WEXITSTATUS (status));
|
||||
+ support_record_failure ();
|
||||
+ }
|
||||
+ {
|
||||
+ printf (" [%d] child failed with status %d\n", i,
|
||||
+ WEXITSTATUS (status));
|
||||
+ support_record_failure ();
|
||||
+ }
|
||||
}
|
||||
- return ret;
|
||||
+ return 0;
|
||||
}
|
||||
}
|
||||
|
||||
--
|
||||
2.41.0
|
||||
|
6058
ChangeLog.old
6058
ChangeLog.old
File diff suppressed because it is too large
Load diff
40
Decrease-value-of-arch_minimum_kernel-with-LoongArch.patch
Normal file
40
Decrease-value-of-arch_minimum_kernel-with-LoongArch.patch
Normal file
|
@ -0,0 +1,40 @@
|
|||
From 2c8dfc45a8009e5110a9d2148b62d802e989fde7 Mon Sep 17 00:00:00 2001
|
||||
From: ticat_fp <fanpeng@loongson.cn>
|
||||
Date: Thu, 29 Feb 2024 15:58:31 +0800
|
||||
Subject: [PATCH] Decrease value of arch_minimum_kernel with LoongArch
|
||||
|
||||
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
|
||||
---
|
||||
sysdeps/unix/sysv/linux/loongarch/configure | 2 +-
|
||||
sysdeps/unix/sysv/linux/loongarch/configure.ac | 2 +-
|
||||
2 files changed, 2 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/unix/sysv/linux/loongarch/configure b/sysdeps/unix/sysv/linux/loongarch/configure
|
||||
index 0d1159e9..851b2285 100644
|
||||
--- a/sysdeps/unix/sysv/linux/loongarch/configure
|
||||
+++ b/sysdeps/unix/sysv/linux/loongarch/configure
|
||||
@@ -1,7 +1,7 @@
|
||||
# This file is generated from configure.ac by Autoconf. DO NOT EDIT!
|
||||
# Local configure fragment for sysdeps/unix/sysv/linux/loongarch.
|
||||
|
||||
-arch_minimum_kernel=5.19.0
|
||||
+arch_minimum_kernel=4.19.0
|
||||
|
||||
libc_cv_loongarch_int_abi=no
|
||||
|
||||
diff --git a/sysdeps/unix/sysv/linux/loongarch/configure.ac b/sysdeps/unix/sysv/linux/loongarch/configure.ac
|
||||
index 04e9150a..00815c2f 100644
|
||||
--- a/sysdeps/unix/sysv/linux/loongarch/configure.ac
|
||||
+++ b/sysdeps/unix/sysv/linux/loongarch/configure.ac
|
||||
@@ -2,7 +2,7 @@ sinclude(./aclocal.m4)dnl Autoconf lossage
|
||||
GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory.
|
||||
# Local configure fragment for sysdeps/unix/sysv/linux/loongarch.
|
||||
|
||||
-arch_minimum_kernel=5.19.0
|
||||
+arch_minimum_kernel=4.19.0
|
||||
|
||||
libc_cv_loongarch_int_abi=no
|
||||
AC_EGREP_CPP(4 8 8, [__SIZEOF_INT__ __SIZEOF_LONG__ __SIZEOF_POINTER__
|
||||
--
|
||||
2.33.0
|
||||
|
|
@ -1,34 +0,0 @@
|
|||
From c5de7c407853b807e8d0c764e6325bb1311f39cd Mon Sep 17 00:00:00 2001
|
||||
From: Xing Li <lixing@loongson.cn>
|
||||
Date: Tue, 4 Jul 2023 15:10:03 +0800
|
||||
Subject: [PATCH 2/2] Fix tst-cancel21.c to suit kernel struct sigcontext
|
||||
change. * nptl/tst-cancel21.c
|
||||
|
||||
---
|
||||
nptl/tst-cancel21.c | 4 ++--
|
||||
1 file changed, 2 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/nptl/tst-cancel21.c b/nptl/tst-cancel21.c
|
||||
index b10fdbc1..a3653f21 100644
|
||||
--- a/nptl/tst-cancel21.c
|
||||
+++ b/nptl/tst-cancel21.c
|
||||
@@ -217,14 +217,14 @@ static int
|
||||
do_test (void)
|
||||
{
|
||||
stack_t ss;
|
||||
- ss.ss_sp = malloc (2 * SIGSTKSZ);
|
||||
+ ss.ss_sp = malloc (4 * SIGSTKSZ);
|
||||
if (ss.ss_sp == NULL)
|
||||
{
|
||||
puts ("failed to allocate alternate stack");
|
||||
return 1;
|
||||
}
|
||||
ss.ss_flags = 0;
|
||||
- ss.ss_size = 2 * SIGSTKSZ;
|
||||
+ ss.ss_size = 4 * SIGSTKSZ;
|
||||
if (sigaltstack (&ss, NULL) < 0)
|
||||
{
|
||||
printf ("sigaltstack failed %m\n");
|
||||
--
|
||||
2.27.0
|
||||
|
499
LoongArch-Add-glibc.cpu.hwcap-support.patch
Normal file
499
LoongArch-Add-glibc.cpu.hwcap-support.patch
Normal file
|
@ -0,0 +1,499 @@
|
|||
From 8923e4e9c79e672fd6b3b89aba598a60d5c01211 Mon Sep 17 00:00:00 2001
|
||||
From: caiyinyu <caiyinyu@loongson.cn>
|
||||
Date: Fri, 15 Sep 2023 17:35:19 +0800
|
||||
Subject: [PATCH 25/29] LoongArch: Add glibc.cpu.hwcap support.
|
||||
|
||||
Key Points:
|
||||
1. On lasx & lsx platforms, We must use _dl_runtime_{profile, resolve}_{lsx, lasx}
|
||||
to save vector registers.
|
||||
2. Via "tunables", users can choose str/mem_{lasx,lsx,unaligned} functions with
|
||||
`export GLIBC_TUNABLES=glibc.cpu.hwcaps=LASX,...`.
|
||||
Note: glibc.cpu.hwcaps doesn't affect _dl_runtime_{profile, resolve}_{lsx, lasx}
|
||||
selection.
|
||||
|
||||
Usage Notes:
|
||||
1. Only valid inputs: LASX, LSX, UAL. Case-sensitive, comma-separated, no spaces.
|
||||
2. Example: `export GLIBC_TUNABLES=glibc.cpu.hwcaps=LASX,UAL` turns on LASX & UAL.
|
||||
Unmentioned features turn off. With default ifunc: lasx > lsx > unaligned >
|
||||
aligned > generic, effect is: lasx > unaligned > aligned > generic; lsx off.
|
||||
3. Incorrect GLIBC_TUNABLES settings will show error messages.
|
||||
For example: On lsx platforms, you cannot enable lasx features. If you do
|
||||
that, you will get error messages.
|
||||
4. Valid input examples:
|
||||
- GLIBC_TUNABLES=glibc.cpu.hwcaps=LASX: lasx > aligned > generic.
|
||||
- GLIBC_TUNABLES=glibc.cpu.hwcaps=LSX,UAL: lsx > unaligned > aligned > generic.
|
||||
- GLIBC_TUNABLES=glibc.cpu.hwcaps=LASX,UAL,LASX,UAL,LSX,LASX,UAL: Repetitions
|
||||
allowed but not recommended. Results in: lasx > lsx > unaligned > aligned >
|
||||
generic.
|
||||
|
||||
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
|
||||
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
|
||||
---
|
||||
sysdeps/loongarch/Makefile | 4 +
|
||||
sysdeps/loongarch/Versions | 5 ++
|
||||
sysdeps/loongarch/cpu-tunables.c | 89 +++++++++++++++++++
|
||||
sysdeps/loongarch/dl-get-cpu-features.c | 25 ++++++
|
||||
sysdeps/loongarch/dl-machine.h | 27 +++++-
|
||||
sysdeps/loongarch/dl-tunables.list | 25 ++++++
|
||||
.../unix/sysv/linux/loongarch/cpu-features.c | 29 ++++++
|
||||
.../unix/sysv/linux/loongarch/cpu-features.h | 18 +++-
|
||||
.../unix/sysv/linux/loongarch/dl-procinfo.c | 60 +++++++++++++
|
||||
sysdeps/unix/sysv/linux/loongarch/dl-sysdep.c | 21 +++++
|
||||
.../unix/sysv/linux/loongarch/libc-start.c | 34 +++++++
|
||||
11 files changed, 329 insertions(+), 8 deletions(-)
|
||||
create mode 100644 sysdeps/loongarch/Versions
|
||||
create mode 100644 sysdeps/loongarch/cpu-tunables.c
|
||||
create mode 100644 sysdeps/loongarch/dl-get-cpu-features.c
|
||||
create mode 100644 sysdeps/loongarch/dl-tunables.list
|
||||
create mode 100644 sysdeps/unix/sysv/linux/loongarch/cpu-features.c
|
||||
create mode 100644 sysdeps/unix/sysv/linux/loongarch/dl-procinfo.c
|
||||
create mode 100644 sysdeps/unix/sysv/linux/loongarch/dl-sysdep.c
|
||||
create mode 100644 sysdeps/unix/sysv/linux/loongarch/libc-start.c
|
||||
|
||||
diff --git a/sysdeps/loongarch/Makefile b/sysdeps/loongarch/Makefile
|
||||
index 43d2f583..30a1f4a8 100644
|
||||
--- a/sysdeps/loongarch/Makefile
|
||||
+++ b/sysdeps/loongarch/Makefile
|
||||
@@ -6,6 +6,10 @@ ifeq ($(subdir),elf)
|
||||
gen-as-const-headers += dl-link.sym
|
||||
endif
|
||||
|
||||
+ifeq ($(subdir),elf)
|
||||
+ sysdep-dl-routines += dl-get-cpu-features
|
||||
+endif
|
||||
+
|
||||
# LoongArch's assembler also needs to know about PIC as it changes the
|
||||
# definition of some assembler macros.
|
||||
ASFLAGS-.os += $(pic-ccflag)
|
||||
diff --git a/sysdeps/loongarch/Versions b/sysdeps/loongarch/Versions
|
||||
new file mode 100644
|
||||
index 00000000..33ae2cc0
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/Versions
|
||||
@@ -0,0 +1,5 @@
|
||||
+ld {
|
||||
+ GLIBC_PRIVATE {
|
||||
+ _dl_larch_get_cpu_features;
|
||||
+ }
|
||||
+}
|
||||
diff --git a/sysdeps/loongarch/cpu-tunables.c b/sysdeps/loongarch/cpu-tunables.c
|
||||
new file mode 100644
|
||||
index 00000000..8e9fab93
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/cpu-tunables.c
|
||||
@@ -0,0 +1,89 @@
|
||||
+/* LoongArch CPU feature tuning.
|
||||
+ This file is part of the GNU C Library.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <http://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+# include <stdbool.h>
|
||||
+# include <stdint.h>
|
||||
+# include <unistd.h> /* Get STDOUT_FILENO for _dl_printf. */
|
||||
+# include <elf/dl-tunables.h>
|
||||
+# include <string.h>
|
||||
+# include <cpu-features.h>
|
||||
+# include <ldsodefs.h>
|
||||
+# include <sys/auxv.h>
|
||||
+
|
||||
+# define HWCAP_LOONGARCH_IFUNC \
|
||||
+ (HWCAP_LOONGARCH_UAL | HWCAP_LOONGARCH_LSX | HWCAP_LOONGARCH_LASX)
|
||||
+
|
||||
+# define CHECK_GLIBC_IFUNC_CPU_OFF(f, name, len) \
|
||||
+ _Static_assert (sizeof (#name) - 1 == len, #name " != " #len); \
|
||||
+ if (!memcmp (f, #name, len) && \
|
||||
+ (GLRO (dl_hwcap) & HWCAP_LOONGARCH_##name)) \
|
||||
+ { \
|
||||
+ hwcap |= (HWCAP_LOONGARCH_##name | (~HWCAP_LOONGARCH_IFUNC)); \
|
||||
+ break; \
|
||||
+ } \
|
||||
+
|
||||
+attribute_hidden
|
||||
+void
|
||||
+TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
|
||||
+{
|
||||
+ const char *p = valp->strval;
|
||||
+ size_t len;
|
||||
+ unsigned long hwcap = 0;
|
||||
+ const char *c;
|
||||
+
|
||||
+ do {
|
||||
+ for (c = p; *c != ','; c++)
|
||||
+ if (*c == '\0')
|
||||
+ break;
|
||||
+
|
||||
+ len = c - p;
|
||||
+
|
||||
+ switch(len)
|
||||
+ {
|
||||
+ default:
|
||||
+ _dl_fatal_printf (
|
||||
+ "The valid values of glibc.cpu.hwcaps is UAL, LASX, LSX!!\n"
|
||||
+ );
|
||||
+ break;
|
||||
+ case 3:
|
||||
+ {
|
||||
+ CHECK_GLIBC_IFUNC_CPU_OFF (p, LSX, 3);
|
||||
+ CHECK_GLIBC_IFUNC_CPU_OFF (p, UAL, 3);
|
||||
+ _dl_fatal_printf (
|
||||
+ "Some features are invalid or not supported on this machine!!\n"
|
||||
+ "The valid values of glibc.cpu.hwcaps is UAL, LASX, LSX!!\n"
|
||||
+ );
|
||||
+ }
|
||||
+ break;
|
||||
+ case 4:
|
||||
+ {
|
||||
+ CHECK_GLIBC_IFUNC_CPU_OFF (p, LASX, 4);
|
||||
+ _dl_fatal_printf (
|
||||
+ "Some features are invalid or not supported on this machine!!\n"
|
||||
+ "The valid values of glibc.cpu.hwcaps is UAL, LASX, LSX!!\n"
|
||||
+ );
|
||||
+ }
|
||||
+ break;
|
||||
+ }
|
||||
+
|
||||
+ p += len + 1;
|
||||
+ }
|
||||
+ while (*c != '\0');
|
||||
+
|
||||
+ GLRO (dl_larch_cpu_features).hwcap &= hwcap;
|
||||
+}
|
||||
diff --git a/sysdeps/loongarch/dl-get-cpu-features.c b/sysdeps/loongarch/dl-get-cpu-features.c
|
||||
new file mode 100644
|
||||
index 00000000..7cd9bc15
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/dl-get-cpu-features.c
|
||||
@@ -0,0 +1,25 @@
|
||||
+/* Define _dl_larch_get_cpu_features.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+
|
||||
+#include <ldsodefs.h>
|
||||
+
|
||||
+const struct cpu_features *
|
||||
+_dl_larch_get_cpu_features (void)
|
||||
+{
|
||||
+ return &GLRO(dl_larch_cpu_features);
|
||||
+}
|
||||
diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h
|
||||
index 57913cef..b395a928 100644
|
||||
--- a/sysdeps/loongarch/dl-machine.h
|
||||
+++ b/sysdeps/loongarch/dl-machine.h
|
||||
@@ -29,6 +29,8 @@
|
||||
#include <dl-static-tls.h>
|
||||
#include <dl-machine-rel.h>
|
||||
|
||||
+#include <cpu-features.c>
|
||||
+
|
||||
#ifndef _RTLD_PROLOGUE
|
||||
# define _RTLD_PROLOGUE(entry) \
|
||||
".globl\t" __STRING (entry) "\n\t" \
|
||||
@@ -53,6 +55,23 @@
|
||||
#define ELF_MACHINE_NO_REL 1
|
||||
#define ELF_MACHINE_NO_RELA 0
|
||||
|
||||
+#define DL_PLATFORM_INIT dl_platform_init ()
|
||||
+
|
||||
+static inline void __attribute__ ((unused))
|
||||
+dl_platform_init (void)
|
||||
+{
|
||||
+ if (GLRO(dl_platform) != NULL && *GLRO(dl_platform) == '\0')
|
||||
+ /* Avoid an empty string which would disturb us. */
|
||||
+ GLRO(dl_platform) = NULL;
|
||||
+
|
||||
+#ifdef SHARED
|
||||
+ /* init_cpu_features has been called early from __libc_start_main in
|
||||
+ static executable. */
|
||||
+ init_cpu_features (&GLRO(dl_larch_cpu_features));
|
||||
+#endif
|
||||
+}
|
||||
+
|
||||
+
|
||||
/* Return nonzero iff ELF header is compatible with the running host. */
|
||||
static inline int
|
||||
elf_machine_matches_host (const ElfW (Ehdr) *ehdr)
|
||||
@@ -290,9 +309,9 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
|
||||
if (profile != 0)
|
||||
{
|
||||
#if !defined __loongarch_soft_float
|
||||
- if (SUPPORT_LASX)
|
||||
+ if (RTLD_SUPPORT_LASX)
|
||||
gotplt[0] = (ElfW(Addr)) &_dl_runtime_profile_lasx;
|
||||
- else if (SUPPORT_LSX)
|
||||
+ else if (RTLD_SUPPORT_LSX)
|
||||
gotplt[0] = (ElfW(Addr)) &_dl_runtime_profile_lsx;
|
||||
else
|
||||
#endif
|
||||
@@ -310,9 +329,9 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
|
||||
indicated by the offset on the stack, and then jump to
|
||||
the resolved address. */
|
||||
#if !defined __loongarch_soft_float
|
||||
- if (SUPPORT_LASX)
|
||||
+ if (RTLD_SUPPORT_LASX)
|
||||
gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve_lasx;
|
||||
- else if (SUPPORT_LSX)
|
||||
+ else if (RTLD_SUPPORT_LSX)
|
||||
gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve_lsx;
|
||||
else
|
||||
#endif
|
||||
diff --git a/sysdeps/loongarch/dl-tunables.list b/sysdeps/loongarch/dl-tunables.list
|
||||
new file mode 100644
|
||||
index 00000000..66b34275
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/dl-tunables.list
|
||||
@@ -0,0 +1,25 @@
|
||||
+# LoongArch specific tunables.
|
||||
+# Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+# This file is part of the GNU C Library.
|
||||
+
|
||||
+# The GNU C Library is free software; you can redistribute it and/or
|
||||
+# modify it under the terms of the GNU Lesser General Public
|
||||
+# License as published by the Free Software Foundation; either
|
||||
+# version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+# The GNU C Library is distributed in the hope that it will be useful,
|
||||
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+# Lesser General Public License for more details.
|
||||
+
|
||||
+# You should have received a copy of the GNU Lesser General Public
|
||||
+# License along with the GNU C Library; if not, see
|
||||
+# <http://www.gnu.org/licenses/>.
|
||||
+
|
||||
+glibc {
|
||||
+ cpu {
|
||||
+ hwcaps {
|
||||
+ type: STRING
|
||||
+ }
|
||||
+ }
|
||||
+}
|
||||
diff --git a/sysdeps/unix/sysv/linux/loongarch/cpu-features.c b/sysdeps/unix/sysv/linux/loongarch/cpu-features.c
|
||||
new file mode 100644
|
||||
index 00000000..1290c4ce
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/unix/sysv/linux/loongarch/cpu-features.c
|
||||
@@ -0,0 +1,29 @@
|
||||
+/* Initialize CPU feature data. LoongArch64 version.
|
||||
+ This file is part of the GNU C Library.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <http://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <cpu-features.h>
|
||||
+#include <elf/dl-hwcaps.h>
|
||||
+#include <elf/dl-tunables.h>
|
||||
+extern void TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *) attribute_hidden;
|
||||
+
|
||||
+static inline void
|
||||
+init_cpu_features (struct cpu_features *cpu_features)
|
||||
+{
|
||||
+ GLRO (dl_larch_cpu_features).hwcap = GLRO (dl_hwcap);
|
||||
+ TUNABLE_GET (glibc, cpu, hwcaps, tunable_val_t *, TUNABLE_CALLBACK (set_hwcaps));
|
||||
+}
|
||||
diff --git a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
|
||||
index d1a280a5..450963ce 100644
|
||||
--- a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
|
||||
+++ b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
|
||||
@@ -19,13 +19,23 @@
|
||||
#ifndef _CPU_FEATURES_LOONGARCH64_H
|
||||
#define _CPU_FEATURES_LOONGARCH64_H
|
||||
|
||||
+#include <stdint.h>
|
||||
#include <sys/auxv.h>
|
||||
|
||||
-#define SUPPORT_UAL (GLRO (dl_hwcap) & HWCAP_LOONGARCH_UAL)
|
||||
-#define SUPPORT_LSX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LSX)
|
||||
-#define SUPPORT_LASX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LASX)
|
||||
+struct cpu_features
|
||||
+ {
|
||||
+ uint64_t hwcap;
|
||||
+ };
|
||||
|
||||
+/* Get a pointer to the CPU features structure. */
|
||||
+extern const struct cpu_features *_dl_larch_get_cpu_features (void)
|
||||
+ __attribute__ ((pure));
|
||||
+
|
||||
+#define SUPPORT_UAL (GLRO (dl_larch_cpu_features).hwcap & HWCAP_LOONGARCH_UAL)
|
||||
+#define SUPPORT_LSX (GLRO (dl_larch_cpu_features).hwcap & HWCAP_LOONGARCH_LSX)
|
||||
+#define SUPPORT_LASX (GLRO (dl_larch_cpu_features).hwcap & HWCAP_LOONGARCH_LASX)
|
||||
+#define RTLD_SUPPORT_LSX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LSX)
|
||||
+#define RTLD_SUPPORT_LASX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LASX)
|
||||
#define INIT_ARCH()
|
||||
|
||||
#endif /* _CPU_FEATURES_LOONGARCH64_H */
|
||||
-
|
||||
diff --git a/sysdeps/unix/sysv/linux/loongarch/dl-procinfo.c b/sysdeps/unix/sysv/linux/loongarch/dl-procinfo.c
|
||||
new file mode 100644
|
||||
index 00000000..6217fda9
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/unix/sysv/linux/loongarch/dl-procinfo.c
|
||||
@@ -0,0 +1,60 @@
|
||||
+/* Data for LoongArch64 version of processor capability information.
|
||||
+ Linux version.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <http://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+/* If anything should be added here check whether the size of each string
|
||||
+ is still ok with the given array size.
|
||||
+
|
||||
+ All the #ifdefs in the definitions are quite irritating but
|
||||
+ necessary if we want to avoid duplicating the information. There
|
||||
+ are three different modes:
|
||||
+
|
||||
+ - PROCINFO_DECL is defined. This means we are only interested in
|
||||
+ declarations.
|
||||
+
|
||||
+ - PROCINFO_DECL is not defined:
|
||||
+
|
||||
+ + if SHARED is defined the file is included in an array
|
||||
+ initializer. The .element = { ... } syntax is needed.
|
||||
+
|
||||
+ + if SHARED is not defined a normal array initialization is
|
||||
+ needed.
|
||||
+ */
|
||||
+
|
||||
+#ifndef PROCINFO_CLASS
|
||||
+# define PROCINFO_CLASS
|
||||
+#endif
|
||||
+
|
||||
+#if !IS_IN (ldconfig)
|
||||
+# if !defined PROCINFO_DECL && defined SHARED
|
||||
+ ._dl_larch_cpu_features
|
||||
+# else
|
||||
+PROCINFO_CLASS struct cpu_features _dl_larch_cpu_features
|
||||
+# endif
|
||||
+# ifndef PROCINFO_DECL
|
||||
+= { }
|
||||
+# endif
|
||||
+# if !defined SHARED || defined PROCINFO_DECL
|
||||
+;
|
||||
+# else
|
||||
+,
|
||||
+# endif
|
||||
+#endif
|
||||
+
|
||||
+#undef PROCINFO_DECL
|
||||
+#undef PROCINFO_CLASS
|
||||
diff --git a/sysdeps/unix/sysv/linux/loongarch/dl-sysdep.c b/sysdeps/unix/sysv/linux/loongarch/dl-sysdep.c
|
||||
new file mode 100644
|
||||
index 00000000..455fd71a
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/unix/sysv/linux/loongarch/dl-sysdep.c
|
||||
@@ -0,0 +1,21 @@
|
||||
+/* Operating system support for run-time dynamic linker. LoongArch version.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <http://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <config.h>
|
||||
+#include <sysdeps/loongarch/cpu-tunables.c>
|
||||
+#include <sysdeps/unix/sysv/linux/dl-sysdep.c>
|
||||
diff --git a/sysdeps/unix/sysv/linux/loongarch/libc-start.c b/sysdeps/unix/sysv/linux/loongarch/libc-start.c
|
||||
new file mode 100644
|
||||
index 00000000..f1346ece
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/unix/sysv/linux/loongarch/libc-start.c
|
||||
@@ -0,0 +1,34 @@
|
||||
+/* Override csu/libc-start.c on LoongArch64.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <http://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#ifndef SHARED
|
||||
+
|
||||
+/* Mark symbols hidden in static PIE for early self relocation to work. */
|
||||
+# if BUILD_PIE_DEFAULT
|
||||
+# pragma GCC visibility push(hidden)
|
||||
+# endif
|
||||
+
|
||||
+# include <ldsodefs.h>
|
||||
+# include <cpu-features.c>
|
||||
+
|
||||
+extern struct cpu_features _dl_larch_cpu_features;
|
||||
+
|
||||
+# define ARCH_INIT_CPU_FEATURES() init_cpu_features (&_dl_larch_cpu_features)
|
||||
+
|
||||
+#endif
|
||||
+#include <csu/libc-start.c>
|
||||
--
|
||||
2.33.0
|
||||
|
485
LoongArch-Add-ifunc-support-for-memchr-aligned-lsx-l.patch
Normal file
485
LoongArch-Add-ifunc-support-for-memchr-aligned-lsx-l.patch
Normal file
|
@ -0,0 +1,485 @@
|
|||
From 3ee56bbc56faa7b85a6513340db4a4fdd6ce709d Mon Sep 17 00:00:00 2001
|
||||
From: dengjianbo <dengjianbo@loongson.cn>
|
||||
Date: Mon, 28 Aug 2023 10:08:36 +0800
|
||||
Subject: [PATCH 15/29] LoongArch: Add ifunc support for memchr{aligned, lsx,
|
||||
lasx}
|
||||
|
||||
According to glibc memchr microbenchmark, this implementation could reduce
|
||||
the runtime as following:
|
||||
|
||||
Name Percent of runtime reduced
|
||||
memchr-lasx 37%-83%
|
||||
memchr-lsx 30%-66%
|
||||
memchr-aligned 0%-15%
|
||||
|
||||
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
|
||||
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
|
||||
---
|
||||
sysdeps/loongarch/lp64/multiarch/Makefile | 3 +
|
||||
.../lp64/multiarch/ifunc-impl-list.c | 7 ++
|
||||
.../loongarch/lp64/multiarch/ifunc-memchr.h | 40 ++++++
|
||||
.../loongarch/lp64/multiarch/memchr-aligned.S | 95 ++++++++++++++
|
||||
.../loongarch/lp64/multiarch/memchr-lasx.S | 117 ++++++++++++++++++
|
||||
sysdeps/loongarch/lp64/multiarch/memchr-lsx.S | 102 +++++++++++++++
|
||||
sysdeps/loongarch/lp64/multiarch/memchr.c | 37 ++++++
|
||||
7 files changed, 401 insertions(+)
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-memchr.h
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/memchr-aligned.S
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/memchr-lasx.S
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/memchr-lsx.S
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/memchr.c
|
||||
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile
|
||||
index 64416b02..2f4802cf 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/Makefile
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/Makefile
|
||||
@@ -24,5 +24,8 @@ sysdep_routines += \
|
||||
rawmemchr-aligned \
|
||||
rawmemchr-lsx \
|
||||
rawmemchr-lasx \
|
||||
+ memchr-aligned \
|
||||
+ memchr-lsx \
|
||||
+ memchr-lasx \
|
||||
# sysdep_routines
|
||||
endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
|
||||
index 3db9af14..a567b9cf 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
|
||||
@@ -102,5 +102,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_aligned)
|
||||
)
|
||||
|
||||
+ IFUNC_IMPL (i, name, memchr,
|
||||
+#if !defined __loongarch_soft_float
|
||||
+ IFUNC_IMPL_ADD (array, i, memchr, SUPPORT_LASX, __memchr_lasx)
|
||||
+ IFUNC_IMPL_ADD (array, i, memchr, SUPPORT_LSX, __memchr_lsx)
|
||||
+#endif
|
||||
+ IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_aligned)
|
||||
+ )
|
||||
return i;
|
||||
}
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-memchr.h b/sysdeps/loongarch/lp64/multiarch/ifunc-memchr.h
|
||||
new file mode 100644
|
||||
index 00000000..9060ccd5
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-memchr.h
|
||||
@@ -0,0 +1,40 @@
|
||||
+/* Common definition for memchr ifunc selections.
|
||||
+ All versions must be listed in ifunc-impl-list.c.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <ldsodefs.h>
|
||||
+#include <ifunc-init.h>
|
||||
+
|
||||
+#if !defined __loongarch_soft_float
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lasx) attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden;
|
||||
+#endif
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden;
|
||||
+
|
||||
+static inline void *
|
||||
+IFUNC_SELECTOR (void)
|
||||
+{
|
||||
+#if !defined __loongarch_soft_float
|
||||
+ if (SUPPORT_LASX)
|
||||
+ return OPTIMIZE (lasx);
|
||||
+ else if (SUPPORT_LSX)
|
||||
+ return OPTIMIZE (lsx);
|
||||
+ else
|
||||
+#endif
|
||||
+ return OPTIMIZE (aligned);
|
||||
+}
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/memchr-aligned.S b/sysdeps/loongarch/lp64/multiarch/memchr-aligned.S
|
||||
new file mode 100644
|
||||
index 00000000..81d0d004
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/memchr-aligned.S
|
||||
@@ -0,0 +1,95 @@
|
||||
+/* Optimized memchr implementation using basic LoongArch instructions.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library. If not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdep.h>
|
||||
+#include <sys/regdef.h>
|
||||
+#include <sys/asm.h>
|
||||
+
|
||||
+#if IS_IN (libc)
|
||||
+# define MEMCHR_NAME __memchr_aligned
|
||||
+#else
|
||||
+# define MEMCHR_NAME memchr
|
||||
+#endif
|
||||
+
|
||||
+LEAF(MEMCHR_NAME, 6)
|
||||
+ beqz a2, L(out)
|
||||
+ andi t1, a0, 0x7
|
||||
+ add.d a5, a0, a2
|
||||
+ bstrins.d a0, zero, 2, 0
|
||||
+
|
||||
+ ld.d t0, a0, 0
|
||||
+ bstrins.d a1, a1, 15, 8
|
||||
+ lu12i.w a3, 0x01010
|
||||
+ slli.d t2, t1, 03
|
||||
+
|
||||
+ bstrins.d a1, a1, 31, 16
|
||||
+ ori a3, a3, 0x101
|
||||
+ li.d t7, -1
|
||||
+ li.d t8, 8
|
||||
+
|
||||
+ bstrins.d a1, a1, 63, 32
|
||||
+ bstrins.d a3, a3, 63, 32
|
||||
+ sll.d t2, t7, t2
|
||||
+ xor t0, t0, a1
|
||||
+
|
||||
+
|
||||
+ addi.d a6, a5, -1
|
||||
+ slli.d a4, a3, 7
|
||||
+ sub.d t1, t8, t1
|
||||
+ orn t0, t0, t2
|
||||
+
|
||||
+ sub.d t2, t0, a3
|
||||
+ andn t3, a4, t0
|
||||
+ bstrins.d a6, zero, 2, 0
|
||||
+ and t0, t2, t3
|
||||
+
|
||||
+ bgeu t1, a2, L(end)
|
||||
+L(loop):
|
||||
+ bnez t0, L(found)
|
||||
+ ld.d t1, a0, 8
|
||||
+ xor t0, t1, a1
|
||||
+
|
||||
+ addi.d a0, a0, 8
|
||||
+ sub.d t2, t0, a3
|
||||
+ andn t3, a4, t0
|
||||
+ and t0, t2, t3
|
||||
+
|
||||
+
|
||||
+ bne a0, a6, L(loop)
|
||||
+L(end):
|
||||
+ sub.d t1, a5, a6
|
||||
+ ctz.d t0, t0
|
||||
+ srli.d t0, t0, 3
|
||||
+
|
||||
+ sltu t1, t0, t1
|
||||
+ add.d a0, a0, t0
|
||||
+ maskeqz a0, a0, t1
|
||||
+ jr ra
|
||||
+
|
||||
+L(found):
|
||||
+ ctz.d t0, t0
|
||||
+ srli.d t0, t0, 3
|
||||
+ add.d a0, a0, t0
|
||||
+ jr ra
|
||||
+
|
||||
+L(out):
|
||||
+ move a0, zero
|
||||
+ jr ra
|
||||
+END(MEMCHR_NAME)
|
||||
+
|
||||
+libc_hidden_builtin_def (MEMCHR_NAME)
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S
|
||||
new file mode 100644
|
||||
index 00000000..a26cdf48
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S
|
||||
@@ -0,0 +1,117 @@
|
||||
+/* Optimized memchr implementation using LoongArch LASX instructions.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library. If not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdep.h>
|
||||
+#include <sys/regdef.h>
|
||||
+#include <sys/asm.h>
|
||||
+
|
||||
+#if IS_IN (libc) && !defined __loongarch_soft_float
|
||||
+
|
||||
+# define MEMCHR __memchr_lasx
|
||||
+
|
||||
+LEAF(MEMCHR, 6)
|
||||
+ beqz a2, L(ret0)
|
||||
+ add.d a3, a0, a2
|
||||
+ andi t0, a0, 0x3f
|
||||
+ bstrins.d a0, zero, 5, 0
|
||||
+
|
||||
+ xvld xr0, a0, 0
|
||||
+ xvld xr1, a0, 32
|
||||
+ li.d t1, -1
|
||||
+ li.d t2, 64
|
||||
+
|
||||
+ xvreplgr2vr.b xr2, a1
|
||||
+ sll.d t3, t1, t0
|
||||
+ sub.d t2, t2, t0
|
||||
+ xvseq.b xr0, xr0, xr2
|
||||
+
|
||||
+ xvseq.b xr1, xr1, xr2
|
||||
+ xvmsknz.b xr0, xr0
|
||||
+ xvmsknz.b xr1, xr1
|
||||
+ xvpickve.w xr3, xr0, 4
|
||||
+
|
||||
+
|
||||
+ xvpickve.w xr4, xr1, 4
|
||||
+ vilvl.h vr0, vr3, vr0
|
||||
+ vilvl.h vr1, vr4, vr1
|
||||
+ vilvl.w vr0, vr1, vr0
|
||||
+
|
||||
+ movfr2gr.d t0, fa0
|
||||
+ and t0, t0, t3
|
||||
+ bgeu t2, a2, L(end)
|
||||
+ bnez t0, L(found)
|
||||
+
|
||||
+ addi.d a4, a3, -1
|
||||
+ bstrins.d a4, zero, 5, 0
|
||||
+L(loop):
|
||||
+ xvld xr0, a0, 64
|
||||
+ xvld xr1, a0, 96
|
||||
+
|
||||
+ addi.d a0, a0, 64
|
||||
+ xvseq.b xr0, xr0, xr2
|
||||
+ xvseq.b xr1, xr1, xr2
|
||||
+ beq a0, a4, L(out)
|
||||
+
|
||||
+
|
||||
+ xvmax.bu xr3, xr0, xr1
|
||||
+ xvseteqz.v fcc0, xr3
|
||||
+ bcnez fcc0, L(loop)
|
||||
+ xvmsknz.b xr0, xr0
|
||||
+
|
||||
+ xvmsknz.b xr1, xr1
|
||||
+ xvpickve.w xr3, xr0, 4
|
||||
+ xvpickve.w xr4, xr1, 4
|
||||
+ vilvl.h vr0, vr3, vr0
|
||||
+
|
||||
+ vilvl.h vr1, vr4, vr1
|
||||
+ vilvl.w vr0, vr1, vr0
|
||||
+ movfr2gr.d t0, fa0
|
||||
+L(found):
|
||||
+ ctz.d t1, t0
|
||||
+
|
||||
+ add.d a0, a0, t1
|
||||
+ jr ra
|
||||
+L(ret0):
|
||||
+ move a0, zero
|
||||
+ jr ra
|
||||
+
|
||||
+
|
||||
+L(out):
|
||||
+ xvmsknz.b xr0, xr0
|
||||
+ xvmsknz.b xr1, xr1
|
||||
+ xvpickve.w xr3, xr0, 4
|
||||
+ xvpickve.w xr4, xr1, 4
|
||||
+
|
||||
+ vilvl.h vr0, vr3, vr0
|
||||
+ vilvl.h vr1, vr4, vr1
|
||||
+ vilvl.w vr0, vr1, vr0
|
||||
+ movfr2gr.d t0, fa0
|
||||
+
|
||||
+L(end):
|
||||
+ sub.d t2, zero, a3
|
||||
+ srl.d t1, t1, t2
|
||||
+ and t0, t0, t1
|
||||
+ ctz.d t1, t0
|
||||
+
|
||||
+ add.d a0, a0, t1
|
||||
+ maskeqz a0, a0, t0
|
||||
+ jr ra
|
||||
+END(MEMCHR)
|
||||
+
|
||||
+libc_hidden_builtin_def (MEMCHR)
|
||||
+#endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S
|
||||
new file mode 100644
|
||||
index 00000000..a73ecd25
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S
|
||||
@@ -0,0 +1,102 @@
|
||||
+/* Optimized memchr implementation using LoongArch LSX instructions.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library. If not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdep.h>
|
||||
+#include <sys/regdef.h>
|
||||
+#include <sys/asm.h>
|
||||
+
|
||||
+#if IS_IN (libc) && !defined __loongarch_soft_float
|
||||
+
|
||||
+# define MEMCHR __memchr_lsx
|
||||
+
|
||||
+LEAF(MEMCHR, 6)
|
||||
+ beqz a2, L(ret0)
|
||||
+ add.d a3, a0, a2
|
||||
+ andi t0, a0, 0x1f
|
||||
+ bstrins.d a0, zero, 4, 0
|
||||
+
|
||||
+ vld vr0, a0, 0
|
||||
+ vld vr1, a0, 16
|
||||
+ li.d t1, -1
|
||||
+ li.d t2, 32
|
||||
+
|
||||
+ vreplgr2vr.b vr2, a1
|
||||
+ sll.d t3, t1, t0
|
||||
+ sub.d t2, t2, t0
|
||||
+ vseq.b vr0, vr0, vr2
|
||||
+
|
||||
+ vseq.b vr1, vr1, vr2
|
||||
+ vmsknz.b vr0, vr0
|
||||
+ vmsknz.b vr1, vr1
|
||||
+ vilvl.h vr0, vr1, vr0
|
||||
+
|
||||
+
|
||||
+ movfr2gr.s t0, fa0
|
||||
+ and t0, t0, t3
|
||||
+ bgeu t2, a2, L(end)
|
||||
+ bnez t0, L(found)
|
||||
+
|
||||
+ addi.d a4, a3, -1
|
||||
+ bstrins.d a4, zero, 4, 0
|
||||
+L(loop):
|
||||
+ vld vr0, a0, 32
|
||||
+ vld vr1, a0, 48
|
||||
+
|
||||
+ addi.d a0, a0, 32
|
||||
+ vseq.b vr0, vr0, vr2
|
||||
+ vseq.b vr1, vr1, vr2
|
||||
+ beq a0, a4, L(out)
|
||||
+
|
||||
+ vmax.bu vr3, vr0, vr1
|
||||
+ vseteqz.v fcc0, vr3
|
||||
+ bcnez fcc0, L(loop)
|
||||
+ vmsknz.b vr0, vr0
|
||||
+
|
||||
+
|
||||
+ vmsknz.b vr1, vr1
|
||||
+ vilvl.h vr0, vr1, vr0
|
||||
+ movfr2gr.s t0, fa0
|
||||
+L(found):
|
||||
+ ctz.w t0, t0
|
||||
+
|
||||
+ add.d a0, a0, t0
|
||||
+ jr ra
|
||||
+L(ret0):
|
||||
+ move a0, zero
|
||||
+ jr ra
|
||||
+
|
||||
+L(out):
|
||||
+ vmsknz.b vr0, vr0
|
||||
+ vmsknz.b vr1, vr1
|
||||
+ vilvl.h vr0, vr1, vr0
|
||||
+ movfr2gr.s t0, fa0
|
||||
+
|
||||
+L(end):
|
||||
+ sub.d t2, zero, a3
|
||||
+ srl.w t1, t1, t2
|
||||
+ and t0, t0, t1
|
||||
+ ctz.w t1, t0
|
||||
+
|
||||
+
|
||||
+ add.d a0, a0, t1
|
||||
+ maskeqz a0, a0, t0
|
||||
+ jr ra
|
||||
+END(MEMCHR)
|
||||
+
|
||||
+libc_hidden_builtin_def (MEMCHR)
|
||||
+#endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/memchr.c b/sysdeps/loongarch/lp64/multiarch/memchr.c
|
||||
new file mode 100644
|
||||
index 00000000..059479c0
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/memchr.c
|
||||
@@ -0,0 +1,37 @@
|
||||
+/* Multiple versions of memchr.
|
||||
+ All versions must be listed in ifunc-impl-list.c.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+/* Define multiple versions only for the definition in libc. */
|
||||
+#if IS_IN (libc)
|
||||
+# define memchr __redirect_memchr
|
||||
+# include <string.h>
|
||||
+# undef memchr
|
||||
+
|
||||
+# define SYMBOL_NAME memchr
|
||||
+# include "ifunc-memchr.h"
|
||||
+
|
||||
+libc_ifunc_redirected (__redirect_memchr, memchr,
|
||||
+ IFUNC_SELECTOR ());
|
||||
+
|
||||
+# ifdef SHARED
|
||||
+__hidden_ver1 (memchr, __GI_memchr, __redirect_memchr)
|
||||
+ __attribute__ ((visibility ("hidden"))) __attribute_copy__ (memchr);
|
||||
+# endif
|
||||
+
|
||||
+#endif
|
||||
--
|
||||
2.33.0
|
||||
|
946
LoongArch-Add-ifunc-support-for-memcmp-aligned-lsx-l.patch
Normal file
946
LoongArch-Add-ifunc-support-for-memcmp-aligned-lsx-l.patch
Normal file
|
@ -0,0 +1,946 @@
|
|||
From 60f4bbd1eec528ba8df044ae6b3091f6337a7fcc Mon Sep 17 00:00:00 2001
|
||||
From: dengjianbo <dengjianbo@loongson.cn>
|
||||
Date: Mon, 28 Aug 2023 10:08:39 +0800
|
||||
Subject: [PATCH 18/29] LoongArch: Add ifunc support for memcmp{aligned, lsx,
|
||||
lasx}
|
||||
|
||||
According to glibc memcmp microbenchmark test results(Add generic
|
||||
memcmp), this implementation have performance improvement
|
||||
except the length is less than 3, details as below:
|
||||
|
||||
Name Percent of time reduced
|
||||
memcmp-lasx 16%-74%
|
||||
memcmp-lsx 20%-50%
|
||||
memcmp-aligned 5%-20%
|
||||
|
||||
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
|
||||
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
|
||||
---
|
||||
sysdeps/loongarch/lp64/multiarch/Makefile | 3 +
|
||||
.../lp64/multiarch/ifunc-impl-list.c | 7 +
|
||||
.../loongarch/lp64/multiarch/ifunc-memcmp.h | 40 +++
|
||||
.../loongarch/lp64/multiarch/memcmp-aligned.S | 292 ++++++++++++++++++
|
||||
.../loongarch/lp64/multiarch/memcmp-lasx.S | 207 +++++++++++++
|
||||
sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S | 269 ++++++++++++++++
|
||||
sysdeps/loongarch/lp64/multiarch/memcmp.c | 43 +++
|
||||
7 files changed, 861 insertions(+)
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-memcmp.h
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/memcmp.c
|
||||
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile
|
||||
index 216886c5..360a6718 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/Makefile
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/Makefile
|
||||
@@ -34,5 +34,8 @@ sysdep_routines += \
|
||||
memset-unaligned \
|
||||
memset-lsx \
|
||||
memset-lasx \
|
||||
+ memcmp-aligned \
|
||||
+ memcmp-lsx \
|
||||
+ memcmp-lasx \
|
||||
# sysdep_routines
|
||||
endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
|
||||
index 37f60dde..e397d58c 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
|
||||
@@ -127,5 +127,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_aligned)
|
||||
)
|
||||
|
||||
+ IFUNC_IMPL (i, name, memcmp,
|
||||
+#if !defined __loongarch_soft_float
|
||||
+ IFUNC_IMPL_ADD (array, i, memcmp, SUPPORT_LASX, __memcmp_lasx)
|
||||
+ IFUNC_IMPL_ADD (array, i, memcmp, SUPPORT_LSX, __memcmp_lsx)
|
||||
+#endif
|
||||
+ IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_aligned)
|
||||
+ )
|
||||
return i;
|
||||
}
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-memcmp.h b/sysdeps/loongarch/lp64/multiarch/ifunc-memcmp.h
|
||||
new file mode 100644
|
||||
index 00000000..04adc2e5
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-memcmp.h
|
||||
@@ -0,0 +1,40 @@
|
||||
+/* Common definition for memcmp ifunc selections.
|
||||
+ All versions must be listed in ifunc-impl-list.c.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <ldsodefs.h>
|
||||
+#include <ifunc-init.h>
|
||||
+
|
||||
+#if !defined __loongarch_soft_float
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lasx) attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden;
|
||||
+#endif
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden;
|
||||
+
|
||||
+static inline void *
|
||||
+IFUNC_SELECTOR (void)
|
||||
+{
|
||||
+#if !defined __loongarch_soft_float
|
||||
+ if (SUPPORT_LASX)
|
||||
+ return OPTIMIZE (lasx);
|
||||
+ else if (SUPPORT_LSX)
|
||||
+ return OPTIMIZE (lsx);
|
||||
+ else
|
||||
+#endif
|
||||
+ return OPTIMIZE (aligned);
|
||||
+}
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S b/sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S
|
||||
new file mode 100644
|
||||
index 00000000..14a7caa9
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S
|
||||
@@ -0,0 +1,292 @@
|
||||
+/* Optimized memcmp implementation using basic LoongArch instructions.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library. If not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdep.h>
|
||||
+#include <sys/regdef.h>
|
||||
+#include <sys/asm.h>
|
||||
+
|
||||
+#if IS_IN (libc)
|
||||
+# define MEMCMP_NAME __memcmp_aligned
|
||||
+#else
|
||||
+# define MEMCMP_NAME memcmp
|
||||
+#endif
|
||||
+
|
||||
+LEAF(MEMCMP_NAME, 6)
|
||||
+ beqz a2, L(ret)
|
||||
+ andi a4, a1, 0x7
|
||||
+ andi a3, a0, 0x7
|
||||
+ sltu a5, a4, a3
|
||||
+
|
||||
+ xor t0, a0, a1
|
||||
+ li.w t8, 8
|
||||
+ maskeqz t0, t0, a5
|
||||
+ li.w t7, -1
|
||||
+
|
||||
+ xor a0, a0, t0
|
||||
+ xor a1, a1, t0
|
||||
+ andi a3, a0, 0x7
|
||||
+ andi a4, a1, 0x7
|
||||
+
|
||||
+ xor a0, a0, a3
|
||||
+ xor a1, a1, a4
|
||||
+ ld.d t2, a0, 0
|
||||
+ ld.d t1, a1, 0
|
||||
+
|
||||
+ slli.d t3, a3, 3
|
||||
+ slli.d t4, a4, 3
|
||||
+ sub.d a6, t3, t4
|
||||
+ srl.d t1, t1, t4
|
||||
+
|
||||
+ srl.d t0, t2, t3
|
||||
+ srl.d t5, t7, t4
|
||||
+ sub.d t6, t0, t1
|
||||
+ and t6, t6, t5
|
||||
+
|
||||
+ sub.d t5, t8, a4
|
||||
+ bnez t6, L(first_out)
|
||||
+ bgeu t5, a2, L(ret)
|
||||
+ sub.d a2, a2, t5
|
||||
+
|
||||
+ bnez a6, L(unaligned)
|
||||
+ blt a2, t8, L(al_less_8bytes)
|
||||
+ andi t1, a2, 31
|
||||
+ beq t1, a2, L(al_less_32bytes)
|
||||
+
|
||||
+ sub.d t2, a2, t1
|
||||
+ add.d a4, a0, t2
|
||||
+ move a2, t1
|
||||
+
|
||||
+L(al_loop):
|
||||
+ ld.d t0, a0, 8
|
||||
+
|
||||
+ ld.d t1, a1, 8
|
||||
+ ld.d t2, a0, 16
|
||||
+ ld.d t3, a1, 16
|
||||
+ ld.d t4, a0, 24
|
||||
+
|
||||
+ ld.d t5, a1, 24
|
||||
+ ld.d t6, a0, 32
|
||||
+ ld.d t7, a1, 32
|
||||
+ addi.d a0, a0, 32
|
||||
+
|
||||
+ addi.d a1, a1, 32
|
||||
+ bne t0, t1, L(out1)
|
||||
+ bne t2, t3, L(out2)
|
||||
+ bne t4, t5, L(out3)
|
||||
+
|
||||
+ bne t6, t7, L(out4)
|
||||
+ bne a0, a4, L(al_loop)
|
||||
+
|
||||
+L(al_less_32bytes):
|
||||
+ srai.d a4, a2, 4
|
||||
+ beqz a4, L(al_less_16bytes)
|
||||
+
|
||||
+ ld.d t0, a0, 8
|
||||
+ ld.d t1, a1, 8
|
||||
+ ld.d t2, a0, 16
|
||||
+ ld.d t3, a1, 16
|
||||
+
|
||||
+ addi.d a0, a0, 16
|
||||
+ addi.d a1, a1, 16
|
||||
+ addi.d a2, a2, -16
|
||||
+ bne t0, t1, L(out1)
|
||||
+
|
||||
+ bne t2, t3, L(out2)
|
||||
+
|
||||
+L(al_less_16bytes):
|
||||
+ srai.d a4, a2, 3
|
||||
+ beqz a4, L(al_less_8bytes)
|
||||
+ ld.d t0, a0, 8
|
||||
+
|
||||
+ ld.d t1, a1, 8
|
||||
+ addi.d a0, a0, 8
|
||||
+ addi.d a1, a1, 8
|
||||
+ addi.d a2, a2, -8
|
||||
+
|
||||
+ bne t0, t1, L(out1)
|
||||
+
|
||||
+L(al_less_8bytes):
|
||||
+ beqz a2, L(ret)
|
||||
+ ld.d t0, a0, 8
|
||||
+ ld.d t1, a1, 8
|
||||
+
|
||||
+ li.d t7, -1
|
||||
+ slli.d t2, a2, 3
|
||||
+ sll.d t2, t7, t2
|
||||
+ sub.d t3, t0, t1
|
||||
+
|
||||
+ andn t6, t3, t2
|
||||
+ bnez t6, L(count_diff)
|
||||
+
|
||||
+L(ret):
|
||||
+ move a0, zero
|
||||
+ jr ra
|
||||
+
|
||||
+L(out4):
|
||||
+ move t0, t6
|
||||
+ move t1, t7
|
||||
+ sub.d t6, t6, t7
|
||||
+ b L(count_diff)
|
||||
+
|
||||
+L(out3):
|
||||
+ move t0, t4
|
||||
+ move t1, t5
|
||||
+ sub.d t6, t4, t5
|
||||
+ b L(count_diff)
|
||||
+
|
||||
+L(out2):
|
||||
+ move t0, t2
|
||||
+ move t1, t3
|
||||
+L(out1):
|
||||
+ sub.d t6, t0, t1
|
||||
+ b L(count_diff)
|
||||
+
|
||||
+L(first_out):
|
||||
+ slli.d t4, a2, 3
|
||||
+ slt t3, a2, t5
|
||||
+ sll.d t4, t7, t4
|
||||
+ maskeqz t4, t4, t3
|
||||
+
|
||||
+ andn t6, t6, t4
|
||||
+
|
||||
+L(count_diff):
|
||||
+ ctz.d t2, t6
|
||||
+ bstrins.d t2, zero, 2, 0
|
||||
+ srl.d t0, t0, t2
|
||||
+
|
||||
+ srl.d t1, t1, t2
|
||||
+ andi t0, t0, 0xff
|
||||
+ andi t1, t1, 0xff
|
||||
+ sub.d t2, t0, t1
|
||||
+
|
||||
+ sub.d t3, t1, t0
|
||||
+ masknez t2, t2, a5
|
||||
+ maskeqz t3, t3, a5
|
||||
+ or a0, t2, t3
|
||||
+
|
||||
+ jr ra
|
||||
+
|
||||
+L(unaligned):
|
||||
+ sub.d a7, zero, a6
|
||||
+ srl.d t0, t2, a6
|
||||
+ blt a2, t8, L(un_less_8bytes)
|
||||
+
|
||||
+ andi t1, a2, 31
|
||||
+ beq t1, a2, L(un_less_32bytes)
|
||||
+ sub.d t2, a2, t1
|
||||
+ add.d a4, a0, t2
|
||||
+
|
||||
+ move a2, t1
|
||||
+
|
||||
+L(un_loop):
|
||||
+ ld.d t2, a0, 8
|
||||
+ ld.d t1, a1, 8
|
||||
+ ld.d t4, a0, 16
|
||||
+
|
||||
+ ld.d t3, a1, 16
|
||||
+ ld.d t6, a0, 24
|
||||
+ ld.d t5, a1, 24
|
||||
+ ld.d t8, a0, 32
|
||||
+
|
||||
+ ld.d t7, a1, 32
|
||||
+ addi.d a0, a0, 32
|
||||
+ addi.d a1, a1, 32
|
||||
+ sll.d a3, t2, a7
|
||||
+
|
||||
+ or t0, a3, t0
|
||||
+ bne t0, t1, L(out1)
|
||||
+ srl.d t0, t2, a6
|
||||
+ sll.d a3, t4, a7
|
||||
+
|
||||
+ or t2, a3, t0
|
||||
+ bne t2, t3, L(out2)
|
||||
+ srl.d t0, t4, a6
|
||||
+ sll.d a3, t6, a7
|
||||
+
|
||||
+ or t4, a3, t0
|
||||
+ bne t4, t5, L(out3)
|
||||
+ srl.d t0, t6, a6
|
||||
+ sll.d a3, t8, a7
|
||||
+
|
||||
+ or t6, t0, a3
|
||||
+ bne t6, t7, L(out4)
|
||||
+ srl.d t0, t8, a6
|
||||
+ bne a0, a4, L(un_loop)
|
||||
+
|
||||
+L(un_less_32bytes):
|
||||
+ srai.d a4, a2, 4
|
||||
+ beqz a4, L(un_less_16bytes)
|
||||
+ ld.d t2, a0, 8
|
||||
+ ld.d t1, a1, 8
|
||||
+
|
||||
+ ld.d t4, a0, 16
|
||||
+ ld.d t3, a1, 16
|
||||
+ addi.d a0, a0, 16
|
||||
+ addi.d a1, a1, 16
|
||||
+
|
||||
+ addi.d a2, a2, -16
|
||||
+ sll.d a3, t2, a7
|
||||
+ or t0, a3, t0
|
||||
+ bne t0, t1, L(out1)
|
||||
+
|
||||
+ srl.d t0, t2, a6
|
||||
+ sll.d a3, t4, a7
|
||||
+ or t2, a3, t0
|
||||
+ bne t2, t3, L(out2)
|
||||
+
|
||||
+ srl.d t0, t4, a6
|
||||
+
|
||||
+L(un_less_16bytes):
|
||||
+ srai.d a4, a2, 3
|
||||
+ beqz a4, L(un_less_8bytes)
|
||||
+ ld.d t2, a0, 8
|
||||
+
|
||||
+ ld.d t1, a1, 8
|
||||
+ addi.d a0, a0, 8
|
||||
+ addi.d a1, a1, 8
|
||||
+ addi.d a2, a2, -8
|
||||
+
|
||||
+ sll.d a3, t2, a7
|
||||
+ or t0, a3, t0
|
||||
+ bne t0, t1, L(out1)
|
||||
+ srl.d t0, t2, a6
|
||||
+
|
||||
+L(un_less_8bytes):
|
||||
+ beqz a2, L(ret)
|
||||
+ andi a7, a7, 63
|
||||
+ slli.d a4, a2, 3
|
||||
+ bgeu a7, a4, L(last_cmp)
|
||||
+
|
||||
+ ld.d t2, a0, 8
|
||||
+ sll.d a3, t2, a7
|
||||
+ or t0, a3, t0
|
||||
+
|
||||
+L(last_cmp):
|
||||
+ ld.d t1, a1, 8
|
||||
+
|
||||
+ li.d t7, -1
|
||||
+ sll.d t2, t7, a4
|
||||
+ sub.d t3, t0, t1
|
||||
+ andn t6, t3, t2
|
||||
+
|
||||
+ bnez t6, L(count_diff)
|
||||
+ move a0, zero
|
||||
+ jr ra
|
||||
+END(MEMCMP_NAME)
|
||||
+
|
||||
+libc_hidden_builtin_def (MEMCMP_NAME)
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S b/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S
|
||||
new file mode 100644
|
||||
index 00000000..3151a179
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S
|
||||
@@ -0,0 +1,207 @@
|
||||
+/* Optimized memcmp implementation using LoongArch LASX instructions.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library. If not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdep.h>
|
||||
+#include <sys/regdef.h>
|
||||
+#include <sys/asm.h>
|
||||
+
|
||||
+#if IS_IN (libc) && !defined __loongarch_soft_float
|
||||
+
|
||||
+# define MEMCMP __memcmp_lasx
|
||||
+
|
||||
+LEAF(MEMCMP, 6)
|
||||
+ li.d t2, 32
|
||||
+ add.d a3, a0, a2
|
||||
+ add.d a4, a1, a2
|
||||
+ bgeu t2, a2, L(less32)
|
||||
+
|
||||
+ li.d t1, 160
|
||||
+ bgeu a2, t1, L(make_aligned)
|
||||
+L(loop32):
|
||||
+ xvld xr0, a0, 0
|
||||
+ xvld xr1, a1, 0
|
||||
+
|
||||
+ addi.d a0, a0, 32
|
||||
+ addi.d a1, a1, 32
|
||||
+ addi.d a2, a2, -32
|
||||
+ xvseq.b xr2, xr0, xr1
|
||||
+
|
||||
+ xvsetanyeqz.b fcc0, xr2
|
||||
+ bcnez fcc0, L(end)
|
||||
+L(last_bytes):
|
||||
+ bltu t2, a2, L(loop32)
|
||||
+ xvld xr0, a3, -32
|
||||
+
|
||||
+
|
||||
+ xvld xr1, a4, -32
|
||||
+ xvseq.b xr2, xr0, xr1
|
||||
+L(end):
|
||||
+ xvmsknz.b xr2, xr2
|
||||
+ xvpermi.q xr4, xr0, 1
|
||||
+
|
||||
+ xvpickve.w xr3, xr2, 4
|
||||
+ xvpermi.q xr5, xr1, 1
|
||||
+ vilvl.h vr2, vr3, vr2
|
||||
+ movfr2gr.s t0, fa2
|
||||
+
|
||||
+ cto.w t0, t0
|
||||
+ vreplgr2vr.b vr2, t0
|
||||
+ vshuf.b vr0, vr4, vr0, vr2
|
||||
+ vshuf.b vr1, vr5, vr1, vr2
|
||||
+
|
||||
+ vpickve2gr.bu t0, vr0, 0
|
||||
+ vpickve2gr.bu t1, vr1, 0
|
||||
+ sub.d a0, t0, t1
|
||||
+ jr ra
|
||||
+
|
||||
+
|
||||
+L(less32):
|
||||
+ srli.d t0, a2, 4
|
||||
+ beqz t0, L(less16)
|
||||
+ vld vr0, a0, 0
|
||||
+ vld vr1, a1, 0
|
||||
+
|
||||
+ vld vr2, a3, -16
|
||||
+ vld vr3, a4, -16
|
||||
+L(short_ret):
|
||||
+ vseq.b vr4, vr0, vr1
|
||||
+ vseq.b vr5, vr2, vr3
|
||||
+
|
||||
+ vmsknz.b vr4, vr4
|
||||
+ vmsknz.b vr5, vr5
|
||||
+ vilvl.h vr4, vr5, vr4
|
||||
+ movfr2gr.s t0, fa4
|
||||
+
|
||||
+ cto.w t0, t0
|
||||
+ vreplgr2vr.b vr4, t0
|
||||
+ vshuf.b vr0, vr2, vr0, vr4
|
||||
+ vshuf.b vr1, vr3, vr1, vr4
|
||||
+
|
||||
+
|
||||
+ vpickve2gr.bu t0, vr0, 0
|
||||
+ vpickve2gr.bu t1, vr1, 0
|
||||
+ sub.d a0, t0, t1
|
||||
+ jr ra
|
||||
+
|
||||
+L(less16):
|
||||
+ srli.d t0, a2, 3
|
||||
+ beqz t0, L(less8)
|
||||
+ vldrepl.d vr0, a0, 0
|
||||
+ vldrepl.d vr1, a1, 0
|
||||
+
|
||||
+ vldrepl.d vr2, a3, -8
|
||||
+ vldrepl.d vr3, a4, -8
|
||||
+ b L(short_ret)
|
||||
+ nop
|
||||
+
|
||||
+L(less8):
|
||||
+ srli.d t0, a2, 2
|
||||
+ beqz t0, L(less4)
|
||||
+ vldrepl.w vr0, a0, 0
|
||||
+ vldrepl.w vr1, a1, 0
|
||||
+
|
||||
+
|
||||
+ vldrepl.w vr2, a3, -4
|
||||
+ vldrepl.w vr3, a4, -4
|
||||
+ b L(short_ret)
|
||||
+ nop
|
||||
+
|
||||
+L(less4):
|
||||
+ srli.d t0, a2, 1
|
||||
+ beqz t0, L(less2)
|
||||
+ vldrepl.h vr0, a0, 0
|
||||
+ vldrepl.h vr1, a1, 0
|
||||
+
|
||||
+ vldrepl.h vr2, a3, -2
|
||||
+ vldrepl.h vr3, a4, -2
|
||||
+ b L(short_ret)
|
||||
+ nop
|
||||
+
|
||||
+L(less2):
|
||||
+ beqz a2, L(ret0)
|
||||
+ ld.bu t0, a0, 0
|
||||
+ ld.bu t1, a1, 0
|
||||
+ sub.d a0, t0, t1
|
||||
+
|
||||
+ jr ra
|
||||
+L(ret0):
|
||||
+ move a0, zero
|
||||
+ jr ra
|
||||
+
|
||||
+L(make_aligned):
|
||||
+ xvld xr0, a0, 0
|
||||
+
|
||||
+ xvld xr1, a1, 0
|
||||
+ xvseq.b xr2, xr0, xr1
|
||||
+ xvsetanyeqz.b fcc0, xr2
|
||||
+ bcnez fcc0, L(end)
|
||||
+
|
||||
+ andi t0, a0, 0x1f
|
||||
+ sub.d t0, t2, t0
|
||||
+ sub.d t1, a2, t0
|
||||
+ add.d a0, a0, t0
|
||||
+
|
||||
+ add.d a1, a1, t0
|
||||
+ andi a2, t1, 0x3f
|
||||
+ sub.d t0, t1, a2
|
||||
+ add.d a5, a0, t0
|
||||
+
|
||||
+
|
||||
+L(loop_align):
|
||||
+ xvld xr0, a0, 0
|
||||
+ xvld xr1, a1, 0
|
||||
+ xvld xr2, a0, 32
|
||||
+ xvld xr3, a1, 32
|
||||
+
|
||||
+ xvseq.b xr0, xr0, xr1
|
||||
+ xvseq.b xr1, xr2, xr3
|
||||
+ xvmin.bu xr2, xr1, xr0
|
||||
+ xvsetanyeqz.b fcc0, xr2
|
||||
+
|
||||
+ bcnez fcc0, L(pair_end)
|
||||
+ addi.d a0, a0, 64
|
||||
+ addi.d a1, a1, 64
|
||||
+ bne a0, a5, L(loop_align)
|
||||
+
|
||||
+ bnez a2, L(last_bytes)
|
||||
+ move a0, zero
|
||||
+ jr ra
|
||||
+ nop
|
||||
+
|
||||
+
|
||||
+L(pair_end):
|
||||
+ xvmsknz.b xr0, xr0
|
||||
+ xvmsknz.b xr1, xr1
|
||||
+ xvpickve.w xr2, xr0, 4
|
||||
+ xvpickve.w xr3, xr1, 4
|
||||
+
|
||||
+ vilvl.h vr0, vr2, vr0
|
||||
+ vilvl.h vr1, vr3, vr1
|
||||
+ vilvl.w vr0, vr1, vr0
|
||||
+ movfr2gr.d t0, fa0
|
||||
+
|
||||
+ cto.d t0, t0
|
||||
+ ldx.bu t1, a0, t0
|
||||
+ ldx.bu t2, a1, t0
|
||||
+ sub.d a0, t1, t2
|
||||
+
|
||||
+ jr ra
|
||||
+END(MEMCMP)
|
||||
+
|
||||
+libc_hidden_builtin_def (MEMCMP)
|
||||
+#endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S
|
||||
new file mode 100644
|
||||
index 00000000..38a50a4c
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S
|
||||
@@ -0,0 +1,269 @@
|
||||
+/* Optimized memcmp implementation using LoongArch LSX instructions.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library. If not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdep.h>
|
||||
+#include <sys/regdef.h>
|
||||
+#include <sys/asm.h>
|
||||
+
|
||||
+#if IS_IN (libc) && !defined __loongarch_soft_float
|
||||
+
|
||||
+#define MEMCMP __memcmp_lsx
|
||||
+
|
||||
+LEAF(MEMCMP, 6)
|
||||
+ beqz a2, L(out)
|
||||
+ pcalau12i t0, %pc_hi20(L(INDEX))
|
||||
+ andi a3, a0, 0xf
|
||||
+ vld vr5, t0, %pc_lo12(L(INDEX))
|
||||
+
|
||||
+ andi a4, a1, 0xf
|
||||
+ bne a3, a4, L(unaligned)
|
||||
+ bstrins.d a0, zero, 3, 0
|
||||
+ xor a1, a1, a4
|
||||
+
|
||||
+ vld vr0, a0, 0
|
||||
+ vld vr1, a1, 0
|
||||
+ li.d t0, 16
|
||||
+ vreplgr2vr.b vr3, a3
|
||||
+
|
||||
+ sub.d t1, t0, a3
|
||||
+ vadd.b vr3, vr3, vr5
|
||||
+ vshuf.b vr0, vr3, vr0, vr3
|
||||
+ vshuf.b vr1, vr3, vr1, vr3
|
||||
+
|
||||
+
|
||||
+ vseq.b vr4, vr0, vr1
|
||||
+ bgeu t1, a2, L(al_end)
|
||||
+ vsetanyeqz.b fcc0, vr4
|
||||
+ bcnez fcc0, L(al_found)
|
||||
+
|
||||
+ sub.d t1, a2, t1
|
||||
+ andi a2, t1, 31
|
||||
+ beq a2, t1, L(al_less_32bytes)
|
||||
+ sub.d t2, t1, a2
|
||||
+
|
||||
+ add.d a4, a0, t2
|
||||
+L(al_loop):
|
||||
+ vld vr0, a0, 16
|
||||
+ vld vr1, a1, 16
|
||||
+ vld vr2, a0, 32
|
||||
+
|
||||
+ vld vr3, a1, 32
|
||||
+ addi.d a0, a0, 32
|
||||
+ addi.d a1, a1, 32
|
||||
+ vseq.b vr4, vr0, vr1
|
||||
+
|
||||
+
|
||||
+ vseq.b vr6, vr2, vr3
|
||||
+ vand.v vr6, vr4, vr6
|
||||
+ vsetanyeqz.b fcc0, vr6
|
||||
+ bcnez fcc0, L(al_pair_end)
|
||||
+
|
||||
+ bne a0, a4, L(al_loop)
|
||||
+L(al_less_32bytes):
|
||||
+ bgeu t0, a2, L(al_less_16bytes)
|
||||
+ vld vr0, a0, 16
|
||||
+ vld vr1, a1, 16
|
||||
+
|
||||
+ vld vr2, a0, 32
|
||||
+ vld vr3, a1, 32
|
||||
+ addi.d a2, a2, -16
|
||||
+ vreplgr2vr.b vr6, a2
|
||||
+
|
||||
+ vslt.b vr5, vr5, vr6
|
||||
+ vseq.b vr4, vr0, vr1
|
||||
+ vseq.b vr6, vr2, vr3
|
||||
+ vorn.v vr6, vr6, vr5
|
||||
+
|
||||
+
|
||||
+L(al_pair_end):
|
||||
+ vsetanyeqz.b fcc0, vr4
|
||||
+ bcnez fcc0, L(al_found)
|
||||
+ vnori.b vr4, vr6, 0
|
||||
+ vfrstpi.b vr4, vr4, 0
|
||||
+
|
||||
+ vshuf.b vr0, vr2, vr2, vr4
|
||||
+ vshuf.b vr1, vr3, vr3, vr4
|
||||
+ vpickve2gr.bu t0, vr0, 0
|
||||
+ vpickve2gr.bu t1, vr1, 0
|
||||
+
|
||||
+ sub.d a0, t0, t1
|
||||
+ jr ra
|
||||
+ nop
|
||||
+ nop
|
||||
+
|
||||
+L(al_less_16bytes):
|
||||
+ beqz a2, L(out)
|
||||
+ vld vr0, a0, 16
|
||||
+ vld vr1, a1, 16
|
||||
+ vseq.b vr4, vr0, vr1
|
||||
+
|
||||
+
|
||||
+L(al_end):
|
||||
+ vreplgr2vr.b vr6, a2
|
||||
+ vslt.b vr5, vr5, vr6
|
||||
+ vorn.v vr4, vr4, vr5
|
||||
+ nop
|
||||
+
|
||||
+L(al_found):
|
||||
+ vnori.b vr4, vr4, 0
|
||||
+ vfrstpi.b vr4, vr4, 0
|
||||
+ vshuf.b vr0, vr0, vr0, vr4
|
||||
+ vshuf.b vr1, vr1, vr1, vr4
|
||||
+
|
||||
+ vpickve2gr.bu t0, vr0, 0
|
||||
+ vpickve2gr.bu t1, vr1, 0
|
||||
+ sub.d a0, t0, t1
|
||||
+ jr ra
|
||||
+
|
||||
+L(out):
|
||||
+ move a0, zero
|
||||
+ jr ra
|
||||
+ nop
|
||||
+ nop
|
||||
+
|
||||
+
|
||||
+L(unaligned):
|
||||
+ xor t2, a0, a1
|
||||
+ sltu a5, a3, a4
|
||||
+ masknez t2, t2, a5
|
||||
+ xor a0, a0, t2
|
||||
+
|
||||
+ xor a1, a1, t2
|
||||
+ andi a3, a0, 0xf
|
||||
+ andi a4, a1, 0xf
|
||||
+ bstrins.d a0, zero, 3, 0
|
||||
+
|
||||
+ xor a1, a1, a4
|
||||
+ vld vr4, a0, 0
|
||||
+ vld vr1, a1, 0
|
||||
+ li.d t0, 16
|
||||
+
|
||||
+ vreplgr2vr.b vr2, a4
|
||||
+ sub.d a6, a4, a3
|
||||
+ sub.d t1, t0, a4
|
||||
+ sub.d t2, t0, a6
|
||||
+
|
||||
+
|
||||
+ vadd.b vr2, vr2, vr5
|
||||
+ vreplgr2vr.b vr6, t2
|
||||
+ vadd.b vr6, vr6, vr5
|
||||
+ vshuf.b vr0, vr4, vr4, vr6
|
||||
+
|
||||
+ vshuf.b vr1, vr2, vr1, vr2
|
||||
+ vshuf.b vr0, vr2, vr0, vr2
|
||||
+ vseq.b vr7, vr0, vr1
|
||||
+ bgeu t1, a2, L(un_end)
|
||||
+
|
||||
+ vsetanyeqz.b fcc0, vr7
|
||||
+ bcnez fcc0, L(un_found)
|
||||
+ sub.d a2, a2, t1
|
||||
+ andi t1, a2, 31
|
||||
+
|
||||
+ beq a2, t1, L(un_less_32bytes)
|
||||
+ sub.d t2, a2, t1
|
||||
+ move a2, t1
|
||||
+ add.d a4, a1, t2
|
||||
+
|
||||
+
|
||||
+L(un_loop):
|
||||
+ vld vr2, a0, 16
|
||||
+ vld vr1, a1, 16
|
||||
+ vld vr3, a1, 32
|
||||
+ addi.d a1, a1, 32
|
||||
+
|
||||
+ addi.d a0, a0, 32
|
||||
+ vshuf.b vr0, vr2, vr4, vr6
|
||||
+ vld vr4, a0, 0
|
||||
+ vseq.b vr7, vr0, vr1
|
||||
+
|
||||
+ vshuf.b vr2, vr4, vr2, vr6
|
||||
+ vseq.b vr8, vr2, vr3
|
||||
+ vand.v vr8, vr7, vr8
|
||||
+ vsetanyeqz.b fcc0, vr8
|
||||
+
|
||||
+ bcnez fcc0, L(un_pair_end)
|
||||
+ bne a1, a4, L(un_loop)
|
||||
+
|
||||
+L(un_less_32bytes):
|
||||
+ bltu a2, t0, L(un_less_16bytes)
|
||||
+ vld vr2, a0, 16
|
||||
+ vld vr1, a1, 16
|
||||
+ addi.d a0, a0, 16
|
||||
+
|
||||
+ addi.d a1, a1, 16
|
||||
+ addi.d a2, a2, -16
|
||||
+ vshuf.b vr0, vr2, vr4, vr6
|
||||
+ vor.v vr4, vr2, vr2
|
||||
+
|
||||
+ vseq.b vr7, vr0, vr1
|
||||
+ vsetanyeqz.b fcc0, vr7
|
||||
+ bcnez fcc0, L(un_found)
|
||||
+L(un_less_16bytes):
|
||||
+ beqz a2, L(out)
|
||||
+ vld vr1, a1, 16
|
||||
+ bgeu a6, a2, 1f
|
||||
+
|
||||
+ vld vr2, a0, 16
|
||||
+1:
|
||||
+ vshuf.b vr0, vr2, vr4, vr6
|
||||
+ vseq.b vr7, vr0, vr1
|
||||
+L(un_end):
|
||||
+ vreplgr2vr.b vr3, a2
|
||||
+
|
||||
+
|
||||
+ vslt.b vr3, vr5, vr3
|
||||
+ vorn.v vr7, vr7, vr3
|
||||
+
|
||||
+L(un_found):
|
||||
+ vnori.b vr7, vr7, 0
|
||||
+ vfrstpi.b vr7, vr7, 0
|
||||
+
|
||||
+ vshuf.b vr0, vr0, vr0, vr7
|
||||
+ vshuf.b vr1, vr1, vr1, vr7
|
||||
+L(calc_result):
|
||||
+ vpickve2gr.bu t0, vr0, 0
|
||||
+ vpickve2gr.bu t1, vr1, 0
|
||||
+
|
||||
+ sub.d t2, t0, t1
|
||||
+ sub.d t3, t1, t0
|
||||
+ masknez t0, t3, a5
|
||||
+ maskeqz t1, t2, a5
|
||||
+
|
||||
+ or a0, t0, t1
|
||||
+ jr ra
|
||||
+L(un_pair_end):
|
||||
+ vsetanyeqz.b fcc0, vr7
|
||||
+ bcnez fcc0, L(un_found)
|
||||
+
|
||||
+
|
||||
+ vnori.b vr7, vr8, 0
|
||||
+ vfrstpi.b vr7, vr7, 0
|
||||
+ vshuf.b vr0, vr2, vr2, vr7
|
||||
+ vshuf.b vr1, vr3, vr3, vr7
|
||||
+
|
||||
+ b L(calc_result)
|
||||
+END(MEMCMP)
|
||||
+
|
||||
+ .section .rodata.cst16,"M",@progbits,16
|
||||
+ .align 4
|
||||
+L(INDEX):
|
||||
+ .dword 0x0706050403020100
|
||||
+ .dword 0x0f0e0d0c0b0a0908
|
||||
+
|
||||
+libc_hidden_builtin_def (MEMCMP)
|
||||
+#endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp.c b/sysdeps/loongarch/lp64/multiarch/memcmp.c
|
||||
new file mode 100644
|
||||
index 00000000..32eccac2
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/memcmp.c
|
||||
@@ -0,0 +1,43 @@
|
||||
+/* Multiple versions of memcmp.
|
||||
+ All versions must be listed in ifunc-impl-list.c.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+/* Define multiple versions only for the definition in libc. */
|
||||
+#if IS_IN (libc)
|
||||
+# define memcmp __redirect_memcmp
|
||||
+# include <string.h>
|
||||
+# undef memcmp
|
||||
+
|
||||
+# define SYMBOL_NAME memcmp
|
||||
+# include "ifunc-memcmp.h"
|
||||
+
|
||||
+libc_ifunc_redirected (__redirect_memcmp, memcmp,
|
||||
+ IFUNC_SELECTOR ());
|
||||
+# undef bcmp
|
||||
+weak_alias (memcmp, bcmp)
|
||||
+
|
||||
+# undef __memcmpeq
|
||||
+strong_alias (memcmp, __memcmpeq)
|
||||
+libc_hidden_def (__memcmpeq)
|
||||
+
|
||||
+# ifdef SHARED
|
||||
+__hidden_ver1 (memcmp, __GI_memcmp, __redirect_memcmp)
|
||||
+ __attribute__ ((visibility ("hidden"))) __attribute_copy__ (memcmp);
|
||||
+# endif
|
||||
+
|
||||
+#endif
|
||||
--
|
||||
2.33.0
|
||||
|
417
LoongArch-Add-ifunc-support-for-memrchr-lsx-lasx.patch
Normal file
417
LoongArch-Add-ifunc-support-for-memrchr-lsx-lasx.patch
Normal file
|
@ -0,0 +1,417 @@
|
|||
From c4c272fb8067364530a2a78df92c37403acc963f Mon Sep 17 00:00:00 2001
|
||||
From: dengjianbo <dengjianbo@loongson.cn>
|
||||
Date: Mon, 28 Aug 2023 10:08:37 +0800
|
||||
Subject: [PATCH 16/29] LoongArch: Add ifunc support for memrchr{lsx, lasx}
|
||||
|
||||
According to glibc memrchr microbenchmark, this implementation could reduce
|
||||
the runtime as following:
|
||||
|
||||
Name Percent of rutime reduced
|
||||
memrchr-lasx 20%-83%
|
||||
memrchr-lsx 20%-64%
|
||||
|
||||
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
|
||||
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
|
||||
---
|
||||
sysdeps/loongarch/lp64/multiarch/Makefile | 3 +
|
||||
.../lp64/multiarch/ifunc-impl-list.c | 8 ++
|
||||
.../loongarch/lp64/multiarch/ifunc-memrchr.h | 40 ++++++
|
||||
.../lp64/multiarch/memrchr-generic.c | 23 ++++
|
||||
.../loongarch/lp64/multiarch/memrchr-lasx.S | 123 ++++++++++++++++++
|
||||
.../loongarch/lp64/multiarch/memrchr-lsx.S | 105 +++++++++++++++
|
||||
sysdeps/loongarch/lp64/multiarch/memrchr.c | 33 +++++
|
||||
7 files changed, 335 insertions(+)
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-memrchr.h
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/memrchr-generic.c
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/memrchr.c
|
||||
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile
|
||||
index 2f4802cf..7b87bc90 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/Makefile
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/Makefile
|
||||
@@ -27,5 +27,8 @@ sysdep_routines += \
|
||||
memchr-aligned \
|
||||
memchr-lsx \
|
||||
memchr-lasx \
|
||||
+ memrchr-generic \
|
||||
+ memrchr-lsx \
|
||||
+ memrchr-lasx \
|
||||
# sysdep_routines
|
||||
endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
|
||||
index a567b9cf..8bd5489e 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
|
||||
@@ -109,5 +109,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
#endif
|
||||
IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_aligned)
|
||||
)
|
||||
+
|
||||
+ IFUNC_IMPL (i, name, memrchr,
|
||||
+#if !defined __loongarch_soft_float
|
||||
+ IFUNC_IMPL_ADD (array, i, memrchr, SUPPORT_LASX, __memrchr_lasx)
|
||||
+ IFUNC_IMPL_ADD (array, i, memrchr, SUPPORT_LSX, __memrchr_lsx)
|
||||
+#endif
|
||||
+ IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_generic)
|
||||
+ )
|
||||
return i;
|
||||
}
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-memrchr.h b/sysdeps/loongarch/lp64/multiarch/ifunc-memrchr.h
|
||||
new file mode 100644
|
||||
index 00000000..8215f9ad
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-memrchr.h
|
||||
@@ -0,0 +1,40 @@
|
||||
+/* Common definition for memrchr implementation.
|
||||
+ All versions must be listed in ifunc-impl-list.c.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <ldsodefs.h>
|
||||
+#include <ifunc-init.h>
|
||||
+
|
||||
+#if !defined __loongarch_soft_float
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lasx) attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden;
|
||||
+#endif
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden;
|
||||
+
|
||||
+static inline void *
|
||||
+IFUNC_SELECTOR (void)
|
||||
+{
|
||||
+#if !defined __loongarch_soft_float
|
||||
+ if (SUPPORT_LASX)
|
||||
+ return OPTIMIZE (lasx);
|
||||
+ else if (SUPPORT_LSX)
|
||||
+ return OPTIMIZE (lsx);
|
||||
+ else
|
||||
+#endif
|
||||
+ return OPTIMIZE (generic);
|
||||
+}
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/memrchr-generic.c b/sysdeps/loongarch/lp64/multiarch/memrchr-generic.c
|
||||
new file mode 100644
|
||||
index 00000000..ced61ebc
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/memrchr-generic.c
|
||||
@@ -0,0 +1,23 @@
|
||||
+/* Generic implementation of memrchr.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library. If not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#if IS_IN (libc)
|
||||
+# define MEMRCHR __memrchr_generic
|
||||
+#endif
|
||||
+
|
||||
+#include <string/memrchr.c>
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S
|
||||
new file mode 100644
|
||||
index 00000000..5f3e0d06
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S
|
||||
@@ -0,0 +1,123 @@
|
||||
+/* Optimized memrchr implementation using LoongArch LASX instructions.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library. If not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdep.h>
|
||||
+#include <sys/regdef.h>
|
||||
+#include <sys/asm.h>
|
||||
+
|
||||
+#if IS_IN (libc) && !defined __loongarch_soft_float
|
||||
+
|
||||
+#ifndef MEMRCHR
|
||||
+# define MEMRCHR __memrchr_lasx
|
||||
+#endif
|
||||
+
|
||||
+LEAF(MEMRCHR, 6)
|
||||
+ beqz a2, L(ret0)
|
||||
+ addi.d a2, a2, -1
|
||||
+ add.d a3, a0, a2
|
||||
+ andi t1, a3, 0x3f
|
||||
+
|
||||
+ bstrins.d a3, zero, 5, 0
|
||||
+ addi.d t1, t1, 1
|
||||
+ xvld xr0, a3, 0
|
||||
+ xvld xr1, a3, 32
|
||||
+
|
||||
+ sub.d t2, zero, t1
|
||||
+ li.d t3, -1
|
||||
+ xvreplgr2vr.b xr2, a1
|
||||
+ andi t4, a0, 0x3f
|
||||
+
|
||||
+ srl.d t2, t3, t2
|
||||
+ xvseq.b xr0, xr0, xr2
|
||||
+ xvseq.b xr1, xr1, xr2
|
||||
+ xvmsknz.b xr0, xr0
|
||||
+
|
||||
+
|
||||
+ xvmsknz.b xr1, xr1
|
||||
+ xvpickve.w xr3, xr0, 4
|
||||
+ xvpickve.w xr4, xr1, 4
|
||||
+ vilvl.h vr0, vr3, vr0
|
||||
+
|
||||
+ vilvl.h vr1, vr4, vr1
|
||||
+ vilvl.w vr0, vr1, vr0
|
||||
+ movfr2gr.d t0, fa0
|
||||
+ and t0, t0, t2
|
||||
+
|
||||
+ bltu a2, t1, L(end)
|
||||
+ bnez t0, L(found)
|
||||
+ bstrins.d a0, zero, 5, 0
|
||||
+L(loop):
|
||||
+ xvld xr0, a3, -64
|
||||
+
|
||||
+ xvld xr1, a3, -32
|
||||
+ addi.d a3, a3, -64
|
||||
+ xvseq.b xr0, xr0, xr2
|
||||
+ xvseq.b xr1, xr1, xr2
|
||||
+
|
||||
+
|
||||
+ beq a0, a3, L(out)
|
||||
+ xvmax.bu xr3, xr0, xr1
|
||||
+ xvseteqz.v fcc0, xr3
|
||||
+ bcnez fcc0, L(loop)
|
||||
+
|
||||
+ xvmsknz.b xr0, xr0
|
||||
+ xvmsknz.b xr1, xr1
|
||||
+ xvpickve.w xr3, xr0, 4
|
||||
+ xvpickve.w xr4, xr1, 4
|
||||
+
|
||||
+ vilvl.h vr0, vr3, vr0
|
||||
+ vilvl.h vr1, vr4, vr1
|
||||
+ vilvl.w vr0, vr1, vr0
|
||||
+ movfr2gr.d t0, fa0
|
||||
+
|
||||
+L(found):
|
||||
+ addi.d a0, a3, 63
|
||||
+ clz.d t1, t0
|
||||
+ sub.d a0, a0, t1
|
||||
+ jr ra
|
||||
+
|
||||
+
|
||||
+L(out):
|
||||
+ xvmsknz.b xr0, xr0
|
||||
+ xvmsknz.b xr1, xr1
|
||||
+ xvpickve.w xr3, xr0, 4
|
||||
+ xvpickve.w xr4, xr1, 4
|
||||
+
|
||||
+ vilvl.h vr0, vr3, vr0
|
||||
+ vilvl.h vr1, vr4, vr1
|
||||
+ vilvl.w vr0, vr1, vr0
|
||||
+ movfr2gr.d t0, fa0
|
||||
+
|
||||
+L(end):
|
||||
+ sll.d t2, t3, t4
|
||||
+ and t0, t0, t2
|
||||
+ addi.d a0, a3, 63
|
||||
+ clz.d t1, t0
|
||||
+
|
||||
+ sub.d a0, a0, t1
|
||||
+ maskeqz a0, a0, t0
|
||||
+ jr ra
|
||||
+L(ret0):
|
||||
+ move a0, zero
|
||||
+
|
||||
+
|
||||
+ jr ra
|
||||
+END(MEMRCHR)
|
||||
+
|
||||
+libc_hidden_builtin_def (MEMRCHR)
|
||||
+#endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S
|
||||
new file mode 100644
|
||||
index 00000000..39a7c8b0
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S
|
||||
@@ -0,0 +1,105 @@
|
||||
+/* Optimized memrchr implementation using LoongArch LSX instructions.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library. If not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdep.h>
|
||||
+#include <sys/regdef.h>
|
||||
+#include <sys/asm.h>
|
||||
+
|
||||
+#if IS_IN (libc) && !defined __loongarch_soft_float
|
||||
+
|
||||
+# define MEMRCHR __memrchr_lsx
|
||||
+
|
||||
+LEAF(MEMRCHR, 6)
|
||||
+ beqz a2, L(ret0)
|
||||
+ addi.d a2, a2, -1
|
||||
+ add.d a3, a0, a2
|
||||
+ andi t1, a3, 0x1f
|
||||
+
|
||||
+ bstrins.d a3, zero, 4, 0
|
||||
+ addi.d t1, t1, 1
|
||||
+ vld vr0, a3, 0
|
||||
+ vld vr1, a3, 16
|
||||
+
|
||||
+ sub.d t2, zero, t1
|
||||
+ li.d t3, -1
|
||||
+ vreplgr2vr.b vr2, a1
|
||||
+ andi t4, a0, 0x1f
|
||||
+
|
||||
+ srl.d t2, t3, t2
|
||||
+ vseq.b vr0, vr0, vr2
|
||||
+ vseq.b vr1, vr1, vr2
|
||||
+ vmsknz.b vr0, vr0
|
||||
+
|
||||
+
|
||||
+ vmsknz.b vr1, vr1
|
||||
+ vilvl.h vr0, vr1, vr0
|
||||
+ movfr2gr.s t0, fa0
|
||||
+ and t0, t0, t2
|
||||
+
|
||||
+ bltu a2, t1, L(end)
|
||||
+ bnez t0, L(found)
|
||||
+ bstrins.d a0, zero, 4, 0
|
||||
+L(loop):
|
||||
+ vld vr0, a3, -32
|
||||
+
|
||||
+ vld vr1, a3, -16
|
||||
+ addi.d a3, a3, -32
|
||||
+ vseq.b vr0, vr0, vr2
|
||||
+ vseq.b vr1, vr1, vr2
|
||||
+
|
||||
+ beq a0, a3, L(out)
|
||||
+ vmax.bu vr3, vr0, vr1
|
||||
+ vseteqz.v fcc0, vr3
|
||||
+ bcnez fcc0, L(loop)
|
||||
+
|
||||
+
|
||||
+ vmsknz.b vr0, vr0
|
||||
+ vmsknz.b vr1, vr1
|
||||
+ vilvl.h vr0, vr1, vr0
|
||||
+ movfr2gr.s t0, fa0
|
||||
+
|
||||
+L(found):
|
||||
+ addi.d a0, a3, 31
|
||||
+ clz.w t1, t0
|
||||
+ sub.d a0, a0, t1
|
||||
+ jr ra
|
||||
+
|
||||
+L(out):
|
||||
+ vmsknz.b vr0, vr0
|
||||
+ vmsknz.b vr1, vr1
|
||||
+ vilvl.h vr0, vr1, vr0
|
||||
+ movfr2gr.s t0, fa0
|
||||
+
|
||||
+L(end):
|
||||
+ sll.d t2, t3, t4
|
||||
+ and t0, t0, t2
|
||||
+ addi.d a0, a3, 31
|
||||
+ clz.w t1, t0
|
||||
+
|
||||
+
|
||||
+ sub.d a0, a0, t1
|
||||
+ maskeqz a0, a0, t0
|
||||
+ jr ra
|
||||
+L(ret0):
|
||||
+ move a0, zero
|
||||
+
|
||||
+ jr ra
|
||||
+END(MEMRCHR)
|
||||
+
|
||||
+libc_hidden_builtin_def (MEMRCHR)
|
||||
+#endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/memrchr.c b/sysdeps/loongarch/lp64/multiarch/memrchr.c
|
||||
new file mode 100644
|
||||
index 00000000..8baba9ab
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/memrchr.c
|
||||
@@ -0,0 +1,33 @@
|
||||
+/* Multiple versions of memrchr.
|
||||
+ All versions must be listed in ifunc-impl-list.c.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+/* Define multiple versions only for the definition in libc. */
|
||||
+#if IS_IN (libc)
|
||||
+# define memrchr __redirect_memrchr
|
||||
+# include <string.h>
|
||||
+# undef memrchr
|
||||
+
|
||||
+# define SYMBOL_NAME memrchr
|
||||
+# include "ifunc-memrchr.h"
|
||||
+
|
||||
+libc_ifunc_redirected (__redirect_memrchr, __memrchr, IFUNC_SELECTOR ());
|
||||
+libc_hidden_def (__memrchr)
|
||||
+weak_alias (__memrchr, memrchr)
|
||||
+
|
||||
+#endif
|
||||
--
|
||||
2.33.0
|
||||
|
784
LoongArch-Add-ifunc-support-for-memset-aligned-unali.patch
Normal file
784
LoongArch-Add-ifunc-support-for-memset-aligned-unali.patch
Normal file
|
@ -0,0 +1,784 @@
|
|||
From 14032f7bbe18443af8492f5d0365f72b76701673 Mon Sep 17 00:00:00 2001
|
||||
From: dengjianbo <dengjianbo@loongson.cn>
|
||||
Date: Mon, 28 Aug 2023 10:08:38 +0800
|
||||
Subject: [PATCH 17/29] LoongArch: Add ifunc support for memset{aligned,
|
||||
unaligned, lsx, lasx}
|
||||
|
||||
According to glibc memset microbenchmark test results, for LSX and LASX
|
||||
versions, A few cases with length less than 8 experience performace
|
||||
degradation, overall, the LASX version could reduce the runtime about
|
||||
15% - 75%, LSX version could reduce the runtime about 15%-50%.
|
||||
|
||||
The unaligned version uses unaligned memmory access to set data which
|
||||
length is less than 64 and make address aligned with 8. For this part,
|
||||
the performace is better than aligned version. Comparing with the generic
|
||||
version, the performance is close when the length is larger than 128. When
|
||||
the length is 8-128, the unaligned version could reduce the runtime about
|
||||
30%-70%, the aligned version could reduce the runtime about 20%-50%.
|
||||
|
||||
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
|
||||
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
|
||||
---
|
||||
sysdeps/loongarch/lp64/multiarch/Makefile | 4 +
|
||||
.../lp64/multiarch/dl-symbol-redir-ifunc.h | 24 +++
|
||||
.../lp64/multiarch/ifunc-impl-list.c | 10 +
|
||||
.../loongarch/lp64/multiarch/memset-aligned.S | 174 ++++++++++++++++++
|
||||
.../loongarch/lp64/multiarch/memset-lasx.S | 142 ++++++++++++++
|
||||
sysdeps/loongarch/lp64/multiarch/memset-lsx.S | 135 ++++++++++++++
|
||||
.../lp64/multiarch/memset-unaligned.S | 162 ++++++++++++++++
|
||||
sysdeps/loongarch/lp64/multiarch/memset.c | 37 ++++
|
||||
8 files changed, 688 insertions(+)
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/dl-symbol-redir-ifunc.h
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/memset-aligned.S
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/memset-lasx.S
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/memset-lsx.S
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/memset-unaligned.S
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/memset.c
|
||||
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile
|
||||
index 7b87bc90..216886c5 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/Makefile
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/Makefile
|
||||
@@ -30,5 +30,9 @@ sysdep_routines += \
|
||||
memrchr-generic \
|
||||
memrchr-lsx \
|
||||
memrchr-lasx \
|
||||
+ memset-aligned \
|
||||
+ memset-unaligned \
|
||||
+ memset-lsx \
|
||||
+ memset-lasx \
|
||||
# sysdep_routines
|
||||
endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/dl-symbol-redir-ifunc.h b/sysdeps/loongarch/lp64/multiarch/dl-symbol-redir-ifunc.h
|
||||
new file mode 100644
|
||||
index 00000000..e2723873
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/dl-symbol-redir-ifunc.h
|
||||
@@ -0,0 +1,24 @@
|
||||
+/* Symbol rediretion for loader/static initialization code.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#ifndef _DL_IFUNC_GENERIC_H
|
||||
+#define _DL_IFUNC_GENERIC_H
|
||||
+
|
||||
+asm ("memset = __memset_aligned");
|
||||
+
|
||||
+#endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
|
||||
index 8bd5489e..37f60dde 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
|
||||
@@ -117,5 +117,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
#endif
|
||||
IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_generic)
|
||||
)
|
||||
+
|
||||
+ IFUNC_IMPL (i, name, memset,
|
||||
+#if !defined __loongarch_soft_float
|
||||
+ IFUNC_IMPL_ADD (array, i, memset, SUPPORT_LASX, __memset_lasx)
|
||||
+ IFUNC_IMPL_ADD (array, i, memset, SUPPORT_LSX, __memset_lsx)
|
||||
+#endif
|
||||
+ IFUNC_IMPL_ADD (array, i, memset, SUPPORT_UAL, __memset_unaligned)
|
||||
+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_aligned)
|
||||
+ )
|
||||
+
|
||||
return i;
|
||||
}
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/memset-aligned.S b/sysdeps/loongarch/lp64/multiarch/memset-aligned.S
|
||||
new file mode 100644
|
||||
index 00000000..1fce95b7
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/memset-aligned.S
|
||||
@@ -0,0 +1,174 @@
|
||||
+/* Optimized memset aligned implementation using basic LoongArch instructions.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library. If not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdep.h>
|
||||
+#include <sys/regdef.h>
|
||||
+#include <sys/asm.h>
|
||||
+
|
||||
+#if IS_IN (libc)
|
||||
+# define MEMSET_NAME __memset_aligned
|
||||
+#else
|
||||
+# define MEMSET_NAME memset
|
||||
+#endif
|
||||
+
|
||||
+LEAF(MEMSET_NAME, 6)
|
||||
+ move t0, a0
|
||||
+ andi a3, a0, 0x7
|
||||
+ li.w t6, 16
|
||||
+ beqz a3, L(align)
|
||||
+ bltu a2, t6, L(short_data)
|
||||
+
|
||||
+L(make_align):
|
||||
+ li.w t8, 8
|
||||
+ sub.d t2, t8, a3
|
||||
+ pcaddi t1, 11
|
||||
+ slli.d t3, t2, 2
|
||||
+ sub.d t1, t1, t3
|
||||
+ jr t1
|
||||
+
|
||||
+L(al7):
|
||||
+ st.b a1, t0, 6
|
||||
+L(al6):
|
||||
+ st.b a1, t0, 5
|
||||
+L(al5):
|
||||
+ st.b a1, t0, 4
|
||||
+L(al4):
|
||||
+ st.b a1, t0, 3
|
||||
+L(al3):
|
||||
+ st.b a1, t0, 2
|
||||
+L(al2):
|
||||
+ st.b a1, t0, 1
|
||||
+L(al1):
|
||||
+ st.b a1, t0, 0
|
||||
+L(al0):
|
||||
+ add.d t0, t0, t2
|
||||
+ sub.d a2, a2, t2
|
||||
+
|
||||
+L(align):
|
||||
+ bstrins.d a1, a1, 15, 8
|
||||
+ bstrins.d a1, a1, 31, 16
|
||||
+ bstrins.d a1, a1, 63, 32
|
||||
+ bltu a2, t6, L(less_16bytes)
|
||||
+
|
||||
+ andi a4, a2, 0x3f
|
||||
+ beq a4, a2, L(less_64bytes)
|
||||
+
|
||||
+ sub.d t1, a2, a4
|
||||
+ move a2, a4
|
||||
+ add.d a5, t0, t1
|
||||
+
|
||||
+L(loop_64bytes):
|
||||
+ addi.d t0, t0, 64
|
||||
+ st.d a1, t0, -64
|
||||
+ st.d a1, t0, -56
|
||||
+ st.d a1, t0, -48
|
||||
+ st.d a1, t0, -40
|
||||
+
|
||||
+ st.d a1, t0, -32
|
||||
+ st.d a1, t0, -24
|
||||
+ st.d a1, t0, -16
|
||||
+ st.d a1, t0, -8
|
||||
+ bne t0, a5, L(loop_64bytes)
|
||||
+
|
||||
+L(less_64bytes):
|
||||
+ srai.d a4, a2, 5
|
||||
+ beqz a4, L(less_32bytes)
|
||||
+ addi.d a2, a2, -32
|
||||
+ st.d a1, t0, 0
|
||||
+
|
||||
+ st.d a1, t0, 8
|
||||
+ st.d a1, t0, 16
|
||||
+ st.d a1, t0, 24
|
||||
+ addi.d t0, t0, 32
|
||||
+
|
||||
+L(less_32bytes):
|
||||
+ bltu a2, t6, L(less_16bytes)
|
||||
+ addi.d a2, a2, -16
|
||||
+ st.d a1, t0, 0
|
||||
+ st.d a1, t0, 8
|
||||
+ addi.d t0, t0, 16
|
||||
+
|
||||
+L(less_16bytes):
|
||||
+ srai.d a4, a2, 3
|
||||
+ beqz a4, L(less_8bytes)
|
||||
+ addi.d a2, a2, -8
|
||||
+ st.d a1, t0, 0
|
||||
+ addi.d t0, t0, 8
|
||||
+
|
||||
+L(less_8bytes):
|
||||
+ beqz a2, L(less_1byte)
|
||||
+ srai.d a4, a2, 2
|
||||
+ beqz a4, L(less_4bytes)
|
||||
+ addi.d a2, a2, -4
|
||||
+ st.w a1, t0, 0
|
||||
+ addi.d t0, t0, 4
|
||||
+
|
||||
+L(less_4bytes):
|
||||
+ srai.d a3, a2, 1
|
||||
+ beqz a3, L(less_2bytes)
|
||||
+ addi.d a2, a2, -2
|
||||
+ st.h a1, t0, 0
|
||||
+ addi.d t0, t0, 2
|
||||
+
|
||||
+L(less_2bytes):
|
||||
+ beqz a2, L(less_1byte)
|
||||
+ st.b a1, t0, 0
|
||||
+L(less_1byte):
|
||||
+ jr ra
|
||||
+
|
||||
+L(short_data):
|
||||
+ pcaddi t1, 19
|
||||
+ slli.d t3, a2, 2
|
||||
+ sub.d t1, t1, t3
|
||||
+ jr t1
|
||||
+L(short_15):
|
||||
+ st.b a1, a0, 14
|
||||
+L(short_14):
|
||||
+ st.b a1, a0, 13
|
||||
+L(short_13):
|
||||
+ st.b a1, a0, 12
|
||||
+L(short_12):
|
||||
+ st.b a1, a0, 11
|
||||
+L(short_11):
|
||||
+ st.b a1, a0, 10
|
||||
+L(short_10):
|
||||
+ st.b a1, a0, 9
|
||||
+L(short_9):
|
||||
+ st.b a1, a0, 8
|
||||
+L(short_8):
|
||||
+ st.b a1, a0, 7
|
||||
+L(short_7):
|
||||
+ st.b a1, a0, 6
|
||||
+L(short_6):
|
||||
+ st.b a1, a0, 5
|
||||
+L(short_5):
|
||||
+ st.b a1, a0, 4
|
||||
+L(short_4):
|
||||
+ st.b a1, a0, 3
|
||||
+L(short_3):
|
||||
+ st.b a1, a0, 2
|
||||
+L(short_2):
|
||||
+ st.b a1, a0, 1
|
||||
+L(short_1):
|
||||
+ st.b a1, a0, 0
|
||||
+L(short_0):
|
||||
+ jr ra
|
||||
+END(MEMSET_NAME)
|
||||
+
|
||||
+libc_hidden_builtin_def (MEMSET_NAME)
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/memset-lasx.S b/sysdeps/loongarch/lp64/multiarch/memset-lasx.S
|
||||
new file mode 100644
|
||||
index 00000000..041abbac
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/memset-lasx.S
|
||||
@@ -0,0 +1,142 @@
|
||||
+/* Optimized memset implementation using LoongArch LASX instructions.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library. If not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdep.h>
|
||||
+#include <sys/regdef.h>
|
||||
+#include <sys/asm.h>
|
||||
+
|
||||
+#if IS_IN (libc) && !defined __loongarch_soft_float
|
||||
+
|
||||
+# define MEMSET __memset_lasx
|
||||
+
|
||||
+LEAF(MEMSET, 6)
|
||||
+ li.d t1, 32
|
||||
+ move a3, a0
|
||||
+ xvreplgr2vr.b xr0, a1
|
||||
+ add.d a4, a0, a2
|
||||
+
|
||||
+ bgeu t1, a2, L(less_32bytes)
|
||||
+ li.d t3, 128
|
||||
+ li.d t2, 64
|
||||
+ blt t3, a2, L(long_bytes)
|
||||
+
|
||||
+L(less_128bytes):
|
||||
+ bgeu t2, a2, L(less_64bytes)
|
||||
+ xvst xr0, a3, 0
|
||||
+ xvst xr0, a3, 32
|
||||
+ xvst xr0, a4, -32
|
||||
+
|
||||
+ xvst xr0, a4, -64
|
||||
+ jr ra
|
||||
+L(less_64bytes):
|
||||
+ xvst xr0, a3, 0
|
||||
+ xvst xr0, a4, -32
|
||||
+
|
||||
+
|
||||
+ jr ra
|
||||
+L(less_32bytes):
|
||||
+ srli.d t0, a2, 4
|
||||
+ beqz t0, L(less_16bytes)
|
||||
+ vst vr0, a3, 0
|
||||
+
|
||||
+ vst vr0, a4, -16
|
||||
+ jr ra
|
||||
+L(less_16bytes):
|
||||
+ srli.d t0, a2, 3
|
||||
+ beqz t0, L(less_8bytes)
|
||||
+
|
||||
+ vstelm.d vr0, a3, 0, 0
|
||||
+ vstelm.d vr0, a4, -8, 0
|
||||
+ jr ra
|
||||
+L(less_8bytes):
|
||||
+ srli.d t0, a2, 2
|
||||
+
|
||||
+ beqz t0, L(less_4bytes)
|
||||
+ vstelm.w vr0, a3, 0, 0
|
||||
+ vstelm.w vr0, a4, -4, 0
|
||||
+ jr ra
|
||||
+
|
||||
+
|
||||
+L(less_4bytes):
|
||||
+ srli.d t0, a2, 1
|
||||
+ beqz t0, L(less_2bytes)
|
||||
+ vstelm.h vr0, a3, 0, 0
|
||||
+ vstelm.h vr0, a4, -2, 0
|
||||
+
|
||||
+ jr ra
|
||||
+L(less_2bytes):
|
||||
+ beqz a2, L(less_1bytes)
|
||||
+ st.b a1, a3, 0
|
||||
+L(less_1bytes):
|
||||
+ jr ra
|
||||
+
|
||||
+L(long_bytes):
|
||||
+ xvst xr0, a3, 0
|
||||
+ bstrins.d a3, zero, 4, 0
|
||||
+ addi.d a3, a3, 32
|
||||
+ sub.d a2, a4, a3
|
||||
+
|
||||
+ andi t0, a2, 0xff
|
||||
+ beq t0, a2, L(long_end)
|
||||
+ move a2, t0
|
||||
+ sub.d t0, a4, t0
|
||||
+
|
||||
+
|
||||
+L(loop_256):
|
||||
+ xvst xr0, a3, 0
|
||||
+ xvst xr0, a3, 32
|
||||
+ xvst xr0, a3, 64
|
||||
+ xvst xr0, a3, 96
|
||||
+
|
||||
+ xvst xr0, a3, 128
|
||||
+ xvst xr0, a3, 160
|
||||
+ xvst xr0, a3, 192
|
||||
+ xvst xr0, a3, 224
|
||||
+
|
||||
+ addi.d a3, a3, 256
|
||||
+ bne a3, t0, L(loop_256)
|
||||
+L(long_end):
|
||||
+ bltu a2, t3, L(end_less_128)
|
||||
+ addi.d a2, a2, -128
|
||||
+
|
||||
+ xvst xr0, a3, 0
|
||||
+ xvst xr0, a3, 32
|
||||
+ xvst xr0, a3, 64
|
||||
+ xvst xr0, a3, 96
|
||||
+
|
||||
+
|
||||
+ addi.d a3, a3, 128
|
||||
+L(end_less_128):
|
||||
+ bltu a2, t2, L(end_less_64)
|
||||
+ addi.d a2, a2, -64
|
||||
+ xvst xr0, a3, 0
|
||||
+
|
||||
+ xvst xr0, a3, 32
|
||||
+ addi.d a3, a3, 64
|
||||
+L(end_less_64):
|
||||
+ bltu a2, t1, L(end_less_32)
|
||||
+ xvst xr0, a3, 0
|
||||
+
|
||||
+L(end_less_32):
|
||||
+ xvst xr0, a4, -32
|
||||
+ jr ra
|
||||
+END(MEMSET)
|
||||
+
|
||||
+libc_hidden_builtin_def (MEMSET)
|
||||
+#endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/memset-lsx.S b/sysdeps/loongarch/lp64/multiarch/memset-lsx.S
|
||||
new file mode 100644
|
||||
index 00000000..3d3982aa
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/memset-lsx.S
|
||||
@@ -0,0 +1,135 @@
|
||||
+/* Optimized memset implementation using LoongArch LSX instructions.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library. If not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdep.h>
|
||||
+#include <sys/regdef.h>
|
||||
+#include <sys/asm.h>
|
||||
+
|
||||
+#if IS_IN (libc) && !defined __loongarch_soft_float
|
||||
+
|
||||
+# define MEMSET __memset_lsx
|
||||
+
|
||||
+LEAF(MEMSET, 6)
|
||||
+ li.d t1, 16
|
||||
+ move a3, a0
|
||||
+ vreplgr2vr.b vr0, a1
|
||||
+ add.d a4, a0, a2
|
||||
+
|
||||
+ bgeu t1, a2, L(less_16bytes)
|
||||
+ li.d t3, 64
|
||||
+ li.d t2, 32
|
||||
+ bgeu a2, t3, L(long_bytes)
|
||||
+
|
||||
+L(less_64bytes):
|
||||
+ bgeu t2, a2, L(less_32bytes)
|
||||
+ vst vr0, a3, 0
|
||||
+ vst vr0, a3, 16
|
||||
+ vst vr0, a4, -32
|
||||
+
|
||||
+ vst vr0, a4, -16
|
||||
+ jr ra
|
||||
+L(less_32bytes):
|
||||
+ vst vr0, a3, 0
|
||||
+ vst vr0, a4, -16
|
||||
+
|
||||
+
|
||||
+ jr ra
|
||||
+L(less_16bytes):
|
||||
+ srli.d t0, a2, 3
|
||||
+ beqz t0, L(less_8bytes)
|
||||
+ vstelm.d vr0, a3, 0, 0
|
||||
+
|
||||
+ vstelm.d vr0, a4, -8, 0
|
||||
+ jr ra
|
||||
+L(less_8bytes):
|
||||
+ srli.d t0, a2, 2
|
||||
+ beqz t0, L(less_4bytes)
|
||||
+
|
||||
+ vstelm.w vr0, a3, 0, 0
|
||||
+ vstelm.w vr0, a4, -4, 0
|
||||
+ jr ra
|
||||
+L(less_4bytes):
|
||||
+ srli.d t0, a2, 1
|
||||
+
|
||||
+ beqz t0, L(less_2bytes)
|
||||
+ vstelm.h vr0, a3, 0, 0
|
||||
+ vstelm.h vr0, a4, -2, 0
|
||||
+ jr ra
|
||||
+
|
||||
+
|
||||
+L(less_2bytes):
|
||||
+ beqz a2, L(less_1bytes)
|
||||
+ vstelm.b vr0, a3, 0, 0
|
||||
+L(less_1bytes):
|
||||
+ jr ra
|
||||
+L(long_bytes):
|
||||
+ vst vr0, a3, 0
|
||||
+
|
||||
+ bstrins.d a3, zero, 3, 0
|
||||
+ addi.d a3, a3, 16
|
||||
+ sub.d a2, a4, a3
|
||||
+ andi t0, a2, 0x7f
|
||||
+
|
||||
+ beq t0, a2, L(long_end)
|
||||
+ move a2, t0
|
||||
+ sub.d t0, a4, t0
|
||||
+
|
||||
+L(loop_128):
|
||||
+ vst vr0, a3, 0
|
||||
+
|
||||
+ vst vr0, a3, 16
|
||||
+ vst vr0, a3, 32
|
||||
+ vst vr0, a3, 48
|
||||
+ vst vr0, a3, 64
|
||||
+
|
||||
+
|
||||
+ vst vr0, a3, 80
|
||||
+ vst vr0, a3, 96
|
||||
+ vst vr0, a3, 112
|
||||
+ addi.d a3, a3, 128
|
||||
+
|
||||
+ bne a3, t0, L(loop_128)
|
||||
+L(long_end):
|
||||
+ bltu a2, t3, L(end_less_64)
|
||||
+ addi.d a2, a2, -64
|
||||
+ vst vr0, a3, 0
|
||||
+
|
||||
+ vst vr0, a3, 16
|
||||
+ vst vr0, a3, 32
|
||||
+ vst vr0, a3, 48
|
||||
+ addi.d a3, a3, 64
|
||||
+
|
||||
+L(end_less_64):
|
||||
+ bltu a2, t2, L(end_less_32)
|
||||
+ addi.d a2, a2, -32
|
||||
+ vst vr0, a3, 0
|
||||
+ vst vr0, a3, 16
|
||||
+
|
||||
+ addi.d a3, a3, 32
|
||||
+L(end_less_32):
|
||||
+ bltu a2, t1, L(end_less_16)
|
||||
+ vst vr0, a3, 0
|
||||
+
|
||||
+L(end_less_16):
|
||||
+ vst vr0, a4, -16
|
||||
+ jr ra
|
||||
+END(MEMSET)
|
||||
+
|
||||
+libc_hidden_builtin_def (MEMSET)
|
||||
+#endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S
|
||||
new file mode 100644
|
||||
index 00000000..f7d32039
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S
|
||||
@@ -0,0 +1,162 @@
|
||||
+/* Optimized memset unaligned implementation using basic LoongArch instructions.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library. If not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdep.h>
|
||||
+#include <sys/regdef.h>
|
||||
+#include <sys/asm.h>
|
||||
+
|
||||
+#if IS_IN (libc)
|
||||
+
|
||||
+# define MEMSET_NAME __memset_unaligned
|
||||
+
|
||||
+#define ST_128(n) \
|
||||
+ st.d a1, a0, n; \
|
||||
+ st.d a1, a0, n+8 ; \
|
||||
+ st.d a1, a0, n+16 ; \
|
||||
+ st.d a1, a0, n+24 ; \
|
||||
+ st.d a1, a0, n+32 ; \
|
||||
+ st.d a1, a0, n+40 ; \
|
||||
+ st.d a1, a0, n+48 ; \
|
||||
+ st.d a1, a0, n+56 ; \
|
||||
+ st.d a1, a0, n+64 ; \
|
||||
+ st.d a1, a0, n+72 ; \
|
||||
+ st.d a1, a0, n+80 ; \
|
||||
+ st.d a1, a0, n+88 ; \
|
||||
+ st.d a1, a0, n+96 ; \
|
||||
+ st.d a1, a0, n+104; \
|
||||
+ st.d a1, a0, n+112; \
|
||||
+ st.d a1, a0, n+120;
|
||||
+
|
||||
+LEAF(MEMSET_NAME, 6)
|
||||
+ bstrins.d a1, a1, 15, 8
|
||||
+ add.d t7, a0, a2
|
||||
+ bstrins.d a1, a1, 31, 16
|
||||
+ move t0, a0
|
||||
+
|
||||
+ bstrins.d a1, a1, 63, 32
|
||||
+ srai.d t8, a2, 4
|
||||
+ beqz t8, L(less_16bytes)
|
||||
+ srai.d t8, a2, 6
|
||||
+
|
||||
+ bnez t8, L(more_64bytes)
|
||||
+ srai.d t8, a2, 5
|
||||
+ beqz t8, L(less_32bytes)
|
||||
+
|
||||
+ st.d a1, a0, 0
|
||||
+ st.d a1, a0, 8
|
||||
+ st.d a1, a0, 16
|
||||
+ st.d a1, a0, 24
|
||||
+
|
||||
+ st.d a1, t7, -32
|
||||
+ st.d a1, t7, -24
|
||||
+ st.d a1, t7, -16
|
||||
+ st.d a1, t7, -8
|
||||
+
|
||||
+ jr ra
|
||||
+
|
||||
+L(less_32bytes):
|
||||
+ st.d a1, a0, 0
|
||||
+ st.d a1, a0, 8
|
||||
+ st.d a1, t7, -16
|
||||
+ st.d a1, t7, -8
|
||||
+
|
||||
+ jr ra
|
||||
+
|
||||
+L(less_16bytes):
|
||||
+ srai.d t8, a2, 3
|
||||
+ beqz t8, L(less_8bytes)
|
||||
+ st.d a1, a0, 0
|
||||
+ st.d a1, t7, -8
|
||||
+
|
||||
+ jr ra
|
||||
+
|
||||
+L(less_8bytes):
|
||||
+ srai.d t8, a2, 2
|
||||
+ beqz t8, L(less_4bytes)
|
||||
+ st.w a1, a0, 0
|
||||
+ st.w a1, t7, -4
|
||||
+
|
||||
+ jr ra
|
||||
+
|
||||
+L(less_4bytes):
|
||||
+ srai.d t8, a2, 1
|
||||
+ beqz t8, L(less_2bytes)
|
||||
+ st.h a1, a0, 0
|
||||
+ st.h a1, t7, -2
|
||||
+
|
||||
+ jr ra
|
||||
+
|
||||
+L(less_2bytes):
|
||||
+ beqz a2, L(less_1bytes)
|
||||
+ st.b a1, a0, 0
|
||||
+
|
||||
+ jr ra
|
||||
+
|
||||
+L(less_1bytes):
|
||||
+ jr ra
|
||||
+
|
||||
+L(more_64bytes):
|
||||
+ srli.d a0, a0, 3
|
||||
+ slli.d a0, a0, 3
|
||||
+ addi.d a0, a0, 0x8
|
||||
+ st.d a1, t0, 0
|
||||
+
|
||||
+ sub.d t2, t0, a0
|
||||
+ add.d a2, t2, a2
|
||||
+ addi.d a2, a2, -0x80
|
||||
+ blt a2, zero, L(end_unalign_proc)
|
||||
+
|
||||
+L(loop_less):
|
||||
+ ST_128(0)
|
||||
+ addi.d a0, a0, 0x80
|
||||
+ addi.d a2, a2, -0x80
|
||||
+ bge a2, zero, L(loop_less)
|
||||
+
|
||||
+L(end_unalign_proc):
|
||||
+ addi.d a2, a2, 0x80
|
||||
+ pcaddi t1, 20
|
||||
+ andi t5, a2, 0x78
|
||||
+ srli.d t5, t5, 1
|
||||
+
|
||||
+ sub.d t1, t1, t5
|
||||
+ jr t1
|
||||
+
|
||||
+ st.d a1, a0, 112
|
||||
+ st.d a1, a0, 104
|
||||
+ st.d a1, a0, 96
|
||||
+ st.d a1, a0, 88
|
||||
+ st.d a1, a0, 80
|
||||
+ st.d a1, a0, 72
|
||||
+ st.d a1, a0, 64
|
||||
+ st.d a1, a0, 56
|
||||
+ st.d a1, a0, 48
|
||||
+ st.d a1, a0, 40
|
||||
+ st.d a1, a0, 32
|
||||
+ st.d a1, a0, 24
|
||||
+ st.d a1, a0, 16
|
||||
+ st.d a1, a0, 8
|
||||
+ st.d a1, a0, 0
|
||||
+ st.d a1, t7, -8
|
||||
+
|
||||
+ move a0, t0
|
||||
+ jr ra
|
||||
+END(MEMSET_NAME)
|
||||
+
|
||||
+libc_hidden_builtin_def (MEMSET_NAME)
|
||||
+#endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/memset.c b/sysdeps/loongarch/lp64/multiarch/memset.c
|
||||
new file mode 100644
|
||||
index 00000000..3ff60d8a
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/memset.c
|
||||
@@ -0,0 +1,37 @@
|
||||
+/* Multiple versions of memset.
|
||||
+ All versions must be listed in ifunc-impl-list.c.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+/* Define multiple versions only for the definition in libc. */
|
||||
+#if IS_IN (libc)
|
||||
+# define memset __redirect_memset
|
||||
+# include <string.h>
|
||||
+# undef memset
|
||||
+
|
||||
+# define SYMBOL_NAME memset
|
||||
+# include "ifunc-lasx.h"
|
||||
+
|
||||
+libc_ifunc_redirected (__redirect_memset, memset,
|
||||
+ IFUNC_SELECTOR ());
|
||||
+
|
||||
+# ifdef SHARED
|
||||
+__hidden_ver1 (memset, __GI_memset, __redirect_memset)
|
||||
+ __attribute__ ((visibility ("hidden"))) __attribute_copy__ (memset);
|
||||
+# endif
|
||||
+
|
||||
+#endif
|
||||
--
|
||||
2.33.0
|
||||
|
448
LoongArch-Add-ifunc-support-for-rawmemchr-aligned-ls.patch
Normal file
448
LoongArch-Add-ifunc-support-for-rawmemchr-aligned-ls.patch
Normal file
|
@ -0,0 +1,448 @@
|
|||
From b412bcb2cf4914a664bcd24924d670a2e37394b3 Mon Sep 17 00:00:00 2001
|
||||
From: dengjianbo <dengjianbo@loongson.cn>
|
||||
Date: Mon, 28 Aug 2023 10:08:35 +0800
|
||||
Subject: [PATCH 14/29] LoongArch: Add ifunc support for rawmemchr{aligned,
|
||||
lsx, lasx}
|
||||
|
||||
According to glibc rawmemchr microbenchmark, A few cases tested with
|
||||
char '\0' experience performance degradation due to the lasx and lsx
|
||||
versions don't handle the '\0' separately. Overall, rawmemchr-lasx
|
||||
implementation could reduce the runtime about 40%-80%, rawmemchr-lsx
|
||||
implementation could reduce the runtime about 40%-66%, rawmemchr-aligned
|
||||
implementation could reduce the runtime about 20%-40%.
|
||||
|
||||
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
|
||||
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
|
||||
---
|
||||
sysdeps/loongarch/lp64/multiarch/Makefile | 3 +
|
||||
.../lp64/multiarch/ifunc-impl-list.c | 8 ++
|
||||
.../lp64/multiarch/ifunc-rawmemchr.h | 40 ++++++
|
||||
.../lp64/multiarch/rawmemchr-aligned.S | 124 ++++++++++++++++++
|
||||
.../loongarch/lp64/multiarch/rawmemchr-lasx.S | 82 ++++++++++++
|
||||
.../loongarch/lp64/multiarch/rawmemchr-lsx.S | 71 ++++++++++
|
||||
sysdeps/loongarch/lp64/multiarch/rawmemchr.c | 37 ++++++
|
||||
7 files changed, 365 insertions(+)
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-rawmemchr.h
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/rawmemchr-aligned.S
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/rawmemchr.c
|
||||
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile
|
||||
index 5d7ae7ae..64416b02 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/Makefile
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/Makefile
|
||||
@@ -21,5 +21,8 @@ sysdep_routines += \
|
||||
memmove-unaligned \
|
||||
memmove-lsx \
|
||||
memmove-lasx \
|
||||
+ rawmemchr-aligned \
|
||||
+ rawmemchr-lsx \
|
||||
+ rawmemchr-lasx \
|
||||
# sysdep_routines
|
||||
endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
|
||||
index c8ba87bd..3db9af14 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
|
||||
@@ -94,5 +94,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_aligned)
|
||||
)
|
||||
|
||||
+ IFUNC_IMPL (i, name, rawmemchr,
|
||||
+#if !defined __loongarch_soft_float
|
||||
+ IFUNC_IMPL_ADD (array, i, rawmemchr, SUPPORT_LASX, __rawmemchr_lasx)
|
||||
+ IFUNC_IMPL_ADD (array, i, rawmemchr, SUPPORT_LSX, __rawmemchr_lsx)
|
||||
+#endif
|
||||
+ IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_aligned)
|
||||
+ )
|
||||
+
|
||||
return i;
|
||||
}
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-rawmemchr.h b/sysdeps/loongarch/lp64/multiarch/ifunc-rawmemchr.h
|
||||
new file mode 100644
|
||||
index 00000000..a7bb4cf9
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-rawmemchr.h
|
||||
@@ -0,0 +1,40 @@
|
||||
+/* Common definition for rawmemchr ifunc selections.
|
||||
+ All versions must be listed in ifunc-impl-list.c.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <ldsodefs.h>
|
||||
+#include <ifunc-init.h>
|
||||
+
|
||||
+#if !defined __loongarch_soft_float
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lasx) attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden;
|
||||
+#endif
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden;
|
||||
+
|
||||
+static inline void *
|
||||
+IFUNC_SELECTOR (void)
|
||||
+{
|
||||
+#if !defined __loongarch_soft_float
|
||||
+ if (SUPPORT_LASX)
|
||||
+ return OPTIMIZE (lasx);
|
||||
+ else if (SUPPORT_LSX)
|
||||
+ return OPTIMIZE (lsx);
|
||||
+ else
|
||||
+#endif
|
||||
+ return OPTIMIZE (aligned);
|
||||
+}
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/rawmemchr-aligned.S b/sysdeps/loongarch/lp64/multiarch/rawmemchr-aligned.S
|
||||
new file mode 100644
|
||||
index 00000000..9c7155ae
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/rawmemchr-aligned.S
|
||||
@@ -0,0 +1,124 @@
|
||||
+/* Optimized rawmemchr implementation using basic LoongArch instructions.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library. If not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdep.h>
|
||||
+#include <sys/regdef.h>
|
||||
+#include <sys/asm.h>
|
||||
+
|
||||
+#if IS_IN (libc)
|
||||
+# define RAWMEMCHR_NAME __rawmemchr_aligned
|
||||
+#else
|
||||
+# define RAWMEMCHR_NAME __rawmemchr
|
||||
+#endif
|
||||
+
|
||||
+LEAF(RAWMEMCHR_NAME, 6)
|
||||
+ andi t1, a0, 0x7
|
||||
+ bstrins.d a0, zero, 2, 0
|
||||
+ lu12i.w a2, 0x01010
|
||||
+ bstrins.d a1, a1, 15, 8
|
||||
+
|
||||
+ ld.d t0, a0, 0
|
||||
+ slli.d t1, t1, 3
|
||||
+ ori a2, a2, 0x101
|
||||
+ bstrins.d a1, a1, 31, 16
|
||||
+
|
||||
+ li.w t8, -1
|
||||
+ bstrins.d a1, a1, 63, 32
|
||||
+ bstrins.d a2, a2, 63, 32
|
||||
+ sll.d t2, t8, t1
|
||||
+
|
||||
+ sll.d t3, a1, t1
|
||||
+ orn t0, t0, t2
|
||||
+ slli.d a3, a2, 7
|
||||
+ beqz a1, L(find_zero)
|
||||
+
|
||||
+ xor t0, t0, t3
|
||||
+ sub.d t1, t0, a2
|
||||
+ andn t2, a3, t0
|
||||
+ and t3, t1, t2
|
||||
+
|
||||
+ bnez t3, L(count_pos)
|
||||
+ addi.d a0, a0, 8
|
||||
+
|
||||
+L(loop):
|
||||
+ ld.d t0, a0, 0
|
||||
+ xor t0, t0, a1
|
||||
+
|
||||
+ sub.d t1, t0, a2
|
||||
+ andn t2, a3, t0
|
||||
+ and t3, t1, t2
|
||||
+ bnez t3, L(count_pos)
|
||||
+
|
||||
+ ld.d t0, a0, 8
|
||||
+ addi.d a0, a0, 16
|
||||
+ xor t0, t0, a1
|
||||
+ sub.d t1, t0, a2
|
||||
+
|
||||
+ andn t2, a3, t0
|
||||
+ and t3, t1, t2
|
||||
+ beqz t3, L(loop)
|
||||
+ addi.d a0, a0, -8
|
||||
+L(count_pos):
|
||||
+ ctz.d t0, t3
|
||||
+ srli.d t0, t0, 3
|
||||
+ add.d a0, a0, t0
|
||||
+ jr ra
|
||||
+
|
||||
+L(loop_7bit):
|
||||
+ ld.d t0, a0, 0
|
||||
+L(find_zero):
|
||||
+ sub.d t1, t0, a2
|
||||
+ and t2, t1, a3
|
||||
+ bnez t2, L(more_check)
|
||||
+
|
||||
+ ld.d t0, a0, 8
|
||||
+ addi.d a0, a0, 16
|
||||
+ sub.d t1, t0, a2
|
||||
+ and t2, t1, a3
|
||||
+
|
||||
+ beqz t2, L(loop_7bit)
|
||||
+ addi.d a0, a0, -8
|
||||
+
|
||||
+L(more_check):
|
||||
+ andn t2, a3, t0
|
||||
+ and t3, t1, t2
|
||||
+ bnez t3, L(count_pos)
|
||||
+ addi.d a0, a0, 8
|
||||
+
|
||||
+L(loop_8bit):
|
||||
+ ld.d t0, a0, 0
|
||||
+
|
||||
+ sub.d t1, t0, a2
|
||||
+ andn t2, a3, t0
|
||||
+ and t3, t1, t2
|
||||
+ bnez t3, L(count_pos)
|
||||
+
|
||||
+ ld.d t0, a0, 8
|
||||
+ addi.d a0, a0, 16
|
||||
+ sub.d t1, t0, a2
|
||||
+
|
||||
+ andn t2, a3, t0
|
||||
+ and t3, t1, t2
|
||||
+ beqz t3, L(loop_8bit)
|
||||
+
|
||||
+ addi.d a0, a0, -8
|
||||
+ b L(count_pos)
|
||||
+
|
||||
+END(RAWMEMCHR_NAME)
|
||||
+
|
||||
+libc_hidden_builtin_def (__rawmemchr)
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S
|
||||
new file mode 100644
|
||||
index 00000000..be2eb59d
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S
|
||||
@@ -0,0 +1,82 @@
|
||||
+/* Optimized rawmemchr implementation using LoongArch LASX instructions.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library. If not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdep.h>
|
||||
+#include <sys/asm.h>
|
||||
+#include <sys/regdef.h>
|
||||
+
|
||||
+#if IS_IN (libc) && !defined __loongarch_soft_float
|
||||
+
|
||||
+# define RAWMEMCHR __rawmemchr_lasx
|
||||
+
|
||||
+LEAF(RAWMEMCHR, 6)
|
||||
+ move a2, a0
|
||||
+ bstrins.d a0, zero, 5, 0
|
||||
+ xvld xr0, a0, 0
|
||||
+ xvld xr1, a0, 32
|
||||
+
|
||||
+ xvreplgr2vr.b xr2, a1
|
||||
+ xvseq.b xr0, xr0, xr2
|
||||
+ xvseq.b xr1, xr1, xr2
|
||||
+ xvmsknz.b xr0, xr0
|
||||
+
|
||||
+ xvmsknz.b xr1, xr1
|
||||
+ xvpickve.w xr3, xr0, 4
|
||||
+ xvpickve.w xr4, xr1, 4
|
||||
+ vilvl.h vr0, vr3, vr0
|
||||
+
|
||||
+ vilvl.h vr1, vr4, vr1
|
||||
+ vilvl.w vr0, vr1, vr0
|
||||
+ movfr2gr.d t0, fa0
|
||||
+ sra.d t0, t0, a2
|
||||
+
|
||||
+
|
||||
+ beqz t0, L(loop)
|
||||
+ ctz.d t0, t0
|
||||
+ add.d a0, a2, t0
|
||||
+ jr ra
|
||||
+
|
||||
+L(loop):
|
||||
+ xvld xr0, a0, 64
|
||||
+ xvld xr1, a0, 96
|
||||
+ addi.d a0, a0, 64
|
||||
+ xvseq.b xr0, xr0, xr2
|
||||
+
|
||||
+ xvseq.b xr1, xr1, xr2
|
||||
+ xvmax.bu xr3, xr0, xr1
|
||||
+ xvseteqz.v fcc0, xr3
|
||||
+ bcnez fcc0, L(loop)
|
||||
+
|
||||
+ xvmsknz.b xr0, xr0
|
||||
+ xvmsknz.b xr1, xr1
|
||||
+ xvpickve.w xr3, xr0, 4
|
||||
+ xvpickve.w xr4, xr1, 4
|
||||
+
|
||||
+
|
||||
+ vilvl.h vr0, vr3, vr0
|
||||
+ vilvl.h vr1, vr4, vr1
|
||||
+ vilvl.w vr0, vr1, vr0
|
||||
+ movfr2gr.d t0, fa0
|
||||
+
|
||||
+ ctz.d t0, t0
|
||||
+ add.d a0, a0, t0
|
||||
+ jr ra
|
||||
+END(RAWMEMCHR)
|
||||
+
|
||||
+libc_hidden_builtin_def (RAWMEMCHR)
|
||||
+#endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S
|
||||
new file mode 100644
|
||||
index 00000000..2f6fe024
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S
|
||||
@@ -0,0 +1,71 @@
|
||||
+/* Optimized rawmemchr implementation using LoongArch LSX instructions.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library. If not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdep.h>
|
||||
+#include <sys/regdef.h>
|
||||
+#include <sys/asm.h>
|
||||
+
|
||||
+#if IS_IN (libc) && !defined __loongarch_soft_float
|
||||
+
|
||||
+# define RAWMEMCHR __rawmemchr_lsx
|
||||
+
|
||||
+LEAF(RAWMEMCHR, 6)
|
||||
+ move a2, a0
|
||||
+ bstrins.d a0, zero, 4, 0
|
||||
+ vld vr0, a0, 0
|
||||
+ vld vr1, a0, 16
|
||||
+
|
||||
+ vreplgr2vr.b vr2, a1
|
||||
+ vseq.b vr0, vr0, vr2
|
||||
+ vseq.b vr1, vr1, vr2
|
||||
+ vmsknz.b vr0, vr0
|
||||
+
|
||||
+ vmsknz.b vr1, vr1
|
||||
+ vilvl.h vr0, vr1, vr0
|
||||
+ movfr2gr.s t0, fa0
|
||||
+ sra.w t0, t0, a2
|
||||
+
|
||||
+ beqz t0, L(loop)
|
||||
+ ctz.w t0, t0
|
||||
+ add.d a0, a2, t0
|
||||
+ jr ra
|
||||
+
|
||||
+
|
||||
+L(loop):
|
||||
+ vld vr0, a0, 32
|
||||
+ vld vr1, a0, 48
|
||||
+ addi.d a0, a0, 32
|
||||
+ vseq.b vr0, vr0, vr2
|
||||
+
|
||||
+ vseq.b vr1, vr1, vr2
|
||||
+ vmax.bu vr3, vr0, vr1
|
||||
+ vseteqz.v fcc0, vr3
|
||||
+ bcnez fcc0, L(loop)
|
||||
+
|
||||
+ vmsknz.b vr0, vr0
|
||||
+ vmsknz.b vr1, vr1
|
||||
+ vilvl.h vr0, vr1, vr0
|
||||
+ movfr2gr.s t0, fa0
|
||||
+
|
||||
+ ctz.w t0, t0
|
||||
+ add.d a0, a0, t0
|
||||
+ jr ra
|
||||
+END(RAWMEMCHR)
|
||||
+
|
||||
+libc_hidden_builtin_def (RAWMEMCHR)
|
||||
+#endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/rawmemchr.c b/sysdeps/loongarch/lp64/multiarch/rawmemchr.c
|
||||
new file mode 100644
|
||||
index 00000000..89c7ffff
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/rawmemchr.c
|
||||
@@ -0,0 +1,37 @@
|
||||
+/* Multiple versions of rawmemchr.
|
||||
+ All versions must be listed in ifunc-impl-list.c.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#if IS_IN (libc)
|
||||
+# define rawmemchr __redirect_rawmemchr
|
||||
+# define __rawmemchr __redirect___rawmemchr
|
||||
+# include <string.h>
|
||||
+# undef rawmemchr
|
||||
+# undef __rawmemchr
|
||||
+
|
||||
+# define SYMBOL_NAME rawmemchr
|
||||
+# include "ifunc-rawmemchr.h"
|
||||
+
|
||||
+libc_ifunc_redirected (__redirect_rawmemchr, __rawmemchr,
|
||||
+ IFUNC_SELECTOR ());
|
||||
+weak_alias (__rawmemchr, rawmemchr)
|
||||
+# ifdef SHARED
|
||||
+__hidden_ver1 (__rawmemchr, __GI___rawmemchr, __redirect___rawmemchr)
|
||||
+ __attribute__((visibility ("hidden")));
|
||||
+# endif
|
||||
+#endif
|
||||
--
|
||||
2.33.0
|
||||
|
499
LoongArch-Add-ifunc-support-for-strcmp-aligned-lsx.patch
Normal file
499
LoongArch-Add-ifunc-support-for-strcmp-aligned-lsx.patch
Normal file
|
@ -0,0 +1,499 @@
|
|||
From e258cfcf92f5e31e902fa045b41652f00fcf2521 Mon Sep 17 00:00:00 2001
|
||||
From: dengjianbo <dengjianbo@loongson.cn>
|
||||
Date: Thu, 24 Aug 2023 16:50:18 +0800
|
||||
Subject: [PATCH 09/29] LoongArch: Add ifunc support for strcmp{aligned, lsx}
|
||||
|
||||
Based on the glibc microbenchmark, strcmp-aligned implementation could
|
||||
reduce the runtime 0%-10% for aligned comparison, 10%-20% for unaligned
|
||||
comparison, strcmp-lsx implemenation could reduce the runtime 0%-50%.
|
||||
|
||||
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
|
||||
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
|
||||
---
|
||||
sysdeps/loongarch/lp64/multiarch/Makefile | 2 +
|
||||
.../lp64/multiarch/ifunc-impl-list.c | 7 +
|
||||
.../loongarch/lp64/multiarch/ifunc-strcmp.h | 38 ++++
|
||||
.../loongarch/lp64/multiarch/strcmp-aligned.S | 179 ++++++++++++++++++
|
||||
sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S | 165 ++++++++++++++++
|
||||
sysdeps/loongarch/lp64/multiarch/strcmp.c | 35 ++++
|
||||
6 files changed, 426 insertions(+)
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-strcmp.h
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/strcmp.c
|
||||
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile
|
||||
index c4dd3143..d5a500de 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/Makefile
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/Makefile
|
||||
@@ -12,6 +12,8 @@ sysdep_routines += \
|
||||
strchrnul-aligned \
|
||||
strchrnul-lsx \
|
||||
strchrnul-lasx \
|
||||
+ strcmp-aligned \
|
||||
+ strcmp-lsx \
|
||||
memcpy-aligned \
|
||||
memcpy-unaligned \
|
||||
memmove-unaligned \
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
|
||||
index 7cec0b77..9183b7da 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
|
||||
@@ -62,6 +62,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_aligned)
|
||||
)
|
||||
|
||||
+ IFUNC_IMPL (i, name, strcmp,
|
||||
+#if !defined __loongarch_soft_float
|
||||
+ IFUNC_IMPL_ADD (array, i, strcmp, SUPPORT_LSX, __strcmp_lsx)
|
||||
+#endif
|
||||
+ IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_aligned)
|
||||
+ )
|
||||
+
|
||||
IFUNC_IMPL (i, name, memcpy,
|
||||
#if !defined __loongarch_soft_float
|
||||
IFUNC_IMPL_ADD (array, i, memcpy, SUPPORT_LASX, __memcpy_lasx)
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-strcmp.h b/sysdeps/loongarch/lp64/multiarch/ifunc-strcmp.h
|
||||
new file mode 100644
|
||||
index 00000000..ca26352b
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-strcmp.h
|
||||
@@ -0,0 +1,38 @@
|
||||
+/* Common definition for strcmp ifunc selection.
|
||||
+ All versions must be listed in ifunc-impl-list.c.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <ldsodefs.h>
|
||||
+#include <ifunc-init.h>
|
||||
+
|
||||
+#if !defined __loongarch_soft_float
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden;
|
||||
+#endif
|
||||
+
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden;
|
||||
+
|
||||
+static inline void *
|
||||
+IFUNC_SELECTOR (void)
|
||||
+{
|
||||
+#if !defined __loongarch_soft_float
|
||||
+ if (SUPPORT_LSX)
|
||||
+ return OPTIMIZE (lsx);
|
||||
+ else
|
||||
+#endif
|
||||
+ return OPTIMIZE (aligned);
|
||||
+}
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S b/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S
|
||||
new file mode 100644
|
||||
index 00000000..f5f4f336
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S
|
||||
@@ -0,0 +1,179 @@
|
||||
+/* Optimized strcmp implementation using basic Loongarch instructions.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library. If not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdep.h>
|
||||
+#include <sys/regdef.h>
|
||||
+#include <sys/asm.h>
|
||||
+
|
||||
+#if IS_IN (libc)
|
||||
+# define STRCMP_NAME __strcmp_aligned
|
||||
+#else
|
||||
+# define STRCMP_NAME strcmp
|
||||
+#endif
|
||||
+
|
||||
+LEAF(STRCMP_NAME, 6)
|
||||
+ lu12i.w a4, 0x01010
|
||||
+ andi a2, a0, 0x7
|
||||
+ ori a4, a4, 0x101
|
||||
+ andi a3, a1, 0x7
|
||||
+
|
||||
+ bstrins.d a4, a4, 63, 32
|
||||
+ li.d t7, -1
|
||||
+ li.d t8, 8
|
||||
+ slli.d a5, a4, 7
|
||||
+
|
||||
+ bne a2, a3, L(unaligned)
|
||||
+ bstrins.d a0, zero, 2, 0
|
||||
+ bstrins.d a1, zero, 2, 0
|
||||
+ ld.d t0, a0, 0
|
||||
+
|
||||
+ ld.d t1, a1, 0
|
||||
+ slli.d t3, a2, 3
|
||||
+ sll.d t2, t7, t3
|
||||
+ orn t0, t0, t2
|
||||
+
|
||||
+
|
||||
+ orn t1, t1, t2
|
||||
+ sub.d t2, t0, a4
|
||||
+ andn t3, a5, t0
|
||||
+ and t2, t2, t3
|
||||
+
|
||||
+ bne t0, t1, L(al_end)
|
||||
+L(al_loop):
|
||||
+ bnez t2, L(ret0)
|
||||
+ ldx.d t0, a0, t8
|
||||
+ ldx.d t1, a1, t8
|
||||
+
|
||||
+ addi.d t8, t8, 8
|
||||
+ sub.d t2, t0, a4
|
||||
+ andn t3, a5, t0
|
||||
+ and t2, t2, t3
|
||||
+
|
||||
+ beq t0, t1, L(al_loop)
|
||||
+L(al_end):
|
||||
+ xor t3, t0, t1
|
||||
+ or t2, t2, t3
|
||||
+ ctz.d t3, t2
|
||||
+
|
||||
+
|
||||
+ bstrins.d t3, zero, 2, 0
|
||||
+ srl.d t0, t0, t3
|
||||
+ srl.d t1, t1, t3
|
||||
+ andi t0, t0, 0xff
|
||||
+
|
||||
+ andi t1, t1, 0xff
|
||||
+ sub.d a0, t0, t1
|
||||
+ jr ra
|
||||
+ nop
|
||||
+
|
||||
+L(ret0):
|
||||
+ move a0, zero
|
||||
+ jr ra
|
||||
+ nop
|
||||
+ nop
|
||||
+
|
||||
+L(unaligned):
|
||||
+ slt a6, a3, a2
|
||||
+ xor t0, a0, a1
|
||||
+ maskeqz t0, t0, a6
|
||||
+ xor a0, a0, t0
|
||||
+
|
||||
+
|
||||
+ xor a1, a1, t0
|
||||
+ andi a2, a0, 0x7
|
||||
+ andi a3, a1, 0x7
|
||||
+ bstrins.d a0, zero, 2, 0
|
||||
+
|
||||
+ bstrins.d a1, zero, 2, 0
|
||||
+ ld.d t4, a0, 0
|
||||
+ ld.d t1, a1, 0
|
||||
+ slli.d a2, a2, 3
|
||||
+
|
||||
+ slli.d a3, a3, 3
|
||||
+ srl.d t0, t4, a2
|
||||
+ srl.d t1, t1, a3
|
||||
+ srl.d t5, t7, a3
|
||||
+
|
||||
+ orn t0, t0, t5
|
||||
+ orn t1, t1, t5
|
||||
+ bne t0, t1, L(not_equal)
|
||||
+ sll.d t5, t7, a2
|
||||
+
|
||||
+
|
||||
+ sub.d a3, a2, a3
|
||||
+ orn t4, t4, t5
|
||||
+ sub.d a2, zero, a3
|
||||
+ sub.d t2, t4, a4
|
||||
+
|
||||
+ andn t3, a5, t4
|
||||
+ and t2, t2, t3
|
||||
+ bnez t2, L(find_zero)
|
||||
+L(un_loop):
|
||||
+ srl.d t5, t4, a3
|
||||
+
|
||||
+ ldx.d t4, a0, t8
|
||||
+ ldx.d t1, a1, t8
|
||||
+ addi.d t8, t8, 8
|
||||
+ sll.d t0, t4, a2
|
||||
+
|
||||
+ or t0, t0, t5
|
||||
+ bne t0, t1, L(not_equal)
|
||||
+ sub.d t2, t4, a4
|
||||
+ andn t3, a5, t4
|
||||
+
|
||||
+
|
||||
+ and t2, t2, t3
|
||||
+ beqz t2, L(un_loop)
|
||||
+L(find_zero):
|
||||
+ sub.d t2, t0, a4
|
||||
+ andn t3, a5, t0
|
||||
+
|
||||
+ and t2, t2, t3
|
||||
+ bnez t2, L(ret0)
|
||||
+ ldx.d t1, a1, t8
|
||||
+ srl.d t0, t4, a3
|
||||
+
|
||||
+L(not_equal):
|
||||
+ sub.d t2, t0, a4
|
||||
+ andn t3, a5, t0
|
||||
+ and t2, t2, t3
|
||||
+ xor t3, t0, t1
|
||||
+
|
||||
+ or t2, t2, t3
|
||||
+L(un_end):
|
||||
+ ctz.d t3, t2
|
||||
+ bstrins.d t3, zero, 2, 0
|
||||
+ srl.d t0, t0, t3
|
||||
+
|
||||
+
|
||||
+ srl.d t1, t1, t3
|
||||
+ andi t0, t0, 0xff
|
||||
+ andi t1, t1, 0xff
|
||||
+ sub.d t2, t0, t1
|
||||
+
|
||||
+
|
||||
+ sub.d t3, t1, t0
|
||||
+ masknez t0, t2, a6
|
||||
+ maskeqz t1, t3, a6
|
||||
+ or a0, t0, t1
|
||||
+
|
||||
+ jr ra
|
||||
+END(STRCMP_NAME)
|
||||
+
|
||||
+libc_hidden_builtin_def (STRCMP_NAME)
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S
|
||||
new file mode 100644
|
||||
index 00000000..2e177a38
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S
|
||||
@@ -0,0 +1,165 @@
|
||||
+/* Optimized strcmp implementation using Loongarch LSX instructions.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library. If not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdep.h>
|
||||
+#include <sys/regdef.h>
|
||||
+#include <sys/asm.h>
|
||||
+
|
||||
+#if IS_IN (libc) && !defined __loongarch_soft_float
|
||||
+
|
||||
+# define STRCMP __strcmp_lsx
|
||||
+
|
||||
+LEAF(STRCMP, 6)
|
||||
+ pcalau12i t0, %pc_hi20(L(INDEX))
|
||||
+ andi a2, a0, 0xf
|
||||
+ vld vr2, t0, %pc_lo12(L(INDEX))
|
||||
+ andi a3, a1, 0xf
|
||||
+
|
||||
+ bne a2, a3, L(unaligned)
|
||||
+ bstrins.d a0, zero, 3, 0
|
||||
+ bstrins.d a1, zero, 3, 0
|
||||
+ vld vr0, a0, 0
|
||||
+
|
||||
+ vld vr1, a1, 0
|
||||
+ vreplgr2vr.b vr3, a2
|
||||
+ vslt.b vr2, vr2, vr3
|
||||
+ vseq.b vr3, vr0, vr1
|
||||
+
|
||||
+ vmin.bu vr3, vr0, vr3
|
||||
+ vor.v vr3, vr3, vr2
|
||||
+ vsetanyeqz.b fcc0, vr3
|
||||
+ bcnez fcc0, L(al_out)
|
||||
+
|
||||
+
|
||||
+L(al_loop):
|
||||
+ vld vr0, a0, 16
|
||||
+ vld vr1, a1, 16
|
||||
+ addi.d a0, a0, 16
|
||||
+ addi.d a1, a1, 16
|
||||
+
|
||||
+ vseq.b vr3, vr0, vr1
|
||||
+ vmin.bu vr3, vr0, vr3
|
||||
+ vsetanyeqz.b fcc0, vr3
|
||||
+ bceqz fcc0, L(al_loop)
|
||||
+
|
||||
+L(al_out):
|
||||
+ vseqi.b vr3, vr3, 0
|
||||
+ vfrstpi.b vr3, vr3, 0
|
||||
+ vshuf.b vr0, vr0, vr0, vr3
|
||||
+ vshuf.b vr1, vr1, vr1, vr3
|
||||
+
|
||||
+ vpickve2gr.bu t0, vr0, 0
|
||||
+ vpickve2gr.bu t1, vr1, 0
|
||||
+ sub.d a0, t0, t1
|
||||
+ jr ra
|
||||
+
|
||||
+
|
||||
+L(unaligned):
|
||||
+ slt a4, a3, a2
|
||||
+ xor t0, a0, a1
|
||||
+ maskeqz t0, t0, a4
|
||||
+ xor a0, a0, t0
|
||||
+
|
||||
+ xor a1, a1, t0
|
||||
+ andi a2, a0, 0xf
|
||||
+ andi a3, a1, 0xf
|
||||
+ bstrins.d a0, zero, 3, 0
|
||||
+
|
||||
+ bstrins.d a1, zero, 3, 0
|
||||
+ vld vr3, a0, 0
|
||||
+ vld vr1, a1, 0
|
||||
+ vreplgr2vr.b vr4, a2
|
||||
+
|
||||
+ vreplgr2vr.b vr5, a3
|
||||
+ vslt.b vr7, vr2, vr5
|
||||
+ vsub.b vr5, vr5, vr4
|
||||
+ vaddi.bu vr6, vr2, 16
|
||||
+
|
||||
+
|
||||
+ vsub.b vr6, vr6, vr5
|
||||
+ vshuf.b vr0, vr3, vr3, vr6
|
||||
+ vor.v vr0, vr0, vr7
|
||||
+ vor.v vr1, vr1, vr7
|
||||
+
|
||||
+ vseq.b vr5, vr0, vr1
|
||||
+ vsetanyeqz.b fcc0, vr5
|
||||
+ bcnez fcc0, L(not_equal)
|
||||
+ vslt.b vr4, vr2, vr4
|
||||
+
|
||||
+ vor.v vr0, vr3, vr4
|
||||
+ vsetanyeqz.b fcc0, vr0
|
||||
+ bcnez fcc0, L(find_zero)
|
||||
+ nop
|
||||
+
|
||||
+L(un_loop):
|
||||
+ vld vr3, a0, 16
|
||||
+ vld vr1, a1, 16
|
||||
+ addi.d a0, a0, 16
|
||||
+ addi.d a1, a1, 16
|
||||
+
|
||||
+
|
||||
+ vshuf.b vr0, vr3, vr0, vr6
|
||||
+ vseq.b vr5, vr0, vr1
|
||||
+ vsetanyeqz.b fcc0, vr5
|
||||
+ bcnez fcc0, L(not_equal)
|
||||
+
|
||||
+ vsetanyeqz.b fcc0, vr3
|
||||
+ vor.v vr0, vr3, vr3
|
||||
+ bceqz fcc0, L(un_loop)
|
||||
+L(find_zero):
|
||||
+ vmin.bu vr5, vr1, vr5
|
||||
+
|
||||
+ vsetanyeqz.b fcc0, vr5
|
||||
+ bcnez fcc0, L(ret0)
|
||||
+ vld vr1, a1, 16
|
||||
+ vshuf.b vr0, vr3, vr3, vr6
|
||||
+
|
||||
+ vseq.b vr5, vr0, vr1
|
||||
+L(not_equal):
|
||||
+ vmin.bu vr5, vr0, vr5
|
||||
+L(un_end):
|
||||
+ vseqi.b vr5, vr5, 0
|
||||
+ vfrstpi.b vr5, vr5, 0
|
||||
+
|
||||
+
|
||||
+ vshuf.b vr0, vr0, vr0, vr5
|
||||
+ vshuf.b vr1, vr1, vr1, vr5
|
||||
+ vpickve2gr.bu t0, vr0, 0
|
||||
+ vpickve2gr.bu t1, vr1, 0
|
||||
+
|
||||
+ sub.d t3, t0, t1
|
||||
+ sub.d t4, t1, t0
|
||||
+ masknez t0, t3, a4
|
||||
+ maskeqz t1, t4, a4
|
||||
+
|
||||
+ or a0, t0, t1
|
||||
+ jr ra
|
||||
+L(ret0):
|
||||
+ move a0, zero
|
||||
+ jr ra
|
||||
+END(STRCMP)
|
||||
+
|
||||
+ .section .rodata.cst16,"M",@progbits,16
|
||||
+ .align 4
|
||||
+L(INDEX):
|
||||
+ .dword 0x0706050403020100
|
||||
+ .dword 0x0f0e0d0c0b0a0908
|
||||
+
|
||||
+libc_hidden_builtin_def (STRCMP)
|
||||
+#endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strcmp.c b/sysdeps/loongarch/lp64/multiarch/strcmp.c
|
||||
new file mode 100644
|
||||
index 00000000..6f249c0b
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strcmp.c
|
||||
@@ -0,0 +1,35 @@
|
||||
+/* Multiple versions of strcmp.
|
||||
+ All versions must be listed in ifunc-impl-list.c.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <http://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+/* Define multiple versions only for the definition in libc. */
|
||||
+#if IS_IN (libc)
|
||||
+# define strcmp __redirect_strcmp
|
||||
+# include <string.h>
|
||||
+# undef strcmp
|
||||
+
|
||||
+# define SYMBOL_NAME strcmp
|
||||
+# include "ifunc-strcmp.h"
|
||||
+
|
||||
+libc_ifunc_redirected (__redirect_strcmp, strcmp, IFUNC_SELECTOR ());
|
||||
+
|
||||
+# ifdef SHARED
|
||||
+__hidden_ver1 (strcmp, __GI_strcmp, __redirect_strcmp)
|
||||
+ __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strcmp);
|
||||
+# endif
|
||||
+#endif
|
||||
--
|
||||
2.33.0
|
||||
|
1099
LoongArch-Add-ifunc-support-for-strcpy-stpcpy-aligne.patch
Normal file
1099
LoongArch-Add-ifunc-support-for-strcpy-stpcpy-aligne.patch
Normal file
File diff suppressed because it is too large
Load diff
583
LoongArch-Add-ifunc-support-for-strncmp-aligned-lsx.patch
Normal file
583
LoongArch-Add-ifunc-support-for-strncmp-aligned-lsx.patch
Normal file
|
@ -0,0 +1,583 @@
|
|||
From 6f03da2d7ef218c0f78375cf706dada59c3fee63 Mon Sep 17 00:00:00 2001
|
||||
From: dengjianbo <dengjianbo@loongson.cn>
|
||||
Date: Thu, 24 Aug 2023 16:50:19 +0800
|
||||
Subject: [PATCH 10/29] LoongArch: Add ifunc support for strncmp{aligned, lsx}
|
||||
|
||||
Based on the glibc microbenchmark, only a few short inputs with this
|
||||
strncmp-aligned and strncmp-lsx implementation experience performance
|
||||
degradation, overall, strncmp-aligned could reduce the runtime 0%-10%
|
||||
for aligned comparision, 10%-25% for unaligend comparision, strncmp-lsx
|
||||
could reduce the runtime about 0%-60%.
|
||||
|
||||
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
|
||||
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
|
||||
---
|
||||
sysdeps/loongarch/lp64/multiarch/Makefile | 2 +
|
||||
.../lp64/multiarch/ifunc-impl-list.c | 7 +
|
||||
.../loongarch/lp64/multiarch/ifunc-strncmp.h | 38 +++
|
||||
.../lp64/multiarch/strncmp-aligned.S | 218 ++++++++++++++++++
|
||||
.../loongarch/lp64/multiarch/strncmp-lsx.S | 208 +++++++++++++++++
|
||||
sysdeps/loongarch/lp64/multiarch/strncmp.c | 35 +++
|
||||
6 files changed, 508 insertions(+)
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-strncmp.h
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/strncmp.c
|
||||
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile
|
||||
index d5a500de..5d7ae7ae 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/Makefile
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/Makefile
|
||||
@@ -14,6 +14,8 @@ sysdep_routines += \
|
||||
strchrnul-lasx \
|
||||
strcmp-aligned \
|
||||
strcmp-lsx \
|
||||
+ strncmp-aligned \
|
||||
+ strncmp-lsx \
|
||||
memcpy-aligned \
|
||||
memcpy-unaligned \
|
||||
memmove-unaligned \
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
|
||||
index 9183b7da..c8ba87bd 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
|
||||
@@ -69,6 +69,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_aligned)
|
||||
)
|
||||
|
||||
+ IFUNC_IMPL (i, name, strncmp,
|
||||
+#if !defined __loongarch_soft_float
|
||||
+ IFUNC_IMPL_ADD (array, i, strncmp, SUPPORT_LSX, __strncmp_lsx)
|
||||
+#endif
|
||||
+ IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_aligned)
|
||||
+ )
|
||||
+
|
||||
IFUNC_IMPL (i, name, memcpy,
|
||||
#if !defined __loongarch_soft_float
|
||||
IFUNC_IMPL_ADD (array, i, memcpy, SUPPORT_LASX, __memcpy_lasx)
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-strncmp.h b/sysdeps/loongarch/lp64/multiarch/ifunc-strncmp.h
|
||||
new file mode 100644
|
||||
index 00000000..1a7dc36b
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-strncmp.h
|
||||
@@ -0,0 +1,38 @@
|
||||
+/* Common definition for strncmp ifunc selection.
|
||||
+ All versions must be listed in ifunc-impl-list.c.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <ldsodefs.h>
|
||||
+#include <ifunc-init.h>
|
||||
+
|
||||
+#if !defined __loongarch_soft_float
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden;
|
||||
+#endif
|
||||
+
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden;
|
||||
+
|
||||
+static inline void *
|
||||
+IFUNC_SELECTOR (void)
|
||||
+{
|
||||
+#if !defined __loongarch_soft_float
|
||||
+ if (SUPPORT_LSX)
|
||||
+ return OPTIMIZE (lsx);
|
||||
+ else
|
||||
+#endif
|
||||
+ return OPTIMIZE (aligned);
|
||||
+}
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S b/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S
|
||||
new file mode 100644
|
||||
index 00000000..e2687fa7
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S
|
||||
@@ -0,0 +1,218 @@
|
||||
+/* Optimized strncmp implementation using basic Loongarch instructions.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library. If not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdep.h>
|
||||
+#include <sys/regdef.h>
|
||||
+#include <sys/asm.h>
|
||||
+
|
||||
+#if IS_IN (libc)
|
||||
+# define STRNCMP __strncmp_aligned
|
||||
+#else
|
||||
+# define STRNCMP strncmp
|
||||
+#endif
|
||||
+
|
||||
+LEAF(STRNCMP, 6)
|
||||
+ beqz a2, L(ret0)
|
||||
+ lu12i.w a5, 0x01010
|
||||
+ andi a3, a0, 0x7
|
||||
+ ori a5, a5, 0x101
|
||||
+
|
||||
+ andi a4, a1, 0x7
|
||||
+ bstrins.d a5, a5, 63, 32
|
||||
+ li.d t7, -1
|
||||
+ li.d t8, 8
|
||||
+
|
||||
+ addi.d a2, a2, -1
|
||||
+ slli.d a6, a5, 7
|
||||
+ bne a3, a4, L(unaligned)
|
||||
+ bstrins.d a0, zero, 2, 0
|
||||
+
|
||||
+ bstrins.d a1, zero, 2, 0
|
||||
+ ld.d t0, a0, 0
|
||||
+ ld.d t1, a1, 0
|
||||
+ slli.d t2, a3, 3
|
||||
+
|
||||
+
|
||||
+ sub.d t5, t8, a3
|
||||
+ srl.d t3, t7, t2
|
||||
+ srl.d t0, t0, t2
|
||||
+ srl.d t1, t1, t2
|
||||
+
|
||||
+ orn t0, t0, t3
|
||||
+ orn t1, t1, t3
|
||||
+ sub.d t2, t0, a5
|
||||
+ andn t3, a6, t0
|
||||
+
|
||||
+ and t2, t2, t3
|
||||
+ bne t0, t1, L(al_end)
|
||||
+ sltu t4, a2, t5
|
||||
+ sub.d a2, a2, t5
|
||||
+
|
||||
+L(al_loop):
|
||||
+ or t4, t2, t4
|
||||
+ bnez t4, L(ret0)
|
||||
+ ldx.d t0, a0, t8
|
||||
+ ldx.d t1, a1, t8
|
||||
+
|
||||
+
|
||||
+ addi.d t8, t8, 8
|
||||
+ sltui t4, a2, 8
|
||||
+ addi.d a2, a2, -8
|
||||
+ sub.d t2, t0, a5
|
||||
+
|
||||
+ andn t3, a6, t0
|
||||
+ and t2, t2, t3
|
||||
+ beq t0, t1, L(al_loop)
|
||||
+ addi.d a2, a2, 8
|
||||
+
|
||||
+L(al_end):
|
||||
+ xor t3, t0, t1
|
||||
+ or t2, t2, t3
|
||||
+ ctz.d t2, t2
|
||||
+ srli.d t4, t2, 3
|
||||
+
|
||||
+ bstrins.d t2, zero, 2, 0
|
||||
+ srl.d t0, t0, t2
|
||||
+ srl.d t1, t1, t2
|
||||
+ andi t0, t0, 0xff
|
||||
+
|
||||
+
|
||||
+ andi t1, t1, 0xff
|
||||
+ sltu t2, a2, t4
|
||||
+ sub.d a0, t0, t1
|
||||
+ masknez a0, a0, t2
|
||||
+
|
||||
+ jr ra
|
||||
+L(ret0):
|
||||
+ move a0, zero
|
||||
+ jr ra
|
||||
+ nop
|
||||
+
|
||||
+L(unaligned):
|
||||
+ slt a7, a4, a3
|
||||
+ xor t0, a0, a1
|
||||
+ maskeqz t0, t0, a7
|
||||
+ xor a0, a0, t0
|
||||
+
|
||||
+ xor a1, a1, t0
|
||||
+ andi a3, a0, 0x7
|
||||
+ andi a4, a1, 0x7
|
||||
+ bstrins.d a0, zero, 2, 0
|
||||
+
|
||||
+
|
||||
+ bstrins.d a1, zero, 2, 0
|
||||
+ ld.d t4, a0, 0
|
||||
+ ld.d t1, a1, 0
|
||||
+ slli.d t2, a3, 3
|
||||
+
|
||||
+ slli.d t3, a4, 3
|
||||
+ srl.d t5, t7, t3
|
||||
+ srl.d t0, t4, t2
|
||||
+ srl.d t1, t1, t3
|
||||
+
|
||||
+ orn t0, t0, t5
|
||||
+ orn t1, t1, t5
|
||||
+ bne t0, t1, L(not_equal)
|
||||
+ sub.d t6, t8, a4
|
||||
+
|
||||
+ sub.d a4, t2, t3
|
||||
+ sll.d t2, t7, t2
|
||||
+ sub.d t5, t8, a3
|
||||
+ orn t4, t4, t2
|
||||
+
|
||||
+
|
||||
+ sub.d t2, t4, a5
|
||||
+ andn t3, a6, t4
|
||||
+ sltu t7, a2, t5
|
||||
+ and t2, t2, t3
|
||||
+
|
||||
+ sub.d a3, zero, a4
|
||||
+ or t2, t2, t7
|
||||
+ bnez t2, L(un_end)
|
||||
+ sub.d t7, t5, t6
|
||||
+
|
||||
+ sub.d a2, a2, t5
|
||||
+ sub.d t6, t8, t7
|
||||
+L(un_loop):
|
||||
+ srl.d t5, t4, a4
|
||||
+ ldx.d t4, a0, t8
|
||||
+
|
||||
+ ldx.d t1, a1, t8
|
||||
+ addi.d t8, t8, 8
|
||||
+ sll.d t0, t4, a3
|
||||
+ or t0, t0, t5
|
||||
+
|
||||
+
|
||||
+ bne t0, t1, L(loop_not_equal)
|
||||
+ sub.d t2, t4, a5
|
||||
+ andn t3, a6, t4
|
||||
+ sltui t5, a2, 8
|
||||
+
|
||||
+ and t2, t2, t3
|
||||
+ addi.d a2, a2, -8
|
||||
+ or t3, t2, t5
|
||||
+ beqz t3, L(un_loop)
|
||||
+
|
||||
+ addi.d a2, a2, 8
|
||||
+L(un_end):
|
||||
+ sub.d t2, t0, a5
|
||||
+ andn t3, a6, t0
|
||||
+ sltu t5, a2, t6
|
||||
+
|
||||
+ and t2, t2, t3
|
||||
+ or t2, t2, t5
|
||||
+ bnez t2, L(ret0)
|
||||
+ ldx.d t1, a1, t8
|
||||
+
|
||||
+
|
||||
+ srl.d t0, t4, a4
|
||||
+ sub.d a2, a2, t6
|
||||
+L(not_equal):
|
||||
+ sub.d t2, t0, a5
|
||||
+ andn t3, a6, t0
|
||||
+
|
||||
+ xor t4, t0, t1
|
||||
+ and t2, t2, t3
|
||||
+ or t2, t2, t4
|
||||
+ ctz.d t2, t2
|
||||
+
|
||||
+ bstrins.d t2, zero, 2, 0
|
||||
+ srli.d t4, t2, 3
|
||||
+ srl.d t0, t0, t2
|
||||
+ srl.d t1, t1, t2
|
||||
+
|
||||
+ andi t0, t0, 0xff
|
||||
+ andi t1, t1, 0xff
|
||||
+ sub.d t2, t0, t1
|
||||
+ sub.d t3, t1, t0
|
||||
+
|
||||
+
|
||||
+ masknez t0, t2, a7
|
||||
+ maskeqz t1, t3, a7
|
||||
+ sltu t2, a2, t4
|
||||
+ or a0, t0, t1
|
||||
+
|
||||
+ masknez a0, a0, t2
|
||||
+ jr ra
|
||||
+L(loop_not_equal):
|
||||
+ add.d a2, a2, t7
|
||||
+ b L(not_equal)
|
||||
+END(STRNCMP)
|
||||
+
|
||||
+libc_hidden_builtin_def (STRNCMP)
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S
|
||||
new file mode 100644
|
||||
index 00000000..0b4eee2a
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S
|
||||
@@ -0,0 +1,208 @@
|
||||
+/* Optimized strncmp implementation using Loongarch LSX instructions.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library. If not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdep.h>
|
||||
+#include <sys/regdef.h>
|
||||
+#include <sys/asm.h>
|
||||
+
|
||||
+#if IS_IN (libc) && !defined __loongarch_soft_float
|
||||
+
|
||||
+# define STRNCMP __strncmp_lsx
|
||||
+
|
||||
+LEAF(STRNCMP, 6)
|
||||
+ beqz a2, L(ret0)
|
||||
+ pcalau12i t0, %pc_hi20(L(INDEX))
|
||||
+ andi a3, a0, 0xf
|
||||
+ vld vr2, t0, %pc_lo12(L(INDEX))
|
||||
+
|
||||
+ andi a4, a1, 0xf
|
||||
+ li.d t2, 16
|
||||
+ bne a3, a4, L(unaligned)
|
||||
+ xor t0, a0, a3
|
||||
+
|
||||
+ xor t1, a1, a4
|
||||
+ vld vr0, t0, 0
|
||||
+ vld vr1, t1, 0
|
||||
+ vreplgr2vr.b vr3, a3
|
||||
+
|
||||
+
|
||||
+ sub.d t2, t2, a3
|
||||
+ vadd.b vr3, vr3, vr2
|
||||
+ vshuf.b vr0, vr3, vr0, vr3
|
||||
+ vshuf.b vr1, vr3, vr1, vr3
|
||||
+
|
||||
+ vseq.b vr3, vr0, vr1
|
||||
+ vmin.bu vr3, vr0, vr3
|
||||
+ bgeu t2, a2, L(al_early_end)
|
||||
+ vsetanyeqz.b fcc0, vr3
|
||||
+
|
||||
+ bcnez fcc0, L(al_end)
|
||||
+ add.d a3, a0, a2
|
||||
+ addi.d a4, a3, -1
|
||||
+ bstrins.d a4, zero, 3, 0
|
||||
+
|
||||
+ sub.d a2, a3, a4
|
||||
+L(al_loop):
|
||||
+ vld vr0, t0, 16
|
||||
+ vld vr1, t1, 16
|
||||
+ addi.d t0, t0, 16
|
||||
+
|
||||
+
|
||||
+ addi.d t1, t1, 16
|
||||
+ vseq.b vr3, vr0, vr1
|
||||
+ vmin.bu vr3, vr0, vr3
|
||||
+ beq t0, a4, L(al_early_end)
|
||||
+
|
||||
+ vsetanyeqz.b fcc0, vr3
|
||||
+ bceqz fcc0, L(al_loop)
|
||||
+L(al_end):
|
||||
+ vseqi.b vr3, vr3, 0
|
||||
+ vfrstpi.b vr3, vr3, 0
|
||||
+
|
||||
+ vshuf.b vr0, vr0, vr0, vr3
|
||||
+ vshuf.b vr1, vr1, vr1, vr3
|
||||
+ vpickve2gr.bu t0, vr0, 0
|
||||
+ vpickve2gr.bu t1, vr1, 0
|
||||
+
|
||||
+ sub.d a0, t0, t1
|
||||
+ jr ra
|
||||
+L(al_early_end):
|
||||
+ vreplgr2vr.b vr4, a2
|
||||
+ vslt.b vr4, vr2, vr4
|
||||
+
|
||||
+
|
||||
+ vorn.v vr3, vr3, vr4
|
||||
+ b L(al_end)
|
||||
+L(unaligned):
|
||||
+ slt a5, a3, a4
|
||||
+ xor t0, a0, a1
|
||||
+
|
||||
+ maskeqz t0, t0, a5
|
||||
+ xor a0, a0, t0
|
||||
+ xor a1, a1, t0
|
||||
+ andi a3, a0, 0xf
|
||||
+
|
||||
+ andi a4, a1, 0xf
|
||||
+ xor t0, a0, a3
|
||||
+ xor t1, a1, a4
|
||||
+ vld vr0, t0, 0
|
||||
+
|
||||
+ vld vr3, t1, 0
|
||||
+ sub.d t2, t2, a3
|
||||
+ vreplgr2vr.b vr4, a3
|
||||
+ vreplgr2vr.b vr5, a4
|
||||
+
|
||||
+
|
||||
+ vaddi.bu vr6, vr2, 16
|
||||
+ vsub.b vr7, vr4, vr5
|
||||
+ vsub.b vr6, vr6, vr7
|
||||
+ vadd.b vr4, vr2, vr4
|
||||
+
|
||||
+ vshuf.b vr1, vr3, vr3, vr6
|
||||
+ vshuf.b vr0, vr7, vr0, vr4
|
||||
+ vshuf.b vr1, vr7, vr1, vr4
|
||||
+ vseq.b vr4, vr0, vr1
|
||||
+
|
||||
+ vmin.bu vr4, vr0, vr4
|
||||
+ bgeu t2, a2, L(un_early_end)
|
||||
+ vsetanyeqz.b fcc0, vr4
|
||||
+ bcnez fcc0, L(un_end)
|
||||
+
|
||||
+ add.d a6, a0, a2
|
||||
+ vslt.b vr5, vr2, vr5
|
||||
+ addi.d a7, a6, -1
|
||||
+ vor.v vr3, vr3, vr5
|
||||
+
|
||||
+
|
||||
+ bstrins.d a7, zero, 3, 0
|
||||
+ sub.d a2, a6, a7
|
||||
+L(un_loop):
|
||||
+ vld vr0, t0, 16
|
||||
+ addi.d t0, t0, 16
|
||||
+
|
||||
+ vsetanyeqz.b fcc0, vr3
|
||||
+ bcnez fcc0, L(has_zero)
|
||||
+ beq t0, a7, L(end_with_len)
|
||||
+ vor.v vr1, vr3, vr3
|
||||
+
|
||||
+ vld vr3, t1, 16
|
||||
+ addi.d t1, t1, 16
|
||||
+ vshuf.b vr1, vr3, vr1, vr6
|
||||
+ vseq.b vr4, vr0, vr1
|
||||
+
|
||||
+ vmin.bu vr4, vr0, vr4
|
||||
+ vsetanyeqz.b fcc0, vr4
|
||||
+ bceqz fcc0, L(un_loop)
|
||||
+L(un_end):
|
||||
+ vseqi.b vr4, vr4, 0
|
||||
+
|
||||
+
|
||||
+ vfrstpi.b vr4, vr4, 0
|
||||
+ vshuf.b vr0, vr0, vr0, vr4
|
||||
+ vshuf.b vr1, vr1, vr1, vr4
|
||||
+ vpickve2gr.bu t0, vr0, 0
|
||||
+
|
||||
+ vpickve2gr.bu t1, vr1, 0
|
||||
+ sub.d t2, t0, t1
|
||||
+ sub.d t3, t1, t0
|
||||
+ masknez t0, t2, a5
|
||||
+
|
||||
+ maskeqz t1, t3, a5
|
||||
+ or a0, t0, t1
|
||||
+ jr ra
|
||||
+L(has_zero):
|
||||
+ vshuf.b vr1, vr3, vr3, vr6
|
||||
+
|
||||
+ vseq.b vr4, vr0, vr1
|
||||
+ vmin.bu vr4, vr0, vr4
|
||||
+ bne t0, a7, L(un_end)
|
||||
+L(un_early_end):
|
||||
+ vreplgr2vr.b vr5, a2
|
||||
+
|
||||
+ vslt.b vr5, vr2, vr5
|
||||
+ vorn.v vr4, vr4, vr5
|
||||
+ b L(un_end)
|
||||
+L(end_with_len):
|
||||
+ sub.d a6, a3, a4
|
||||
+
|
||||
+ bgeu a6, a2, 1f
|
||||
+ vld vr4, t1, 16
|
||||
+1:
|
||||
+ vshuf.b vr1, vr4, vr3, vr6
|
||||
+ vseq.b vr4, vr0, vr1
|
||||
+
|
||||
+ vmin.bu vr4, vr0, vr4
|
||||
+ vreplgr2vr.b vr5, a2
|
||||
+ vslt.b vr5, vr2, vr5
|
||||
+ vorn.v vr4, vr4, vr5
|
||||
+
|
||||
+ b L(un_end)
|
||||
+L(ret0):
|
||||
+ move a0, zero
|
||||
+ jr ra
|
||||
+END(STRNCMP)
|
||||
+
|
||||
+ .section .rodata.cst16,"M",@progbits,16
|
||||
+ .align 4
|
||||
+L(INDEX):
|
||||
+ .dword 0x0706050403020100
|
||||
+ .dword 0x0f0e0d0c0b0a0908
|
||||
+
|
||||
+libc_hidden_builtin_def (STRNCMP)
|
||||
+#endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp.c b/sysdeps/loongarch/lp64/multiarch/strncmp.c
|
||||
new file mode 100644
|
||||
index 00000000..af6d0bc4
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strncmp.c
|
||||
@@ -0,0 +1,35 @@
|
||||
+/* Multiple versions of strncmp.
|
||||
+ All versions must be listed in ifunc-impl-list.c.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <http://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+/* Define multiple versions only for the definition in libc. */
|
||||
+#if IS_IN (libc)
|
||||
+# define strncmp __redirect_strncmp
|
||||
+# include <string.h>
|
||||
+# undef strncmp
|
||||
+
|
||||
+# define SYMBOL_NAME strncmp
|
||||
+# include "ifunc-strncmp.h"
|
||||
+
|
||||
+libc_ifunc_redirected (__redirect_strncmp, strncmp, IFUNC_SELECTOR ());
|
||||
+
|
||||
+# ifdef SHARED
|
||||
+__hidden_ver1 (strncmp, __GI_strncmp, __redirect_strncmp)
|
||||
+ __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strncmp);
|
||||
+# endif
|
||||
+#endif
|
||||
--
|
||||
2.33.0
|
||||
|
465
LoongArch-Add-ifunc-support-for-strnlen-aligned-lsx-.patch
Normal file
465
LoongArch-Add-ifunc-support-for-strnlen-aligned-lsx-.patch
Normal file
|
@ -0,0 +1,465 @@
|
|||
From e494d32d3b76eee0d59cfab37789a356459b517a Mon Sep 17 00:00:00 2001
|
||||
From: dengjianbo <dengjianbo@loongson.cn>
|
||||
Date: Thu, 24 Aug 2023 16:50:17 +0800
|
||||
Subject: [PATCH 08/29] LoongArch: Add ifunc support for strnlen{aligned, lsx,
|
||||
lasx}
|
||||
|
||||
Based on the glibc microbenchmark, strnlen-aligned implementation could
|
||||
reduce the runtime more than 10%, strnlen-lsx implementation could reduce
|
||||
the runtime about 50%-78%, strnlen-lasx implementation could reduce the
|
||||
runtime about 50%-88%.
|
||||
|
||||
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
|
||||
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
|
||||
---
|
||||
sysdeps/loongarch/lp64/multiarch/Makefile | 3 +
|
||||
.../lp64/multiarch/ifunc-impl-list.c | 8 ++
|
||||
.../loongarch/lp64/multiarch/ifunc-strnlen.h | 41 +++++++
|
||||
.../lp64/multiarch/strnlen-aligned.S | 102 ++++++++++++++++++
|
||||
.../loongarch/lp64/multiarch/strnlen-lasx.S | 100 +++++++++++++++++
|
||||
.../loongarch/lp64/multiarch/strnlen-lsx.S | 89 +++++++++++++++
|
||||
sysdeps/loongarch/lp64/multiarch/strnlen.c | 39 +++++++
|
||||
7 files changed, 382 insertions(+)
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-strnlen.h
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/strnlen.c
|
||||
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile
|
||||
index afa51041..c4dd3143 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/Makefile
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/Makefile
|
||||
@@ -3,6 +3,9 @@ sysdep_routines += \
|
||||
strlen-aligned \
|
||||
strlen-lsx \
|
||||
strlen-lasx \
|
||||
+ strnlen-aligned \
|
||||
+ strnlen-lsx \
|
||||
+ strnlen-lasx \
|
||||
strchr-aligned \
|
||||
strchr-lsx \
|
||||
strchr-lasx \
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
|
||||
index 25eb96b0..7cec0b77 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
|
||||
@@ -38,6 +38,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_aligned)
|
||||
)
|
||||
|
||||
+ IFUNC_IMPL (i, name, strnlen,
|
||||
+#if !defined __loongarch_soft_float
|
||||
+ IFUNC_IMPL_ADD (array, i, strnlen, SUPPORT_LASX, __strnlen_lasx)
|
||||
+ IFUNC_IMPL_ADD (array, i, strnlen, SUPPORT_LSX, __strnlen_lsx)
|
||||
+#endif
|
||||
+ IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_aligned)
|
||||
+ )
|
||||
+
|
||||
IFUNC_IMPL (i, name, strchr,
|
||||
#if !defined __loongarch_soft_float
|
||||
IFUNC_IMPL_ADD (array, i, strchr, SUPPORT_LASX, __strchr_lasx)
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-strnlen.h b/sysdeps/loongarch/lp64/multiarch/ifunc-strnlen.h
|
||||
new file mode 100644
|
||||
index 00000000..5cf89810
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-strnlen.h
|
||||
@@ -0,0 +1,41 @@
|
||||
+/* Common definition for strnlen ifunc selections.
|
||||
+ All versions must be listed in ifunc-impl-list.c.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <ldsodefs.h>
|
||||
+#include <ifunc-init.h>
|
||||
+
|
||||
+#if !defined __loongarch_soft_float
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lasx) attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden;
|
||||
+#endif
|
||||
+
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden;
|
||||
+
|
||||
+static inline void *
|
||||
+IFUNC_SELECTOR (void)
|
||||
+{
|
||||
+#if !defined __loongarch_soft_float
|
||||
+ if (SUPPORT_LASX)
|
||||
+ return OPTIMIZE (lasx);
|
||||
+ else if (SUPPORT_LSX)
|
||||
+ return OPTIMIZE (lsx);
|
||||
+ else
|
||||
+#endif
|
||||
+ return OPTIMIZE (aligned);
|
||||
+}
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S b/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S
|
||||
new file mode 100644
|
||||
index 00000000..b900430a
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S
|
||||
@@ -0,0 +1,102 @@
|
||||
+/* Optimized strnlen implementation using basic Loongarch instructions.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library. If not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdep.h>
|
||||
+#include <sys/regdef.h>
|
||||
+#include <sys/asm.h>
|
||||
+
|
||||
+#if IS_IN (libc)
|
||||
+# define STRNLEN __strnlen_aligned
|
||||
+#else
|
||||
+# define STRNLEN __strnlen
|
||||
+#endif
|
||||
+
|
||||
+LEAF(STRNLEN, 6)
|
||||
+ beqz a1, L(out)
|
||||
+ lu12i.w a2, 0x01010
|
||||
+ andi t1, a0, 0x7
|
||||
+ move t4, a0
|
||||
+
|
||||
+ bstrins.d a0, zero, 2, 0
|
||||
+ ori a2, a2, 0x101
|
||||
+ li.w t0, -1
|
||||
+ ld.d t2, a0, 0
|
||||
+
|
||||
+ slli.d t3, t1, 3
|
||||
+ bstrins.d a2, a2, 63, 32
|
||||
+ li.w t5, 8
|
||||
+ slli.d a3, a2, 7
|
||||
+
|
||||
+ sub.w t1, t5, t1
|
||||
+ sll.d t0, t0, t3
|
||||
+ orn t2, t2, t0
|
||||
+ sub.d t0, t2, a2
|
||||
+
|
||||
+
|
||||
+ andn t3, a3, t2
|
||||
+ and t0, t0, t3
|
||||
+ bnez t0, L(count_pos)
|
||||
+ sub.d t5, a1, t1
|
||||
+
|
||||
+ bgeu t1, a1, L(out)
|
||||
+ addi.d a0, a0, 8
|
||||
+L(loop):
|
||||
+ ld.d t2, a0, 0
|
||||
+ sub.d t0, t2, a2
|
||||
+
|
||||
+ andn t1, a3, t2
|
||||
+ sltui t6, t5, 9
|
||||
+ and t0, t0, t1
|
||||
+ or t7, t0, t6
|
||||
+
|
||||
+ bnez t7, L(count_pos)
|
||||
+ ld.d t2, a0, 8
|
||||
+ addi.d a0, a0, 16
|
||||
+ sub.d t0, t2, a2
|
||||
+
|
||||
+
|
||||
+ andn t1, a3, t2
|
||||
+ sltui t6, t5, 17
|
||||
+ and t0, t0, t1
|
||||
+ addi.d t5, t5, -16
|
||||
+
|
||||
+ or t7, t0, t6
|
||||
+ beqz t7, L(loop)
|
||||
+ addi.d a0, a0, -8
|
||||
+L(count_pos):
|
||||
+ ctz.d t1, t0
|
||||
+
|
||||
+ sub.d a0, a0, t4
|
||||
+ srli.d t1, t1, 3
|
||||
+ add.d a0, t1, a0
|
||||
+ sltu t0, a0, a1
|
||||
+
|
||||
+ masknez t1, a1, t0
|
||||
+ maskeqz a0, a0, t0
|
||||
+ or a0, a0, t1
|
||||
+ jr ra
|
||||
+
|
||||
+
|
||||
+L(out):
|
||||
+ move a0, a1
|
||||
+ jr ra
|
||||
+END(STRNLEN)
|
||||
+
|
||||
+weak_alias (STRNLEN, strnlen)
|
||||
+libc_hidden_builtin_def (STRNLEN)
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S b/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S
|
||||
new file mode 100644
|
||||
index 00000000..2c03d3d9
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S
|
||||
@@ -0,0 +1,100 @@
|
||||
+/* Optimized strnlen implementation using loongarch LASX instructions
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library. If not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdep.h>
|
||||
+#include <sys/regdef.h>
|
||||
+#include <sys/asm.h>
|
||||
+
|
||||
+#if IS_IN (libc) && !defined __loongarch_soft_float
|
||||
+
|
||||
+# define STRNLEN __strnlen_lasx
|
||||
+
|
||||
+LEAF(STRNLEN, 6)
|
||||
+ beqz a1, L(ret0)
|
||||
+ andi t1, a0, 0x3f
|
||||
+ li.d t3, 65
|
||||
+ sub.d a2, a0, t1
|
||||
+
|
||||
+ xvld xr0, a2, 0
|
||||
+ xvld xr1, a2, 32
|
||||
+ sub.d t1, t3, t1
|
||||
+ move a3, a0
|
||||
+
|
||||
+ sltu t1, a1, t1
|
||||
+ xvmsknz.b xr0, xr0
|
||||
+ xvmsknz.b xr1, xr1
|
||||
+ xvpickve.w xr2, xr0, 4
|
||||
+
|
||||
+ xvpickve.w xr3, xr1, 4
|
||||
+ vilvl.h vr0, vr2, vr0
|
||||
+ vilvl.h vr1, vr3, vr1
|
||||
+ vilvl.w vr0, vr1, vr0
|
||||
+
|
||||
+
|
||||
+ movfr2gr.d t0, fa0
|
||||
+ sra.d t0, t0, a0
|
||||
+ orn t1, t1, t0
|
||||
+ bnez t1, L(end)
|
||||
+
|
||||
+ add.d a4, a0, a1
|
||||
+ move a0, a2
|
||||
+ addi.d a4, a4, -1
|
||||
+ bstrins.d a4, zero, 5, 0
|
||||
+
|
||||
+L(loop):
|
||||
+ xvld xr0, a0, 64
|
||||
+ xvld xr1, a0, 96
|
||||
+ addi.d a0, a0, 64
|
||||
+ beq a0, a4, L(out)
|
||||
+
|
||||
+ xvmin.bu xr2, xr0, xr1
|
||||
+ xvsetanyeqz.b fcc0, xr2
|
||||
+ bceqz fcc0, L(loop)
|
||||
+L(out):
|
||||
+ xvmsknz.b xr0, xr0
|
||||
+
|
||||
+
|
||||
+ xvmsknz.b xr1, xr1
|
||||
+ xvpickve.w xr2, xr0, 4
|
||||
+ xvpickve.w xr3, xr1, 4
|
||||
+ vilvl.h vr0, vr2, vr0
|
||||
+
|
||||
+ vilvl.h vr1, vr3, vr1
|
||||
+ vilvl.w vr0, vr1, vr0
|
||||
+ movfr2gr.d t0, fa0
|
||||
+L(end):
|
||||
+ sub.d a0, a0, a3
|
||||
+
|
||||
+ cto.d t0, t0
|
||||
+ add.d a0, a0, t0
|
||||
+ sltu t1, a0, a1
|
||||
+ masknez t0, a1, t1
|
||||
+
|
||||
+ maskeqz t1, a0, t1
|
||||
+ or a0, t0, t1
|
||||
+ jr ra
|
||||
+L(ret0):
|
||||
+ move a0, zero
|
||||
+
|
||||
+
|
||||
+ jr ra
|
||||
+END(STRNLEN)
|
||||
+
|
||||
+libc_hidden_def (STRNLEN)
|
||||
+#endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S b/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S
|
||||
new file mode 100644
|
||||
index 00000000..b769a895
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S
|
||||
@@ -0,0 +1,89 @@
|
||||
+/* Optimized strnlen implementation using loongarch LSX instructions
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library. If not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdep.h>
|
||||
+#include <sys/regdef.h>
|
||||
+#include <sys/asm.h>
|
||||
+
|
||||
+#if IS_IN (libc) && !defined __loongarch_soft_float
|
||||
+
|
||||
+# define STRNLEN __strnlen_lsx
|
||||
+
|
||||
+LEAF(STRNLEN, 6)
|
||||
+ beqz a1, L(ret0)
|
||||
+ andi t1, a0, 0x1f
|
||||
+ li.d t3, 33
|
||||
+ sub.d a2, a0, t1
|
||||
+
|
||||
+ vld vr0, a2, 0
|
||||
+ vld vr1, a2, 16
|
||||
+ sub.d t1, t3, t1
|
||||
+ move a3, a0
|
||||
+
|
||||
+ sltu t1, a1, t1
|
||||
+ vmsknz.b vr0, vr0
|
||||
+ vmsknz.b vr1, vr1
|
||||
+ vilvl.h vr0, vr1, vr0
|
||||
+
|
||||
+ movfr2gr.s t0, fa0
|
||||
+ sra.w t0, t0, a0
|
||||
+ orn t1, t1, t0
|
||||
+ bnez t1, L(end)
|
||||
+
|
||||
+
|
||||
+ add.d a4, a0, a1
|
||||
+ move a0, a2
|
||||
+ addi.d a4, a4, -1
|
||||
+ bstrins.d a4, zero, 4, 0
|
||||
+
|
||||
+L(loop):
|
||||
+ vld vr0, a0, 32
|
||||
+ vld vr1, a0, 48
|
||||
+ addi.d a0, a0, 32
|
||||
+ beq a0, a4, L(out)
|
||||
+
|
||||
+ vmin.bu vr2, vr0, vr1
|
||||
+ vsetanyeqz.b fcc0, vr2
|
||||
+ bceqz fcc0, L(loop)
|
||||
+L(out):
|
||||
+ vmsknz.b vr0, vr0
|
||||
+
|
||||
+ vmsknz.b vr1, vr1
|
||||
+ vilvl.h vr0, vr1, vr0
|
||||
+ movfr2gr.s t0, fa0
|
||||
+L(end):
|
||||
+ sub.d a0, a0, a3
|
||||
+
|
||||
+
|
||||
+ cto.w t0, t0
|
||||
+ add.d a0, a0, t0
|
||||
+ sltu t1, a0, a1
|
||||
+ masknez t0, a1, t1
|
||||
+
|
||||
+ maskeqz t1, a0, t1
|
||||
+ or a0, t0, t1
|
||||
+ jr ra
|
||||
+L(ret0):
|
||||
+ move a0, zero
|
||||
+
|
||||
+ jr ra
|
||||
+END(STRNLEN)
|
||||
+
|
||||
+libc_hidden_builtin_def (STRNLEN)
|
||||
+#endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen.c b/sysdeps/loongarch/lp64/multiarch/strnlen.c
|
||||
new file mode 100644
|
||||
index 00000000..38b7a25a
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strnlen.c
|
||||
@@ -0,0 +1,39 @@
|
||||
+/* Multiple versions of strnlen.
|
||||
+ All versions must be listed in ifunc-impl-list.c.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+/* Define multiple versions only for the definition in libc. */
|
||||
+#if IS_IN (libc)
|
||||
+# define strnlen __redirect_strnlen
|
||||
+# define __strnlen __redirect___strnlen
|
||||
+# include <string.h>
|
||||
+# undef __strnlen
|
||||
+# undef strnlen
|
||||
+
|
||||
+# define SYMBOL_NAME strnlen
|
||||
+# include "ifunc-strnlen.h"
|
||||
+
|
||||
+libc_ifunc_redirected (__redirect_strnlen, __strnlen, IFUNC_SELECTOR ());
|
||||
+weak_alias (__strnlen, strnlen);
|
||||
+# ifdef SHARED
|
||||
+__hidden_ver1 (__strnlen, __GI___strnlen, __redirect___strnlen)
|
||||
+ __attribute__((visibility ("hidden"))) __attribute_copy__ (strnlen);
|
||||
+__hidden_ver1 (strnlen, __GI_strnlen, __redirect_strnlen)
|
||||
+ __attribute__((weak, visibility ("hidden"))) __attribute_copy__ (strnlen);
|
||||
+# endif
|
||||
+#endif
|
||||
--
|
||||
2.33.0
|
||||
|
670
LoongArch-Add-ifunc-support-for-strrchr-aligned-lsx-.patch
Normal file
670
LoongArch-Add-ifunc-support-for-strrchr-aligned-lsx-.patch
Normal file
|
@ -0,0 +1,670 @@
|
|||
From d537d0ab45a55048c8da483e73be4448ddb45525 Mon Sep 17 00:00:00 2001
|
||||
From: dengjianbo <dengjianbo@loongson.cn>
|
||||
Date: Wed, 13 Sep 2023 15:35:00 +0800
|
||||
Subject: [PATCH 23/29] LoongArch: Add ifunc support for strrchr{aligned, lsx,
|
||||
lasx}
|
||||
|
||||
According to glibc strrchr microbenchmark test results, this implementation
|
||||
could reduce the runtime time as following:
|
||||
|
||||
Name Percent of rutime reduced
|
||||
strrchr-lasx 10%-50%
|
||||
strrchr-lsx 0%-50%
|
||||
strrchr-aligned 5%-50%
|
||||
|
||||
Generic strrchr is implemented by function strlen + memrchr, the lasx version
|
||||
will compare with generic strrchr implemented by strlen-lasx + memrchr-lasx,
|
||||
the lsx version will compare with generic strrchr implemented by strlen-lsx +
|
||||
memrchr-lsx, the aligned version will compare with generic strrchr implemented
|
||||
by strlen-aligned + memrchr-generic.
|
||||
|
||||
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
|
||||
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
|
||||
---
|
||||
sysdeps/loongarch/lp64/multiarch/Makefile | 3 +
|
||||
.../lp64/multiarch/ifunc-impl-list.c | 8 +
|
||||
.../loongarch/lp64/multiarch/ifunc-strrchr.h | 41 ++++
|
||||
.../lp64/multiarch/strrchr-aligned.S | 170 +++++++++++++++++
|
||||
.../loongarch/lp64/multiarch/strrchr-lasx.S | 176 ++++++++++++++++++
|
||||
.../loongarch/lp64/multiarch/strrchr-lsx.S | 144 ++++++++++++++
|
||||
sysdeps/loongarch/lp64/multiarch/strrchr.c | 36 ++++
|
||||
7 files changed, 578 insertions(+)
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-strrchr.h
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/strrchr-aligned.S
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/strrchr.c
|
||||
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile
|
||||
index 39550bea..fe863e1b 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/Makefile
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/Makefile
|
||||
@@ -9,6 +9,9 @@ sysdep_routines += \
|
||||
strchr-aligned \
|
||||
strchr-lsx \
|
||||
strchr-lasx \
|
||||
+ strrchr-aligned \
|
||||
+ strrchr-lsx \
|
||||
+ strrchr-lasx \
|
||||
strchrnul-aligned \
|
||||
strchrnul-lsx \
|
||||
strchrnul-lasx \
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
|
||||
index 39a14f1d..529e2369 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
|
||||
@@ -94,6 +94,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_aligned)
|
||||
)
|
||||
|
||||
+ IFUNC_IMPL (i, name, strrchr,
|
||||
+#if !defined __loongarch_soft_float
|
||||
+ IFUNC_IMPL_ADD (array, i, strrchr, SUPPORT_LASX, __strrchr_lasx)
|
||||
+ IFUNC_IMPL_ADD (array, i, strrchr, SUPPORT_LSX, __strrchr_lsx)
|
||||
+#endif
|
||||
+ IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_aligned)
|
||||
+ )
|
||||
+
|
||||
IFUNC_IMPL (i, name, memcpy,
|
||||
#if !defined __loongarch_soft_float
|
||||
IFUNC_IMPL_ADD (array, i, memcpy, SUPPORT_LASX, __memcpy_lasx)
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-strrchr.h b/sysdeps/loongarch/lp64/multiarch/ifunc-strrchr.h
|
||||
new file mode 100644
|
||||
index 00000000..bbb34089
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-strrchr.h
|
||||
@@ -0,0 +1,41 @@
|
||||
+/* Common definition for strrchr ifunc selections.
|
||||
+ All versions must be listed in ifunc-impl-list.c.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <ldsodefs.h>
|
||||
+#include <ifunc-init.h>
|
||||
+
|
||||
+#if !defined __loongarch_soft_float
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lasx) attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden;
|
||||
+#endif
|
||||
+
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden;
|
||||
+
|
||||
+static inline void *
|
||||
+IFUNC_SELECTOR (void)
|
||||
+{
|
||||
+#if !defined __loongarch_soft_float
|
||||
+ if (SUPPORT_LASX)
|
||||
+ return OPTIMIZE (lasx);
|
||||
+ else if (SUPPORT_LSX)
|
||||
+ return OPTIMIZE (lsx);
|
||||
+ else
|
||||
+#endif
|
||||
+ return OPTIMIZE (aligned);
|
||||
+}
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr-aligned.S b/sysdeps/loongarch/lp64/multiarch/strrchr-aligned.S
|
||||
new file mode 100644
|
||||
index 00000000..a73deb78
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strrchr-aligned.S
|
||||
@@ -0,0 +1,170 @@
|
||||
+/* Optimized strrchr implementation using basic LoongArch instructions.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library. If not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdep.h>
|
||||
+#include <sys/regdef.h>
|
||||
+#include <sys/asm.h>
|
||||
+
|
||||
+#if IS_IN (libc)
|
||||
+# define STRRCHR __strrchr_aligned
|
||||
+#else
|
||||
+# define STRRCHR strrchr
|
||||
+#endif
|
||||
+
|
||||
+LEAF(STRRCHR, 6)
|
||||
+ slli.d t0, a0, 3
|
||||
+ bstrins.d a0, zero, 2, 0
|
||||
+ lu12i.w a2, 0x01010
|
||||
+ ld.d t2, a0, 0
|
||||
+
|
||||
+ andi a1, a1, 0xff
|
||||
+ ori a2, a2, 0x101
|
||||
+ li.d t3, -1
|
||||
+ bstrins.d a2, a2, 63, 32
|
||||
+
|
||||
+ sll.d t5, t3, t0
|
||||
+ slli.d a3, a2, 7
|
||||
+ orn t4, t2, t5
|
||||
+ mul.d a1, a1, a2
|
||||
+
|
||||
+ sub.d t0, t4, a2
|
||||
+ andn t1, a3, t4
|
||||
+ and t1, t0, t1
|
||||
+ beqz t1, L(find_tail)
|
||||
+
|
||||
+
|
||||
+ ctz.d t0, t1
|
||||
+ orn t0, zero, t0
|
||||
+ xor t2, t4, a1
|
||||
+ srl.d t0, t3, t0
|
||||
+
|
||||
+ orn t2, t2, t0
|
||||
+ orn t2, t2, t5
|
||||
+ revb.d t2, t2
|
||||
+ sub.d t1, t2, a2
|
||||
+
|
||||
+ andn t0, a3, t2
|
||||
+ and t1, t0, t1
|
||||
+ ctz.d t0, t1
|
||||
+ srli.d t0, t0, 3
|
||||
+
|
||||
+ addi.d a0, a0, 7
|
||||
+ sub.d a0, a0, t0
|
||||
+ maskeqz a0, a0, t1
|
||||
+ jr ra
|
||||
+
|
||||
+
|
||||
+L(find_tail):
|
||||
+ addi.d a4, a0, 8
|
||||
+ addi.d a0, a0, 8
|
||||
+L(loop_ascii):
|
||||
+ ld.d t2, a0, 0
|
||||
+ sub.d t1, t2, a2
|
||||
+
|
||||
+ and t0, t1, a3
|
||||
+ bnez t0, L(more_check)
|
||||
+ ld.d t2, a0, 8
|
||||
+ sub.d t1, t2, a2
|
||||
+
|
||||
+ and t0, t1, a3
|
||||
+ addi.d a0, a0, 16
|
||||
+ beqz t0, L(loop_ascii)
|
||||
+ addi.d a0, a0, -8
|
||||
+
|
||||
+L(more_check):
|
||||
+ andn t0, a3, t2
|
||||
+ and t1, t1, t0
|
||||
+ bnez t1, L(tail)
|
||||
+ addi.d a0, a0, 8
|
||||
+
|
||||
+
|
||||
+L(loop_nonascii):
|
||||
+ ld.d t2, a0, 0
|
||||
+ sub.d t1, t2, a2
|
||||
+ andn t0, a3, t2
|
||||
+ and t1, t0, t1
|
||||
+
|
||||
+ bnez t1, L(tail)
|
||||
+ ld.d t2, a0, 8
|
||||
+ addi.d a0, a0, 16
|
||||
+ sub.d t1, t2, a2
|
||||
+
|
||||
+ andn t0, a3, t2
|
||||
+ and t1, t0, t1
|
||||
+ beqz t1, L(loop_nonascii)
|
||||
+ addi.d a0, a0, -8
|
||||
+
|
||||
+L(tail):
|
||||
+ ctz.d t0, t1
|
||||
+ orn t0, zero, t0
|
||||
+ xor t2, t2, a1
|
||||
+ srl.d t0, t3, t0
|
||||
+
|
||||
+
|
||||
+ orn t2, t2, t0
|
||||
+ revb.d t2, t2
|
||||
+ sub.d t1, t2, a2
|
||||
+ andn t0, a3, t2
|
||||
+
|
||||
+ and t1, t0, t1
|
||||
+ bnez t1, L(count_pos)
|
||||
+L(find_loop):
|
||||
+ beq a0, a4, L(find_end)
|
||||
+ ld.d t2, a0, -8
|
||||
+
|
||||
+ addi.d a0, a0, -8
|
||||
+ xor t2, t2, a1
|
||||
+ sub.d t1, t2, a2
|
||||
+ andn t0, a3, t2
|
||||
+
|
||||
+ and t1, t0, t1
|
||||
+ beqz t1, L(find_loop)
|
||||
+ revb.d t2, t2
|
||||
+ sub.d t1, t2, a2
|
||||
+
|
||||
+
|
||||
+ andn t0, a3, t2
|
||||
+ and t1, t0, t1
|
||||
+L(count_pos):
|
||||
+ ctz.d t0, t1
|
||||
+ addi.d a0, a0, 7
|
||||
+
|
||||
+ srli.d t0, t0, 3
|
||||
+ sub.d a0, a0, t0
|
||||
+ jr ra
|
||||
+ nop
|
||||
+
|
||||
+L(find_end):
|
||||
+ xor t2, t4, a1
|
||||
+ orn t2, t2, t5
|
||||
+ revb.d t2, t2
|
||||
+ sub.d t1, t2, a2
|
||||
+
|
||||
+
|
||||
+ andn t0, a3, t2
|
||||
+ and t1, t0, t1
|
||||
+ ctz.d t0, t1
|
||||
+ srli.d t0, t0, 3
|
||||
+
|
||||
+ addi.d a0, a4, -1
|
||||
+ sub.d a0, a0, t0
|
||||
+ maskeqz a0, a0, t1
|
||||
+ jr ra
|
||||
+END(STRRCHR)
|
||||
+
|
||||
+libc_hidden_builtin_def(STRRCHR)
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S
|
||||
new file mode 100644
|
||||
index 00000000..5a6e2297
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S
|
||||
@@ -0,0 +1,176 @@
|
||||
+/* Optimized strrchr implementation using LoongArch LASX instructions.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library. If not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdep.h>
|
||||
+#include <sys/regdef.h>
|
||||
+#include <sys/asm.h>
|
||||
+
|
||||
+#if IS_IN (libc) && !defined __loongarch_soft_float
|
||||
+
|
||||
+#define STRRCHR __strrchr_lasx
|
||||
+
|
||||
+LEAF(STRRCHR, 6)
|
||||
+ move a2, a0
|
||||
+ bstrins.d a0, zero, 5, 0
|
||||
+ xvld xr0, a0, 0
|
||||
+ xvld xr1, a0, 32
|
||||
+
|
||||
+ li.d t2, -1
|
||||
+ xvreplgr2vr.b xr4, a1
|
||||
+ xvmsknz.b xr2, xr0
|
||||
+ xvmsknz.b xr3, xr1
|
||||
+
|
||||
+ xvpickve.w xr5, xr2, 4
|
||||
+ xvpickve.w xr6, xr3, 4
|
||||
+ vilvl.h vr2, vr5, vr2
|
||||
+ vilvl.h vr3, vr6, vr3
|
||||
+
|
||||
+ vilvl.w vr2, vr3, vr2
|
||||
+ movfr2gr.d t0, fa2
|
||||
+ sra.d t0, t0, a2
|
||||
+ beq t0, t2, L(find_tail)
|
||||
+
|
||||
+
|
||||
+ xvseq.b xr2, xr0, xr4
|
||||
+ xvseq.b xr3, xr1, xr4
|
||||
+ xvmsknz.b xr2, xr2
|
||||
+ xvmsknz.b xr3, xr3
|
||||
+
|
||||
+ xvpickve.w xr4, xr2, 4
|
||||
+ xvpickve.w xr5, xr3, 4
|
||||
+ vilvl.h vr2, vr4, vr2
|
||||
+ vilvl.h vr3, vr5, vr3
|
||||
+
|
||||
+ vilvl.w vr1, vr3, vr2
|
||||
+ slli.d t3, t2, 1
|
||||
+ movfr2gr.d t1, fa1
|
||||
+ cto.d t0, t0
|
||||
+
|
||||
+ srl.d t1, t1, a2
|
||||
+ sll.d t3, t3, t0
|
||||
+ addi.d a0, a2, 63
|
||||
+ andn t1, t1, t3
|
||||
+
|
||||
+
|
||||
+ clz.d t0, t1
|
||||
+ sub.d a0, a0, t0
|
||||
+ maskeqz a0, a0, t1
|
||||
+ jr ra
|
||||
+
|
||||
+ .align 5
|
||||
+L(find_tail):
|
||||
+ addi.d a3, a0, 64
|
||||
+L(loop):
|
||||
+ xvld xr2, a0, 64
|
||||
+ xvld xr3, a0, 96
|
||||
+ addi.d a0, a0, 64
|
||||
+
|
||||
+ xvmin.bu xr5, xr2, xr3
|
||||
+ xvsetanyeqz.b fcc0, xr5
|
||||
+ bceqz fcc0, L(loop)
|
||||
+ xvmsknz.b xr5, xr2
|
||||
+
|
||||
+
|
||||
+ xvmsknz.b xr6, xr3
|
||||
+ xvpickve.w xr7, xr5, 4
|
||||
+ xvpickve.w xr8, xr6, 4
|
||||
+ vilvl.h vr5, vr7, vr5
|
||||
+
|
||||
+ vilvl.h vr6, vr8, vr6
|
||||
+ xvseq.b xr2, xr2, xr4
|
||||
+ xvseq.b xr3, xr3, xr4
|
||||
+ xvmsknz.b xr2, xr2
|
||||
+
|
||||
+ xvmsknz.b xr3, xr3
|
||||
+ xvpickve.w xr7, xr2, 4
|
||||
+ xvpickve.w xr8, xr3, 4
|
||||
+ vilvl.h vr2, vr7, vr2
|
||||
+
|
||||
+ vilvl.h vr3, vr8, vr3
|
||||
+ vilvl.w vr5, vr6, vr5
|
||||
+ vilvl.w vr2, vr3, vr2
|
||||
+ movfr2gr.d t0, fa5
|
||||
+
|
||||
+
|
||||
+ movfr2gr.d t1, fa2
|
||||
+ slli.d t3, t2, 1
|
||||
+ cto.d t0, t0
|
||||
+ sll.d t3, t3, t0
|
||||
+
|
||||
+ andn t1, t1, t3
|
||||
+ beqz t1, L(find_loop)
|
||||
+ clz.d t0, t1
|
||||
+ addi.d a0, a0, 63
|
||||
+
|
||||
+ sub.d a0, a0, t0
|
||||
+ jr ra
|
||||
+L(find_loop):
|
||||
+ beq a0, a3, L(find_end)
|
||||
+ xvld xr2, a0, -64
|
||||
+
|
||||
+ xvld xr3, a0, -32
|
||||
+ addi.d a0, a0, -64
|
||||
+ xvseq.b xr2, xr2, xr4
|
||||
+ xvseq.b xr3, xr3, xr4
|
||||
+
|
||||
+
|
||||
+ xvmax.bu xr5, xr2, xr3
|
||||
+ xvseteqz.v fcc0, xr5
|
||||
+ bcnez fcc0, L(find_loop)
|
||||
+ xvmsknz.b xr0, xr2
|
||||
+
|
||||
+ xvmsknz.b xr1, xr3
|
||||
+ xvpickve.w xr2, xr0, 4
|
||||
+ xvpickve.w xr3, xr1, 4
|
||||
+ vilvl.h vr0, vr2, vr0
|
||||
+
|
||||
+ vilvl.h vr1, vr3, vr1
|
||||
+ vilvl.w vr0, vr1, vr0
|
||||
+ movfr2gr.d t0, fa0
|
||||
+ addi.d a0, a0, 63
|
||||
+
|
||||
+ clz.d t0, t0
|
||||
+ sub.d a0, a0, t0
|
||||
+ jr ra
|
||||
+ nop
|
||||
+
|
||||
+
|
||||
+L(find_end):
|
||||
+ xvseq.b xr2, xr0, xr4
|
||||
+ xvseq.b xr3, xr1, xr4
|
||||
+ xvmsknz.b xr2, xr2
|
||||
+ xvmsknz.b xr3, xr3
|
||||
+
|
||||
+ xvpickve.w xr4, xr2, 4
|
||||
+ xvpickve.w xr5, xr3, 4
|
||||
+ vilvl.h vr2, vr4, vr2
|
||||
+ vilvl.h vr3, vr5, vr3
|
||||
+
|
||||
+ vilvl.w vr1, vr3, vr2
|
||||
+ movfr2gr.d t1, fa1
|
||||
+ addi.d a0, a2, 63
|
||||
+ srl.d t1, t1, a2
|
||||
+
|
||||
+ clz.d t0, t1
|
||||
+ sub.d a0, a0, t0
|
||||
+ maskeqz a0, a0, t1
|
||||
+ jr ra
|
||||
+END(STRRCHR)
|
||||
+
|
||||
+libc_hidden_builtin_def(STRRCHR)
|
||||
+#endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S
|
||||
new file mode 100644
|
||||
index 00000000..8f2fd22e
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S
|
||||
@@ -0,0 +1,144 @@
|
||||
+/* Optimized strrchr implementation using LoongArch LSX instructions.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library. If not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdep.h>
|
||||
+#include <sys/regdef.h>
|
||||
+#include <sys/asm.h>
|
||||
+
|
||||
+#if IS_IN (libc) && !defined __loongarch_soft_float
|
||||
+
|
||||
+#define STRRCHR __strrchr_lsx
|
||||
+
|
||||
+LEAF(STRRCHR, 6)
|
||||
+ move a2, a0
|
||||
+ bstrins.d a0, zero, 4, 0
|
||||
+ vld vr0, a0, 0
|
||||
+ vld vr1, a0, 16
|
||||
+
|
||||
+ li.d t2, -1
|
||||
+ vreplgr2vr.b vr4, a1
|
||||
+ vmsknz.b vr2, vr0
|
||||
+ vmsknz.b vr3, vr1
|
||||
+
|
||||
+ vilvl.h vr2, vr3, vr2
|
||||
+ movfr2gr.s t0, fa2
|
||||
+ sra.w t0, t0, a2
|
||||
+ beq t0, t2, L(find_tail)
|
||||
+
|
||||
+ vseq.b vr2, vr0, vr4
|
||||
+ vseq.b vr3, vr1, vr4
|
||||
+ vmsknz.b vr2, vr2
|
||||
+ vmsknz.b vr3, vr3
|
||||
+
|
||||
+
|
||||
+ vilvl.h vr1, vr3, vr2
|
||||
+ slli.d t3, t2, 1
|
||||
+ movfr2gr.s t1, fa1
|
||||
+ cto.w t0, t0
|
||||
+
|
||||
+ srl.w t1, t1, a2
|
||||
+ sll.d t3, t3, t0
|
||||
+ addi.d a0, a2, 31
|
||||
+ andn t1, t1, t3
|
||||
+
|
||||
+ clz.w t0, t1
|
||||
+ sub.d a0, a0, t0
|
||||
+ maskeqz a0, a0, t1
|
||||
+ jr ra
|
||||
+
|
||||
+ .align 5
|
||||
+L(find_tail):
|
||||
+ addi.d a3, a0, 32
|
||||
+L(loop):
|
||||
+ vld vr2, a0, 32
|
||||
+ vld vr3, a0, 48
|
||||
+ addi.d a0, a0, 32
|
||||
+
|
||||
+ vmin.bu vr5, vr2, vr3
|
||||
+ vsetanyeqz.b fcc0, vr5
|
||||
+ bceqz fcc0, L(loop)
|
||||
+ vmsknz.b vr5, vr2
|
||||
+
|
||||
+ vmsknz.b vr6, vr3
|
||||
+ vilvl.h vr5, vr6, vr5
|
||||
+ vseq.b vr2, vr2, vr4
|
||||
+ vseq.b vr3, vr3, vr4
|
||||
+
|
||||
+ vmsknz.b vr2, vr2
|
||||
+ vmsknz.b vr3, vr3
|
||||
+ vilvl.h vr2, vr3, vr2
|
||||
+ movfr2gr.s t0, fa5
|
||||
+
|
||||
+
|
||||
+ movfr2gr.s t1, fa2
|
||||
+ slli.d t3, t2, 1
|
||||
+ cto.w t0, t0
|
||||
+ sll.d t3, t3, t0
|
||||
+
|
||||
+ andn t1, t1, t3
|
||||
+ beqz t1, L(find_loop)
|
||||
+ clz.w t0, t1
|
||||
+ addi.d a0, a0, 31
|
||||
+
|
||||
+ sub.d a0, a0, t0
|
||||
+ jr ra
|
||||
+L(find_loop):
|
||||
+ beq a0, a3, L(find_end)
|
||||
+ vld vr2, a0, -32
|
||||
+
|
||||
+ vld vr3, a0, -16
|
||||
+ addi.d a0, a0, -32
|
||||
+ vseq.b vr2, vr2, vr4
|
||||
+ vseq.b vr3, vr3, vr4
|
||||
+
|
||||
+
|
||||
+ vmax.bu vr5, vr2, vr3
|
||||
+ vseteqz.v fcc0, vr5
|
||||
+ bcnez fcc0, L(find_loop)
|
||||
+ vmsknz.b vr0, vr2
|
||||
+
|
||||
+ vmsknz.b vr1, vr3
|
||||
+ vilvl.h vr0, vr1, vr0
|
||||
+ movfr2gr.s t0, fa0
|
||||
+ addi.d a0, a0, 31
|
||||
+
|
||||
+ clz.w t0, t0
|
||||
+ sub.d a0, a0, t0
|
||||
+ jr ra
|
||||
+ nop
|
||||
+
|
||||
+L(find_end):
|
||||
+ vseq.b vr2, vr0, vr4
|
||||
+ vseq.b vr3, vr1, vr4
|
||||
+ vmsknz.b vr2, vr2
|
||||
+ vmsknz.b vr3, vr3
|
||||
+
|
||||
+
|
||||
+ vilvl.h vr1, vr3, vr2
|
||||
+ movfr2gr.s t1, fa1
|
||||
+ addi.d a0, a2, 31
|
||||
+ srl.w t1, t1, a2
|
||||
+
|
||||
+ clz.w t0, t1
|
||||
+ sub.d a0, a0, t0
|
||||
+ maskeqz a0, a0, t1
|
||||
+ jr ra
|
||||
+END(STRRCHR)
|
||||
+
|
||||
+libc_hidden_builtin_def(STRRCHR)
|
||||
+#endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr.c b/sysdeps/loongarch/lp64/multiarch/strrchr.c
|
||||
new file mode 100644
|
||||
index 00000000..d9c9f660
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strrchr.c
|
||||
@@ -0,0 +1,36 @@
|
||||
+/* Multiple versions of strrchr.
|
||||
+ All versions must be listed in ifunc-impl-list.c.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+/* Define multiple versions only for the definition in libc. */
|
||||
+#if IS_IN (libc)
|
||||
+# define strrchr __redirect_strrchr
|
||||
+# include <string.h>
|
||||
+# undef strrchr
|
||||
+
|
||||
+# define SYMBOL_NAME strrchr
|
||||
+# include "ifunc-strrchr.h"
|
||||
+
|
||||
+libc_ifunc_redirected (__redirect_strrchr, strrchr, IFUNC_SELECTOR ());
|
||||
+weak_alias (strrchr, rindex)
|
||||
+# ifdef SHARED
|
||||
+__hidden_ver1 (strrchr, __GI_strrchr, __redirect_strrchr)
|
||||
+ __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strrchr);
|
||||
+# endif
|
||||
+
|
||||
+#endif
|
||||
--
|
||||
2.33.0
|
||||
|
626
LoongArch-Add-lasx-lsx-support-for-_dl_runtime_profi.patch
Normal file
626
LoongArch-Add-lasx-lsx-support-for-_dl_runtime_profi.patch
Normal file
|
@ -0,0 +1,626 @@
|
|||
From b5979df8ad07823c79a934c1fa0a91ec0abffb61 Mon Sep 17 00:00:00 2001
|
||||
From: caiyinyu <caiyinyu@loongson.cn>
|
||||
Date: Fri, 8 Sep 2023 14:10:55 +0800
|
||||
Subject: [PATCH 20/29] LoongArch: Add lasx/lsx support for
|
||||
_dl_runtime_profile.
|
||||
|
||||
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
|
||||
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
|
||||
---
|
||||
sysdeps/loongarch/bits/link.h | 24 ++-
|
||||
sysdeps/loongarch/bits/link_lavcurrent.h | 25 +++
|
||||
sysdeps/loongarch/dl-audit-check.h | 23 +++
|
||||
sysdeps/loongarch/dl-link.sym | 8 +-
|
||||
sysdeps/loongarch/dl-machine.h | 11 +-
|
||||
sysdeps/loongarch/dl-trampoline.S | 177 +----------------
|
||||
sysdeps/loongarch/dl-trampoline.h | 242 +++++++++++++++++++++++
|
||||
7 files changed, 331 insertions(+), 179 deletions(-)
|
||||
create mode 100644 sysdeps/loongarch/bits/link_lavcurrent.h
|
||||
create mode 100644 sysdeps/loongarch/dl-audit-check.h
|
||||
|
||||
diff --git a/sysdeps/loongarch/bits/link.h b/sysdeps/loongarch/bits/link.h
|
||||
index 7fa61312..00f6f25f 100644
|
||||
--- a/sysdeps/loongarch/bits/link.h
|
||||
+++ b/sysdeps/loongarch/bits/link.h
|
||||
@@ -20,10 +20,26 @@
|
||||
#error "Never include <bits/link.h> directly; use <link.h> instead."
|
||||
#endif
|
||||
|
||||
+#ifndef __loongarch_soft_float
|
||||
+typedef float La_loongarch_vr
|
||||
+ __attribute__ ((__vector_size__ (16), __aligned__ (16)));
|
||||
+typedef float La_loongarch_xr
|
||||
+ __attribute__ ((__vector_size__ (32), __aligned__ (16)));
|
||||
+
|
||||
+typedef union
|
||||
+{
|
||||
+ double fpreg[4];
|
||||
+ La_loongarch_vr vr[2];
|
||||
+ La_loongarch_xr xr[1];
|
||||
+} La_loongarch_vector __attribute__ ((__aligned__ (16)));
|
||||
+#endif
|
||||
+
|
||||
typedef struct La_loongarch_regs
|
||||
{
|
||||
unsigned long int lr_reg[8]; /* a0 - a7 */
|
||||
- double lr_fpreg[8]; /* fa0 - fa7 */
|
||||
+#ifndef __loongarch_soft_float
|
||||
+ La_loongarch_vector lr_vec[8]; /* fa0 - fa7 or vr0 - vr7 or xr0 - xr7*/
|
||||
+#endif
|
||||
unsigned long int lr_ra;
|
||||
unsigned long int lr_sp;
|
||||
} La_loongarch_regs;
|
||||
@@ -33,8 +49,10 @@ typedef struct La_loongarch_retval
|
||||
{
|
||||
unsigned long int lrv_a0;
|
||||
unsigned long int lrv_a1;
|
||||
- double lrv_fa0;
|
||||
- double lrv_fa1;
|
||||
+#ifndef __loongarch_soft_float
|
||||
+ La_loongarch_vector lrv_vec0;
|
||||
+ La_loongarch_vector lrv_vec1;
|
||||
+#endif
|
||||
} La_loongarch_retval;
|
||||
|
||||
__BEGIN_DECLS
|
||||
diff --git a/sysdeps/loongarch/bits/link_lavcurrent.h b/sysdeps/loongarch/bits/link_lavcurrent.h
|
||||
new file mode 100644
|
||||
index 00000000..15f1eb84
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/bits/link_lavcurrent.h
|
||||
@@ -0,0 +1,25 @@
|
||||
+/* Data structure for communication from the run-time dynamic linker for
|
||||
+ loaded ELF shared objects. LAV_CURRENT definition.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#ifndef _LINK_H
|
||||
+# error "Never include <bits/link_lavcurrent.h> directly; use <link.h> instead."
|
||||
+#endif
|
||||
+
|
||||
+/* Version numbers for la_version handshake interface. */
|
||||
+#define LAV_CURRENT 3
|
||||
diff --git a/sysdeps/loongarch/dl-audit-check.h b/sysdeps/loongarch/dl-audit-check.h
|
||||
new file mode 100644
|
||||
index 00000000..a139c939
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/dl-audit-check.h
|
||||
@@ -0,0 +1,23 @@
|
||||
+/* rtld-audit version check. LoongArch version.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+static inline bool
|
||||
+_dl_audit_check_version (unsigned int lav)
|
||||
+{
|
||||
+ return lav == LAV_CURRENT;
|
||||
+}
|
||||
diff --git a/sysdeps/loongarch/dl-link.sym b/sysdeps/loongarch/dl-link.sym
|
||||
index 868ab7c6..b534968e 100644
|
||||
--- a/sysdeps/loongarch/dl-link.sym
|
||||
+++ b/sysdeps/loongarch/dl-link.sym
|
||||
@@ -6,9 +6,13 @@ DL_SIZEOF_RG sizeof(struct La_loongarch_regs)
|
||||
DL_SIZEOF_RV sizeof(struct La_loongarch_retval)
|
||||
|
||||
DL_OFFSET_RG_A0 offsetof(struct La_loongarch_regs, lr_reg)
|
||||
-DL_OFFSET_RG_FA0 offsetof(struct La_loongarch_regs, lr_fpreg)
|
||||
+#ifndef __loongarch_soft_float
|
||||
+DL_OFFSET_RG_VEC0 offsetof(struct La_loongarch_regs, lr_vec)
|
||||
+#endif
|
||||
DL_OFFSET_RG_RA offsetof(struct La_loongarch_regs, lr_ra)
|
||||
DL_OFFSET_RG_SP offsetof(struct La_loongarch_regs, lr_sp)
|
||||
|
||||
DL_OFFSET_RV_A0 offsetof(struct La_loongarch_retval, lrv_a0)
|
||||
-DL_OFFSET_RV_FA0 offsetof(struct La_loongarch_retval, lrv_a1)
|
||||
+#ifndef __loongarch_soft_float
|
||||
+DL_OFFSET_RV_VEC0 offsetof(struct La_loongarch_retval, lrv_vec0)
|
||||
+#endif
|
||||
diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h
|
||||
index 066bb233..8a2db9de 100644
|
||||
--- a/sysdeps/loongarch/dl-machine.h
|
||||
+++ b/sysdeps/loongarch/dl-machine.h
|
||||
@@ -273,6 +273,8 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
|
||||
#if !defined __loongarch_soft_float
|
||||
extern void _dl_runtime_resolve_lasx (void) attribute_hidden;
|
||||
extern void _dl_runtime_resolve_lsx (void) attribute_hidden;
|
||||
+ extern void _dl_runtime_profile_lasx (void) attribute_hidden;
|
||||
+ extern void _dl_runtime_profile_lsx (void) attribute_hidden;
|
||||
#endif
|
||||
extern void _dl_runtime_resolve (void) attribute_hidden;
|
||||
extern void _dl_runtime_profile (void) attribute_hidden;
|
||||
@@ -287,7 +289,14 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
|
||||
end in this function. */
|
||||
if (profile != 0)
|
||||
{
|
||||
- gotplt[0] = (ElfW(Addr)) &_dl_runtime_profile;
|
||||
+#if !defined __loongarch_soft_float
|
||||
+ if (SUPPORT_LASX)
|
||||
+ gotplt[0] = (ElfW(Addr)) &_dl_runtime_profile_lasx;
|
||||
+ else if (SUPPORT_LSX)
|
||||
+ gotplt[0] = (ElfW(Addr)) &_dl_runtime_profile_lsx;
|
||||
+ else
|
||||
+#endif
|
||||
+ gotplt[0] = (ElfW(Addr)) &_dl_runtime_profile;
|
||||
|
||||
if (GLRO(dl_profile) != NULL
|
||||
&& _dl_name_match_p (GLRO(dl_profile), l))
|
||||
diff --git a/sysdeps/loongarch/dl-trampoline.S b/sysdeps/loongarch/dl-trampoline.S
|
||||
index 8fd91469..bb449ecf 100644
|
||||
--- a/sysdeps/loongarch/dl-trampoline.S
|
||||
+++ b/sysdeps/loongarch/dl-trampoline.S
|
||||
@@ -22,190 +22,21 @@
|
||||
#if !defined __loongarch_soft_float
|
||||
#define USE_LASX
|
||||
#define _dl_runtime_resolve _dl_runtime_resolve_lasx
|
||||
+#define _dl_runtime_profile _dl_runtime_profile_lasx
|
||||
#include "dl-trampoline.h"
|
||||
#undef FRAME_SIZE
|
||||
#undef USE_LASX
|
||||
#undef _dl_runtime_resolve
|
||||
+#undef _dl_runtime_profile
|
||||
|
||||
#define USE_LSX
|
||||
#define _dl_runtime_resolve _dl_runtime_resolve_lsx
|
||||
+#define _dl_runtime_profile _dl_runtime_profile_lsx
|
||||
#include "dl-trampoline.h"
|
||||
#undef FRAME_SIZE
|
||||
#undef USE_LSX
|
||||
#undef _dl_runtime_resolve
|
||||
+#undef _dl_runtime_profile
|
||||
#endif
|
||||
|
||||
#include "dl-trampoline.h"
|
||||
-
|
||||
-#include "dl-link.h"
|
||||
-
|
||||
-ENTRY (_dl_runtime_profile)
|
||||
- /* LoongArch we get called with:
|
||||
- t0 linkr_map pointer
|
||||
- t1 the scaled offset stored in t0, which can be used
|
||||
- to calculate the offset of the current symbol in .rela.plt
|
||||
- t2 %hi(%pcrel(.got.plt)) stored in t2, no use in this function
|
||||
- t3 dl resolver entry point, no use in this function
|
||||
-
|
||||
- Stack frame layout:
|
||||
- [sp, #96] La_loongarch_regs
|
||||
- [sp, #48] La_loongarch_retval
|
||||
- [sp, #40] frame size return from pltenter
|
||||
- [sp, #32] dl_profile_call saved a1
|
||||
- [sp, #24] dl_profile_call saved a0
|
||||
- [sp, #16] T1
|
||||
- [sp, #0] ra, fp <- fp
|
||||
- */
|
||||
-
|
||||
-# define OFFSET_T1 16
|
||||
-# define OFFSET_SAVED_CALL_A0 OFFSET_T1 + 8
|
||||
-# define OFFSET_FS OFFSET_SAVED_CALL_A0 + 16
|
||||
-# define OFFSET_RV OFFSET_FS + 8
|
||||
-# define OFFSET_RG OFFSET_RV + DL_SIZEOF_RV
|
||||
-
|
||||
-# define SF_SIZE (-(-(OFFSET_RG + DL_SIZEOF_RG) & ALMASK))
|
||||
-
|
||||
- /* Save arguments to stack. */
|
||||
- ADDI sp, sp, -SF_SIZE
|
||||
- REG_S ra, sp, 0
|
||||
- REG_S fp, sp, 8
|
||||
-
|
||||
- or fp, sp, zero
|
||||
-
|
||||
- REG_S a0, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 0*SZREG
|
||||
- REG_S a1, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 1*SZREG
|
||||
- REG_S a2, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 2*SZREG
|
||||
- REG_S a3, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 3*SZREG
|
||||
- REG_S a4, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 4*SZREG
|
||||
- REG_S a5, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 5*SZREG
|
||||
- REG_S a6, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 6*SZREG
|
||||
- REG_S a7, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 7*SZREG
|
||||
-
|
||||
-#ifndef __loongarch_soft_float
|
||||
- FREG_S fa0, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 0*SZFREG
|
||||
- FREG_S fa1, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 1*SZFREG
|
||||
- FREG_S fa2, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 2*SZFREG
|
||||
- FREG_S fa3, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 3*SZFREG
|
||||
- FREG_S fa4, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 4*SZFREG
|
||||
- FREG_S fa5, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 5*SZFREG
|
||||
- FREG_S fa6, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 6*SZFREG
|
||||
- FREG_S fa7, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 7*SZFREG
|
||||
-#endif
|
||||
-
|
||||
- /* Update .got.plt and obtain runtime address of callee. */
|
||||
- SLLI a1, t1, 1
|
||||
- or a0, t0, zero
|
||||
- ADD a1, a1, t1
|
||||
- or a2, ra, zero /* return addr */
|
||||
- ADDI a3, fp, OFFSET_RG /* La_loongarch_regs pointer */
|
||||
- ADDI a4, fp, OFFSET_FS /* frame size return from pltenter */
|
||||
-
|
||||
- REG_S a0, fp, OFFSET_SAVED_CALL_A0
|
||||
- REG_S a1, fp, OFFSET_SAVED_CALL_A0 + SZREG
|
||||
-
|
||||
- la t2, _dl_profile_fixup
|
||||
- jirl ra, t2, 0
|
||||
-
|
||||
- REG_L t3, fp, OFFSET_FS
|
||||
- bge t3, zero, 1f
|
||||
-
|
||||
- /* Save the return. */
|
||||
- or t4, v0, zero
|
||||
-
|
||||
- /* Restore arguments from stack. */
|
||||
- REG_L a0, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 0*SZREG
|
||||
- REG_L a1, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 1*SZREG
|
||||
- REG_L a2, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 2*SZREG
|
||||
- REG_L a3, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 3*SZREG
|
||||
- REG_L a4, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 4*SZREG
|
||||
- REG_L a5, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 5*SZREG
|
||||
- REG_L a6, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 6*SZREG
|
||||
- REG_L a7, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 7*SZREG
|
||||
-
|
||||
-#ifndef __loongarch_soft_float
|
||||
- FREG_L fa0, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 0*SZFREG
|
||||
- FREG_L fa1, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 1*SZFREG
|
||||
- FREG_L fa2, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 2*SZFREG
|
||||
- FREG_L fa3, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 3*SZFREG
|
||||
- FREG_L fa4, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 4*SZFREG
|
||||
- FREG_L fa5, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 5*SZFREG
|
||||
- FREG_L fa6, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 6*SZFREG
|
||||
- FREG_L fa7, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 7*SZFREG
|
||||
-#endif
|
||||
-
|
||||
- REG_L ra, fp, 0
|
||||
- REG_L fp, fp, SZREG
|
||||
-
|
||||
- ADDI sp, sp, SF_SIZE
|
||||
- jirl zero, t4, 0
|
||||
-
|
||||
-1:
|
||||
- /* The new frame size is in t3. */
|
||||
- SUB sp, fp, t3
|
||||
- BSTRINS sp, zero, 3, 0
|
||||
-
|
||||
- REG_S a0, fp, OFFSET_T1
|
||||
-
|
||||
- or a0, sp, zero
|
||||
- ADDI a1, fp, SF_SIZE
|
||||
- or a2, t3, zero
|
||||
- la t5, memcpy
|
||||
- jirl ra, t5, 0
|
||||
-
|
||||
- REG_L t6, fp, OFFSET_T1
|
||||
-
|
||||
- /* Call the function. */
|
||||
- REG_L a0, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 0*SZREG
|
||||
- REG_L a1, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 1*SZREG
|
||||
- REG_L a2, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 2*SZREG
|
||||
- REG_L a3, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 3*SZREG
|
||||
- REG_L a4, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 4*SZREG
|
||||
- REG_L a5, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 5*SZREG
|
||||
- REG_L a6, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 6*SZREG
|
||||
- REG_L a7, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 7*SZREG
|
||||
-
|
||||
-#ifndef __loongarch_soft_float
|
||||
- FREG_L fa0, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 0*SZFREG
|
||||
- FREG_L fa1, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 1*SZFREG
|
||||
- FREG_L fa2, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 2*SZFREG
|
||||
- FREG_L fa3, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 3*SZFREG
|
||||
- FREG_L fa4, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 4*SZFREG
|
||||
- FREG_L fa5, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 5*SZFREG
|
||||
- FREG_L fa6, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 6*SZFREG
|
||||
- FREG_L fa7, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 7*SZFREG
|
||||
-#endif
|
||||
- jirl ra, t6, 0
|
||||
-
|
||||
- REG_S a0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_A0
|
||||
- REG_S a1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_A0 + SZREG
|
||||
-
|
||||
-#ifndef __loongarch_soft_float
|
||||
- FREG_S fa0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_FA0
|
||||
- FREG_S fa1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_FA0 + SZFREG
|
||||
-#endif
|
||||
-
|
||||
- /* Setup call to pltexit. */
|
||||
- REG_L a0, fp, OFFSET_SAVED_CALL_A0
|
||||
- REG_L a1, fp, OFFSET_SAVED_CALL_A0 + SZREG
|
||||
- ADDI a2, fp, OFFSET_RG
|
||||
- ADDI a3, fp, OFFSET_RV
|
||||
- la t7, _dl_audit_pltexit
|
||||
- jirl ra, t7, 0
|
||||
-
|
||||
- REG_L a0, fp, OFFSET_RV + DL_OFFSET_RV_A0
|
||||
- REG_L a1, fp, OFFSET_RV + DL_OFFSET_RV_A0 + SZREG
|
||||
-
|
||||
-#ifndef __loongarch_soft_float
|
||||
- FREG_L fa0, fp, OFFSET_RV + DL_OFFSET_RV_FA0
|
||||
- FREG_L fa1, fp, OFFSET_RV + DL_OFFSET_RV_FA0 + SZFREG
|
||||
-#endif
|
||||
-
|
||||
- /* RA from within La_loongarch_reg. */
|
||||
- REG_L ra, fp, OFFSET_RG + DL_OFFSET_RG_RA
|
||||
- or sp, fp, zero
|
||||
- ADDI sp, sp, SF_SIZE
|
||||
- REG_S fp, fp, SZREG
|
||||
-
|
||||
- jirl zero, ra, 0
|
||||
-
|
||||
-END (_dl_runtime_profile)
|
||||
diff --git a/sysdeps/loongarch/dl-trampoline.h b/sysdeps/loongarch/dl-trampoline.h
|
||||
index 99fcacab..e298439d 100644
|
||||
--- a/sysdeps/loongarch/dl-trampoline.h
|
||||
+++ b/sysdeps/loongarch/dl-trampoline.h
|
||||
@@ -125,3 +125,245 @@ ENTRY (_dl_runtime_resolve)
|
||||
/* Invoke the callee. */
|
||||
jirl zero, t1, 0
|
||||
END (_dl_runtime_resolve)
|
||||
+
|
||||
+#include "dl-link.h"
|
||||
+
|
||||
+ENTRY (_dl_runtime_profile)
|
||||
+ /* LoongArch we get called with:
|
||||
+ t0 linkr_map pointer
|
||||
+ t1 the scaled offset stored in t0, which can be used
|
||||
+ to calculate the offset of the current symbol in .rela.plt
|
||||
+ t2 %hi(%pcrel(.got.plt)) stored in t2, no use in this function
|
||||
+ t3 dl resolver entry point, no use in this function
|
||||
+
|
||||
+ Stack frame layout:
|
||||
+ [sp, #208] La_loongarch_regs
|
||||
+ [sp, #128] La_loongarch_retval // align: 16
|
||||
+ [sp, #112] frame size return from pltenter
|
||||
+ [sp, #80 ] dl_profile_call saved vec1
|
||||
+ [sp, #48 ] dl_profile_call saved vec0 // align: 16
|
||||
+ [sp, #32 ] dl_profile_call saved a1
|
||||
+ [sp, #24 ] dl_profile_call saved a0
|
||||
+ [sp, #16 ] T1
|
||||
+ [sp, #0 ] ra, fp <- fp
|
||||
+ */
|
||||
+
|
||||
+# define OFFSET_T1 16
|
||||
+# define OFFSET_SAVED_CALL_A0 OFFSET_T1 + 8
|
||||
+# define OFFSET_FS OFFSET_SAVED_CALL_A0 + 16 + 8 + 64
|
||||
+# define OFFSET_RV OFFSET_FS + 8 + 8
|
||||
+# define OFFSET_RG OFFSET_RV + DL_SIZEOF_RV
|
||||
+
|
||||
+# define SF_SIZE (-(-(OFFSET_RG + DL_SIZEOF_RG) & ALMASK))
|
||||
+
|
||||
+ /* Save arguments to stack. */
|
||||
+ ADDI sp, sp, -SF_SIZE
|
||||
+ REG_S ra, sp, 0
|
||||
+ REG_S fp, sp, 8
|
||||
+
|
||||
+ or fp, sp, zero
|
||||
+
|
||||
+ REG_S a0, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 0*SZREG
|
||||
+ REG_S a1, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 1*SZREG
|
||||
+ REG_S a2, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 2*SZREG
|
||||
+ REG_S a3, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 3*SZREG
|
||||
+ REG_S a4, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 4*SZREG
|
||||
+ REG_S a5, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 5*SZREG
|
||||
+ REG_S a6, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 6*SZREG
|
||||
+ REG_S a7, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 7*SZREG
|
||||
+
|
||||
+#ifdef USE_LASX
|
||||
+ xvst xr0, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 0*SZXREG
|
||||
+ xvst xr1, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 1*SZXREG
|
||||
+ xvst xr2, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 2*SZXREG
|
||||
+ xvst xr3, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 3*SZXREG
|
||||
+ xvst xr4, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 4*SZXREG
|
||||
+ xvst xr5, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 5*SZXREG
|
||||
+ xvst xr6, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 6*SZXREG
|
||||
+ xvst xr7, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 7*SZXREG
|
||||
+#elif defined USE_LSX
|
||||
+ vst vr0, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 0*SZVREG
|
||||
+ vst vr1, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 1*SZVREG
|
||||
+ vst vr2, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 2*SZVREG
|
||||
+ vst vr3, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 3*SZVREG
|
||||
+ vst vr4, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 4*SZVREG
|
||||
+ vst vr5, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 5*SZVREG
|
||||
+ vst vr6, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 6*SZVREG
|
||||
+ vst vr7, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 7*SZVREG
|
||||
+#elif !defined __loongarch_soft_float
|
||||
+ FREG_S fa0, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 0*SZFREG
|
||||
+ FREG_S fa1, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 1*SZFREG
|
||||
+ FREG_S fa2, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 2*SZFREG
|
||||
+ FREG_S fa3, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 3*SZFREG
|
||||
+ FREG_S fa4, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 4*SZFREG
|
||||
+ FREG_S fa5, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 5*SZFREG
|
||||
+ FREG_S fa6, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 6*SZFREG
|
||||
+ FREG_S fa7, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 7*SZFREG
|
||||
+#endif
|
||||
+
|
||||
+ /* Update .got.plt and obtain runtime address of callee. */
|
||||
+ SLLI a1, t1, 1
|
||||
+ or a0, t0, zero
|
||||
+ ADD a1, a1, t1
|
||||
+ or a2, ra, zero /* return addr */
|
||||
+ ADDI a3, fp, OFFSET_RG /* La_loongarch_regs pointer */
|
||||
+ ADDI a4, fp, OFFSET_FS /* frame size return from pltenter */
|
||||
+
|
||||
+ REG_S a0, fp, OFFSET_SAVED_CALL_A0
|
||||
+ REG_S a1, fp, OFFSET_SAVED_CALL_A0 + SZREG
|
||||
+
|
||||
+ la t2, _dl_profile_fixup
|
||||
+ jirl ra, t2, 0
|
||||
+
|
||||
+ REG_L t3, fp, OFFSET_FS
|
||||
+ bge t3, zero, 1f
|
||||
+
|
||||
+ /* Save the return. */
|
||||
+ or t4, v0, zero
|
||||
+
|
||||
+ /* Restore arguments from stack. */
|
||||
+ REG_L a0, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 0*SZREG
|
||||
+ REG_L a1, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 1*SZREG
|
||||
+ REG_L a2, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 2*SZREG
|
||||
+ REG_L a3, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 3*SZREG
|
||||
+ REG_L a4, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 4*SZREG
|
||||
+ REG_L a5, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 5*SZREG
|
||||
+ REG_L a6, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 6*SZREG
|
||||
+ REG_L a7, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 7*SZREG
|
||||
+
|
||||
+#ifdef USE_LASX
|
||||
+ xvld xr0, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 0*SZXREG
|
||||
+ xvld xr1, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 1*SZXREG
|
||||
+ xvld xr2, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 2*SZXREG
|
||||
+ xvld xr3, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 3*SZXREG
|
||||
+ xvld xr4, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 4*SZXREG
|
||||
+ xvld xr5, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 5*SZXREG
|
||||
+ xvld xr6, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 6*SZXREG
|
||||
+ xvld xr7, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 7*SZXREG
|
||||
+#elif defined USE_LSX
|
||||
+ vld vr0, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 0*SZVREG
|
||||
+ vld vr1, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 1*SZVREG
|
||||
+ vld vr2, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 2*SZVREG
|
||||
+ vld vr3, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 3*SZVREG
|
||||
+ vld vr4, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 4*SZVREG
|
||||
+ vld vr5, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 5*SZVREG
|
||||
+ vld vr6, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 6*SZVREG
|
||||
+ vld vr7, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 7*SZVREG
|
||||
+#elif !defined __loongarch_soft_float
|
||||
+ FREG_L fa0, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 0*SZFREG
|
||||
+ FREG_L fa1, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 1*SZFREG
|
||||
+ FREG_L fa2, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 2*SZFREG
|
||||
+ FREG_L fa3, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 3*SZFREG
|
||||
+ FREG_L fa4, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 4*SZFREG
|
||||
+ FREG_L fa5, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 5*SZFREG
|
||||
+ FREG_L fa6, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 6*SZFREG
|
||||
+ FREG_L fa7, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 7*SZFREG
|
||||
+#endif
|
||||
+
|
||||
+ REG_L ra, fp, 0
|
||||
+ REG_L fp, fp, SZREG
|
||||
+
|
||||
+ ADDI sp, sp, SF_SIZE
|
||||
+ jirl zero, t4, 0
|
||||
+
|
||||
+1:
|
||||
+ /* The new frame size is in t3. */
|
||||
+ SUB sp, fp, t3
|
||||
+ BSTRINS sp, zero, 3, 0
|
||||
+
|
||||
+ REG_S a0, fp, OFFSET_T1
|
||||
+
|
||||
+ or a0, sp, zero
|
||||
+ ADDI a1, fp, SF_SIZE
|
||||
+ or a2, t3, zero
|
||||
+ la t5, memcpy
|
||||
+ jirl ra, t5, 0
|
||||
+
|
||||
+ REG_L t6, fp, OFFSET_T1
|
||||
+
|
||||
+ /* Call the function. */
|
||||
+ REG_L a0, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 0*SZREG
|
||||
+ REG_L a1, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 1*SZREG
|
||||
+ REG_L a2, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 2*SZREG
|
||||
+ REG_L a3, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 3*SZREG
|
||||
+ REG_L a4, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 4*SZREG
|
||||
+ REG_L a5, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 5*SZREG
|
||||
+ REG_L a6, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 6*SZREG
|
||||
+ REG_L a7, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 7*SZREG
|
||||
+
|
||||
+#ifdef USE_LASX
|
||||
+ xvld xr0, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 0*SZXREG
|
||||
+ xvld xr1, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 1*SZXREG
|
||||
+ xvld xr2, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 2*SZXREG
|
||||
+ xvld xr3, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 3*SZXREG
|
||||
+ xvld xr4, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 4*SZXREG
|
||||
+ xvld xr5, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 5*SZXREG
|
||||
+ xvld xr6, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 6*SZXREG
|
||||
+ xvld xr7, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 7*SZXREG
|
||||
+#elif defined USE_LSX
|
||||
+ vld vr0, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 0*SZVREG
|
||||
+ vld vr1, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 1*SZVREG
|
||||
+ vld vr2, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 2*SZVREG
|
||||
+ vld vr3, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 3*SZVREG
|
||||
+ vld vr4, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 4*SZVREG
|
||||
+ vld vr5, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 5*SZVREG
|
||||
+ vld vr6, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 6*SZVREG
|
||||
+ vld vr7, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 7*SZVREG
|
||||
+#elif !defined __loongarch_soft_float
|
||||
+ FREG_L fa0, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 0*SZFREG
|
||||
+ FREG_L fa1, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 1*SZFREG
|
||||
+ FREG_L fa2, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 2*SZFREG
|
||||
+ FREG_L fa3, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 3*SZFREG
|
||||
+ FREG_L fa4, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 4*SZFREG
|
||||
+ FREG_L fa5, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 5*SZFREG
|
||||
+ FREG_L fa6, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 6*SZFREG
|
||||
+ FREG_L fa7, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 7*SZFREG
|
||||
+#endif
|
||||
+
|
||||
+ jirl ra, t6, 0
|
||||
+
|
||||
+ REG_S a0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_A0
|
||||
+ REG_S a1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_A0 + SZREG
|
||||
+
|
||||
+#ifdef USE_LASX
|
||||
+ xvst xr0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0
|
||||
+ xvst xr1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0 + SZXREG
|
||||
+#elif defined USE_LSX
|
||||
+ vst vr0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0
|
||||
+ vst vr1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0 + SZVREG
|
||||
+#elif !defined __loongarch_soft_float
|
||||
+ FREG_S fa0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0
|
||||
+ FREG_S fa1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0 + SZFREG
|
||||
+#endif
|
||||
+
|
||||
+ /* Setup call to pltexit. */
|
||||
+ REG_L a0, fp, OFFSET_SAVED_CALL_A0
|
||||
+ REG_L a1, fp, OFFSET_SAVED_CALL_A0 + SZREG
|
||||
+ ADDI a2, fp, OFFSET_RG
|
||||
+ ADDI a3, fp, OFFSET_RV
|
||||
+ la t7, _dl_audit_pltexit
|
||||
+ jirl ra, t7, 0
|
||||
+
|
||||
+ REG_L a0, fp, OFFSET_RV + DL_OFFSET_RV_A0
|
||||
+ REG_L a1, fp, OFFSET_RV + DL_OFFSET_RV_A0 + SZREG
|
||||
+
|
||||
+#ifdef USE_LASX
|
||||
+ xvld xr0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0
|
||||
+ xvld xr1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0 + SZXREG
|
||||
+#elif defined USE_LSX
|
||||
+ vld vr0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0
|
||||
+ vld vr1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0 + SZVREG
|
||||
+#elif !defined __loongarch_soft_float
|
||||
+ FREG_L fa0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0
|
||||
+ FREG_L fa1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0 + SZFREG
|
||||
+#endif
|
||||
+
|
||||
+ /* RA from within La_loongarch_reg. */
|
||||
+ REG_L ra, fp, OFFSET_RG + DL_OFFSET_RG_RA
|
||||
+ or sp, fp, zero
|
||||
+ ADDI sp, sp, SF_SIZE
|
||||
+ REG_S fp, fp, SZREG
|
||||
+
|
||||
+ jirl zero, ra, 0
|
||||
+
|
||||
+END (_dl_runtime_profile)
|
||||
--
|
||||
2.33.0
|
||||
|
102
LoongArch-Add-minuimum-binutils-required-version.patch
Normal file
102
LoongArch-Add-minuimum-binutils-required-version.patch
Normal file
|
@ -0,0 +1,102 @@
|
|||
From 7353f21f6ed1754b67e455e2b80123787efa9e91 Mon Sep 17 00:00:00 2001
|
||||
From: dengjianbo <dengjianbo@loongson.cn>
|
||||
Date: Tue, 8 Aug 2023 14:15:43 +0800
|
||||
Subject: [PATCH 02/29] LoongArch: Add minuimum binutils required version
|
||||
|
||||
LoongArch glibc can add some LASX/LSX vector instructions codes,
|
||||
change the required minimum binutils version to 2.41 which could
|
||||
support vector instructions. HAVE_LOONGARCH_VEC_ASM is removed
|
||||
accordingly.
|
||||
|
||||
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
|
||||
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
|
||||
---
|
||||
config.h.in | 5 -----
|
||||
sysdeps/loongarch/configure | 5 ++---
|
||||
sysdeps/loongarch/configure.ac | 4 ++--
|
||||
sysdeps/loongarch/dl-machine.h | 4 ++--
|
||||
sysdeps/loongarch/dl-trampoline.S | 2 +-
|
||||
5 files changed, 7 insertions(+), 13 deletions(-)
|
||||
|
||||
diff --git a/config.h.in b/config.h.in
|
||||
index 0dedc124..44a34072 100644
|
||||
--- a/config.h.in
|
||||
+++ b/config.h.in
|
||||
@@ -141,11 +141,6 @@
|
||||
/* LOONGARCH floating-point ABI for ld.so. */
|
||||
#undef LOONGARCH_ABI_FRLEN
|
||||
|
||||
-/* Assembler support LoongArch LASX/LSX vector instructions.
|
||||
- This macro becomes obsolete when glibc increased the minimum
|
||||
- required version of GNU 'binutils' to 2.41 or later. */
|
||||
-#define HAVE_LOONGARCH_VEC_ASM 0
|
||||
-
|
||||
/* Linux specific: minimum supported kernel version. */
|
||||
#undef __LINUX_KERNEL_VERSION
|
||||
|
||||
diff --git a/sysdeps/loongarch/configure b/sysdeps/loongarch/configure
|
||||
index 5843c7cf..395ddc92 100644
|
||||
--- a/sysdeps/loongarch/configure
|
||||
+++ b/sysdeps/loongarch/configure
|
||||
@@ -128,8 +128,7 @@ rm -f conftest*
|
||||
fi
|
||||
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_cv_loongarch_vec_asm" >&5
|
||||
printf "%s\n" "$libc_cv_loongarch_vec_asm" >&6; }
|
||||
-if test $libc_cv_loongarch_vec_asm = yes; then
|
||||
- printf "%s\n" "#define HAVE_LOONGARCH_VEC_ASM 1" >>confdefs.h
|
||||
-
|
||||
+if test $libc_cv_loongarch_vec_asm = no; then
|
||||
+ as_fn_error $? "binutils version is too old, use 2.41 or newer version" "$LINENO" 5
|
||||
fi
|
||||
|
||||
diff --git a/sysdeps/loongarch/configure.ac b/sysdeps/loongarch/configure.ac
|
||||
index ba89d834..989287c6 100644
|
||||
--- a/sysdeps/loongarch/configure.ac
|
||||
+++ b/sysdeps/loongarch/configure.ac
|
||||
@@ -74,6 +74,6 @@ else
|
||||
libc_cv_loongarch_vec_asm=no
|
||||
fi
|
||||
rm -f conftest*])
|
||||
-if test $libc_cv_loongarch_vec_asm = yes; then
|
||||
- AC_DEFINE(HAVE_LOONGARCH_VEC_ASM)
|
||||
+if test $libc_cv_loongarch_vec_asm = no; then
|
||||
+ AC_MSG_ERROR([binutils version is too old, use 2.41 or newer version])
|
||||
fi
|
||||
diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h
|
||||
index 51ce9af8..066bb233 100644
|
||||
--- a/sysdeps/loongarch/dl-machine.h
|
||||
+++ b/sysdeps/loongarch/dl-machine.h
|
||||
@@ -270,7 +270,7 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
|
||||
/* If using PLTs, fill in the first two entries of .got.plt. */
|
||||
if (l->l_info[DT_JMPREL])
|
||||
{
|
||||
-#if HAVE_LOONGARCH_VEC_ASM && !defined __loongarch_soft_float
|
||||
+#if !defined __loongarch_soft_float
|
||||
extern void _dl_runtime_resolve_lasx (void) attribute_hidden;
|
||||
extern void _dl_runtime_resolve_lsx (void) attribute_hidden;
|
||||
#endif
|
||||
@@ -300,7 +300,7 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
|
||||
/* This function will get called to fix up the GOT entry
|
||||
indicated by the offset on the stack, and then jump to
|
||||
the resolved address. */
|
||||
-#if HAVE_LOONGARCH_VEC_ASM && !defined __loongarch_soft_float
|
||||
+#if !defined __loongarch_soft_float
|
||||
if (SUPPORT_LASX)
|
||||
gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve_lasx;
|
||||
else if (SUPPORT_LSX)
|
||||
diff --git a/sysdeps/loongarch/dl-trampoline.S b/sysdeps/loongarch/dl-trampoline.S
|
||||
index f6ba5e44..8fd91469 100644
|
||||
--- a/sysdeps/loongarch/dl-trampoline.S
|
||||
+++ b/sysdeps/loongarch/dl-trampoline.S
|
||||
@@ -19,7 +19,7 @@
|
||||
#include <sysdep.h>
|
||||
#include <sys/asm.h>
|
||||
|
||||
-#if HAVE_LOONGARCH_VEC_ASM && !defined __loongarch_soft_float
|
||||
+#if !defined __loongarch_soft_float
|
||||
#define USE_LASX
|
||||
#define _dl_runtime_resolve _dl_runtime_resolve_lasx
|
||||
#include "dl-trampoline.h"
|
||||
--
|
||||
2.33.0
|
||||
|
277
LoongArch-Change-loongarch-to-LoongArch-in-comments.patch
Normal file
277
LoongArch-Change-loongarch-to-LoongArch-in-comments.patch
Normal file
|
@ -0,0 +1,277 @@
|
|||
From e5ccd79e81de7ad5821fde83875973e878d85d4b Mon Sep 17 00:00:00 2001
|
||||
From: dengjianbo <dengjianbo@loongson.cn>
|
||||
Date: Mon, 28 Aug 2023 10:08:40 +0800
|
||||
Subject: [PATCH 19/29] LoongArch: Change loongarch to LoongArch in comments
|
||||
|
||||
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
|
||||
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
|
||||
---
|
||||
sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S | 2 +-
|
||||
sysdeps/loongarch/lp64/multiarch/memcpy-lasx.S | 2 +-
|
||||
sysdeps/loongarch/lp64/multiarch/memcpy-lsx.S | 2 +-
|
||||
sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S | 2 +-
|
||||
sysdeps/loongarch/lp64/multiarch/memmove-aligned.S | 2 +-
|
||||
sysdeps/loongarch/lp64/multiarch/memmove-lasx.S | 2 +-
|
||||
sysdeps/loongarch/lp64/multiarch/memmove-lsx.S | 2 +-
|
||||
sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S | 2 +-
|
||||
sysdeps/loongarch/lp64/multiarch/strchr-aligned.S | 2 +-
|
||||
sysdeps/loongarch/lp64/multiarch/strchr-lasx.S | 2 +-
|
||||
sysdeps/loongarch/lp64/multiarch/strchr-lsx.S | 2 +-
|
||||
sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S | 2 +-
|
||||
sysdeps/loongarch/lp64/multiarch/strchrnul-lasx.S | 2 +-
|
||||
sysdeps/loongarch/lp64/multiarch/strchrnul-lsx.S | 2 +-
|
||||
sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S | 2 +-
|
||||
sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S | 2 +-
|
||||
sysdeps/loongarch/lp64/multiarch/strlen-aligned.S | 2 +-
|
||||
sysdeps/loongarch/lp64/multiarch/strlen-lasx.S | 2 +-
|
||||
sysdeps/loongarch/lp64/multiarch/strlen-lsx.S | 2 +-
|
||||
sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S | 2 +-
|
||||
sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S | 2 +-
|
||||
sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S | 2 +-
|
||||
sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S | 2 +-
|
||||
sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S | 2 +-
|
||||
24 files changed, 24 insertions(+), 24 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S b/sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S
|
||||
index 299dd49c..7eb34395 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S
|
||||
@@ -1,4 +1,4 @@
|
||||
-/* Optimized memcpy_aligned implementation using basic Loongarch instructions.
|
||||
+/* Optimized memcpy_aligned implementation using basic LoongArch instructions.
|
||||
Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/memcpy-lasx.S b/sysdeps/loongarch/lp64/multiarch/memcpy-lasx.S
|
||||
index 4aae5bf8..ae148df5 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/memcpy-lasx.S
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/memcpy-lasx.S
|
||||
@@ -1,4 +1,4 @@
|
||||
-/* Optimized memcpy implementation using Loongarch LASX instructions.
|
||||
+/* Optimized memcpy implementation using LoongArch LASX instructions.
|
||||
Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/memcpy-lsx.S b/sysdeps/loongarch/lp64/multiarch/memcpy-lsx.S
|
||||
index 6ebbe7a2..feb2bb0e 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/memcpy-lsx.S
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/memcpy-lsx.S
|
||||
@@ -1,4 +1,4 @@
|
||||
-/* Optimized memcpy implementation using Loongarch LSX instructions.
|
||||
+/* Optimized memcpy implementation using LoongArch LSX instructions.
|
||||
Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S
|
||||
index 8e60a22d..31019b13 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S
|
||||
@@ -1,4 +1,4 @@
|
||||
-/* Optimized unaligned memcpy implementation using basic Loongarch instructions.
|
||||
+/* Optimized unaligned memcpy implementation using basic LoongArch instructions.
|
||||
Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-aligned.S b/sysdeps/loongarch/lp64/multiarch/memmove-aligned.S
|
||||
index 5354f383..a02114c0 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/memmove-aligned.S
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/memmove-aligned.S
|
||||
@@ -1,4 +1,4 @@
|
||||
-/* Optimized memmove_aligned implementation using basic Loongarch instructions.
|
||||
+/* Optimized memmove_aligned implementation using basic LoongArch instructions.
|
||||
Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S b/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S
|
||||
index ff68e7a2..95d8ee7b 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S
|
||||
@@ -1,4 +1,4 @@
|
||||
-/* Optimized memmove implementation using Loongarch LASX instructions.
|
||||
+/* Optimized memmove implementation using LoongArch LASX instructions.
|
||||
Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S b/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S
|
||||
index 9e1502a7..8a936770 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S
|
||||
@@ -1,4 +1,4 @@
|
||||
-/* Optimized memmove implementation using Loongarch LSX instructions.
|
||||
+/* Optimized memmove implementation using LoongArch LSX instructions.
|
||||
Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S
|
||||
index 90a64b6b..3284ce25 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S
|
||||
@@ -1,4 +1,4 @@
|
||||
-/* Optimized memmove_unaligned implementation using basic Loongarch instructions.
|
||||
+/* Optimized memmove_unaligned implementation using basic LoongArch instructions.
|
||||
Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-aligned.S b/sysdeps/loongarch/lp64/multiarch/strchr-aligned.S
|
||||
index 5fb01806..62020054 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/strchr-aligned.S
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strchr-aligned.S
|
||||
@@ -1,4 +1,4 @@
|
||||
-/* Optimized strchr implementation using basic Loongarch instructions.
|
||||
+/* Optimized strchr implementation using basic LoongArch instructions.
|
||||
Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S
|
||||
index 254402da..4d3cc588 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S
|
||||
@@ -1,4 +1,4 @@
|
||||
-/* Optimized strchr implementation using loongarch LASX SIMD instructions.
|
||||
+/* Optimized strchr implementation using LoongArch LASX instructions.
|
||||
Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S
|
||||
index dae98b0a..8b78c35c 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S
|
||||
@@ -1,4 +1,4 @@
|
||||
-/* Optimized strlen implementation using loongarch LSX SIMD instructions.
|
||||
+/* Optimized strlen implementation using LoongArch LSX instructions.
|
||||
Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S b/sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S
|
||||
index 1c01a023..20856a06 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S
|
||||
@@ -1,4 +1,4 @@
|
||||
-/* Optimized strchrnul implementation using basic Loongarch instructions.
|
||||
+/* Optimized strchrnul implementation using basic LoongArch instructions.
|
||||
Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strchrnul-lasx.S b/sysdeps/loongarch/lp64/multiarch/strchrnul-lasx.S
|
||||
index d45495e4..4753d4ce 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/strchrnul-lasx.S
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strchrnul-lasx.S
|
||||
@@ -1,4 +1,4 @@
|
||||
-/* Optimized strchrnul implementation using loongarch LASX SIMD instructions.
|
||||
+/* Optimized strchrnul implementation using LoongArch LASX instructions.
|
||||
Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strchrnul-lsx.S b/sysdeps/loongarch/lp64/multiarch/strchrnul-lsx.S
|
||||
index 07d793ae..671e740c 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/strchrnul-lsx.S
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strchrnul-lsx.S
|
||||
@@ -1,4 +1,4 @@
|
||||
-/* Optimized strchrnul implementation using loongarch LSX SIMD instructions.
|
||||
+/* Optimized strchrnul implementation using LoongArch LSX instructions.
|
||||
Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S b/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S
|
||||
index f5f4f336..ba1f9667 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S
|
||||
@@ -1,4 +1,4 @@
|
||||
-/* Optimized strcmp implementation using basic Loongarch instructions.
|
||||
+/* Optimized strcmp implementation using basic LoongArch instructions.
|
||||
Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S
|
||||
index 2e177a38..091c8c9e 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S
|
||||
@@ -1,4 +1,4 @@
|
||||
-/* Optimized strcmp implementation using Loongarch LSX instructions.
|
||||
+/* Optimized strcmp implementation using LoongArch LSX instructions.
|
||||
Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S b/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S
|
||||
index e9e1d2fc..ed0548e4 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S
|
||||
@@ -1,4 +1,4 @@
|
||||
-/* Optimized strlen implementation using basic Loongarch instructions.
|
||||
+/* Optimized strlen implementation using basic LoongArch instructions.
|
||||
Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S
|
||||
index 258c47ce..91342f34 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S
|
||||
@@ -1,4 +1,4 @@
|
||||
-/* Optimized strlen implementation using loongarch LASX SIMD instructions.
|
||||
+/* Optimized strlen implementation using LoongArch LASX instructions.
|
||||
Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S
|
||||
index b194355e..b09c12e0 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S
|
||||
@@ -1,4 +1,4 @@
|
||||
-/* Optimized strlen implementation using Loongarch LSX SIMD instructions.
|
||||
+/* Optimized strlen implementation using LoongArch LSX instructions.
|
||||
Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S b/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S
|
||||
index e2687fa7..f63de872 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S
|
||||
@@ -1,4 +1,4 @@
|
||||
-/* Optimized strncmp implementation using basic Loongarch instructions.
|
||||
+/* Optimized strncmp implementation using basic LoongArch instructions.
|
||||
Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S
|
||||
index 0b4eee2a..83cb801d 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S
|
||||
@@ -1,4 +1,4 @@
|
||||
-/* Optimized strncmp implementation using Loongarch LSX instructions.
|
||||
+/* Optimized strncmp implementation using LoongArch LSX instructions.
|
||||
Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S b/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S
|
||||
index b900430a..a8296a1b 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S
|
||||
@@ -1,4 +1,4 @@
|
||||
-/* Optimized strnlen implementation using basic Loongarch instructions.
|
||||
+/* Optimized strnlen implementation using basic LoongArch instructions.
|
||||
Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S b/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S
|
||||
index 2c03d3d9..aa6c812d 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S
|
||||
@@ -1,4 +1,4 @@
|
||||
-/* Optimized strnlen implementation using loongarch LASX instructions
|
||||
+/* Optimized strnlen implementation using LoongArch LASX instructions
|
||||
Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S b/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S
|
||||
index b769a895..d0febe3e 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S
|
||||
@@ -1,4 +1,4 @@
|
||||
-/* Optimized strnlen implementation using loongarch LSX instructions
|
||||
+/* Optimized strnlen implementation using LoongArch LSX instructions
|
||||
Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
--
|
||||
2.33.0
|
||||
|
67
LoongArch-Change-to-put-magic-number-to-.rodata-sect.patch
Normal file
67
LoongArch-Change-to-put-magic-number-to-.rodata-sect.patch
Normal file
|
@ -0,0 +1,67 @@
|
|||
From fb72c81f9894b23797f6e2e066532c0963f5155f Mon Sep 17 00:00:00 2001
|
||||
From: dengjianbo <dengjianbo@loongson.cn>
|
||||
Date: Wed, 13 Sep 2023 15:35:01 +0800
|
||||
Subject: [PATCH 24/29] LoongArch: Change to put magic number to .rodata
|
||||
section
|
||||
|
||||
Change to put magic number to .rodata section in memmove-lsx, and use
|
||||
pcalau12i and %pc_lo12 with vld to get the data.
|
||||
|
||||
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
|
||||
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
|
||||
---
|
||||
.../loongarch/lp64/multiarch/memmove-lsx.S | 20 +++++++++----------
|
||||
1 file changed, 10 insertions(+), 10 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S b/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S
|
||||
index 8a936770..5eb819ef 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S
|
||||
@@ -209,13 +209,10 @@ L(al_less_16):
|
||||
nop
|
||||
|
||||
|
||||
-L(magic_num):
|
||||
- .dword 0x0706050403020100
|
||||
- .dword 0x0f0e0d0c0b0a0908
|
||||
L(unaligned):
|
||||
- pcaddi t2, -4
|
||||
+ pcalau12i t2, %pc_hi20(L(INDEX))
|
||||
bstrins.d a1, zero, 3, 0
|
||||
- vld vr8, t2, 0
|
||||
+ vld vr8, t2, %pc_lo12(L(INDEX))
|
||||
vld vr0, a1, 0
|
||||
|
||||
vld vr1, a1, 16
|
||||
@@ -413,13 +410,10 @@ L(back_al_less_16):
|
||||
vst vr1, a0, 0
|
||||
jr ra
|
||||
|
||||
-L(magic_num_2):
|
||||
- .dword 0x0706050403020100
|
||||
- .dword 0x0f0e0d0c0b0a0908
|
||||
L(back_unaligned):
|
||||
- pcaddi t2, -4
|
||||
+ pcalau12i t2, %pc_hi20(L(INDEX))
|
||||
bstrins.d a4, zero, 3, 0
|
||||
- vld vr8, t2, 0
|
||||
+ vld vr8, t2, %pc_lo12(L(INDEX))
|
||||
vld vr0, a4, 0
|
||||
|
||||
vld vr1, a4, -16
|
||||
@@ -529,6 +523,12 @@ L(back_un_less_16):
|
||||
jr ra
|
||||
END(MEMMOVE_NAME)
|
||||
|
||||
+ .section .rodata.cst16,"M",@progbits,16
|
||||
+ .align 4
|
||||
+L(INDEX):
|
||||
+ .dword 0x0706050403020100
|
||||
+ .dword 0x0f0e0d0c0b0a0908
|
||||
+
|
||||
libc_hidden_builtin_def (MEMCPY_NAME)
|
||||
libc_hidden_builtin_def (MEMMOVE_NAME)
|
||||
#endif
|
||||
--
|
||||
2.33.0
|
||||
|
44
LoongArch-Micro-optimize-LD_PCREL.patch
Normal file
44
LoongArch-Micro-optimize-LD_PCREL.patch
Normal file
|
@ -0,0 +1,44 @@
|
|||
From 7f703cf758c4f185dd62f2a4f463002bb514af16 Mon Sep 17 00:00:00 2001
|
||||
From: Xi Ruoyao <xry111@xry111.site>
|
||||
Date: Sun, 27 Aug 2023 00:36:51 +0800
|
||||
Subject: [PATCH 13/29] LoongArch: Micro-optimize LD_PCREL
|
||||
|
||||
We are requiring Binutils >= 2.41, so explicit relocation syntax is
|
||||
always supported by the assembler. Use it to reduce one instruction.
|
||||
|
||||
Signed-off-by: Xi Ruoyao <xry111@xry111.site>
|
||||
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
|
||||
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
|
||||
---
|
||||
sysdeps/unix/sysv/linux/loongarch/pointer_guard.h | 10 ++++------
|
||||
1 file changed, 4 insertions(+), 6 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/unix/sysv/linux/loongarch/pointer_guard.h b/sysdeps/unix/sysv/linux/loongarch/pointer_guard.h
|
||||
index b25e353b..d6c78687 100644
|
||||
--- a/sysdeps/unix/sysv/linux/loongarch/pointer_guard.h
|
||||
+++ b/sysdeps/unix/sysv/linux/loongarch/pointer_guard.h
|
||||
@@ -19,17 +19,15 @@
|
||||
#ifndef POINTER_GUARD_H
|
||||
#define POINTER_GUARD_H
|
||||
|
||||
-/* Load a got-relative EXPR into G, using T.
|
||||
- Note G and T are register names. */
|
||||
+/* Load a got-relative EXPR into register G. */
|
||||
#define LD_GLOBAL(G, EXPR) \
|
||||
la.global G, EXPR; \
|
||||
REG_L G, G, 0;
|
||||
|
||||
-/* Load a pc-relative EXPR into G, using T.
|
||||
- Note G and T are register names. */
|
||||
+/* Load a pc-relative EXPR into register G. */
|
||||
#define LD_PCREL(G, EXPR) \
|
||||
- la.pcrel G, EXPR; \
|
||||
- REG_L G, G, 0;
|
||||
+ pcalau12i G, %pc_hi20(EXPR); \
|
||||
+ REG_L G, G, %pc_lo12(EXPR);
|
||||
|
||||
#if (IS_IN (rtld) \
|
||||
|| (!defined SHARED && (IS_IN (libc) \
|
||||
--
|
||||
2.33.0
|
||||
|
65
LoongArch-Redefine-macro-LEAF-ENTRY.patch
Normal file
65
LoongArch-Redefine-macro-LEAF-ENTRY.patch
Normal file
|
@ -0,0 +1,65 @@
|
|||
From 8dcd8c837df2e3cf81675522487697522f1542f8 Mon Sep 17 00:00:00 2001
|
||||
From: dengjianbo <dengjianbo@loongson.cn>
|
||||
Date: Tue, 8 Aug 2023 14:15:42 +0800
|
||||
Subject: [PATCH 01/29] LoongArch: Redefine macro LEAF/ENTRY.
|
||||
|
||||
The following usage of macro LEAF/ENTRY are all feasible:
|
||||
1. LEAF(fcn) -- the align value of fcn is .align 3(default value)
|
||||
2. LEAF(fcn, 6) -- the align value of fcn is .align 6
|
||||
|
||||
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
|
||||
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
|
||||
---
|
||||
sysdeps/loongarch/sys/asm.h | 36 ++++++++++++++++++++++++++----------
|
||||
1 file changed, 26 insertions(+), 10 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/loongarch/sys/asm.h b/sysdeps/loongarch/sys/asm.h
|
||||
index d1a279b8..c5eb8afa 100644
|
||||
--- a/sysdeps/loongarch/sys/asm.h
|
||||
+++ b/sysdeps/loongarch/sys/asm.h
|
||||
@@ -39,16 +39,32 @@
|
||||
#define FREG_L fld.d
|
||||
#define FREG_S fst.d
|
||||
|
||||
-/* Declare leaf routine. */
|
||||
-#define LEAF(symbol) \
|
||||
- .text; \
|
||||
- .globl symbol; \
|
||||
- .align 3; \
|
||||
- cfi_startproc; \
|
||||
- .type symbol, @function; \
|
||||
- symbol:
|
||||
-
|
||||
-#define ENTRY(symbol) LEAF (symbol)
|
||||
+/* Declare leaf routine.
|
||||
+ The usage of macro LEAF/ENTRY is as follows:
|
||||
+ 1. LEAF(fcn) -- the align value of fcn is .align 3 (default value)
|
||||
+ 2. LEAF(fcn, 6) -- the align value of fcn is .align 6
|
||||
+*/
|
||||
+#define LEAF_IMPL(symbol, aln, ...) \
|
||||
+ .text; \
|
||||
+ .globl symbol; \
|
||||
+ .align aln; \
|
||||
+ .type symbol, @function; \
|
||||
+symbol: \
|
||||
+ cfi_startproc;
|
||||
+
|
||||
+
|
||||
+#define LEAF(...) LEAF_IMPL(__VA_ARGS__, 3)
|
||||
+#define ENTRY(...) LEAF(__VA_ARGS__)
|
||||
+
|
||||
+#define LEAF_NO_ALIGN(symbol) \
|
||||
+ .text; \
|
||||
+ .globl symbol; \
|
||||
+ .type symbol, @function; \
|
||||
+symbol: \
|
||||
+ cfi_startproc;
|
||||
+
|
||||
+#define ENTRY_NO_ALIGN(symbol) LEAF_NO_ALIGN(symbol)
|
||||
+
|
||||
|
||||
/* Mark end of function. */
|
||||
#undef END
|
||||
--
|
||||
2.33.0
|
||||
|
56
LoongArch-Remove-support-code-for-old-linker-in-star.patch
Normal file
56
LoongArch-Remove-support-code-for-old-linker-in-star.patch
Normal file
|
@ -0,0 +1,56 @@
|
|||
From f8d66a269cb6f1a7087afadf3375bdf0553abf53 Mon Sep 17 00:00:00 2001
|
||||
From: Xi Ruoyao <xry111@xry111.site>
|
||||
Date: Sun, 27 Aug 2023 00:36:50 +0800
|
||||
Subject: [PATCH 12/29] LoongArch: Remove support code for old linker in
|
||||
start.S
|
||||
|
||||
We are requiring Binutils >= 2.41, so la.pcrel always works here.
|
||||
|
||||
Signed-off-by: Xi Ruoyao <xry111@xry111.site>
|
||||
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
|
||||
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
|
||||
---
|
||||
sysdeps/loongarch/start.S | 19 +++----------------
|
||||
1 file changed, 3 insertions(+), 16 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/loongarch/start.S b/sysdeps/loongarch/start.S
|
||||
index e9d82033..bf6bfc9e 100644
|
||||
--- a/sysdeps/loongarch/start.S
|
||||
+++ b/sysdeps/loongarch/start.S
|
||||
@@ -60,20 +60,7 @@ ENTRY (ENTRY_POINT)
|
||||
cfi_undefined (1)
|
||||
or a5, a0, zero /* rtld_fini */
|
||||
|
||||
-#if ENABLE_STATIC_PIE
|
||||
-/* For static PIE, the GOT cannot be used in _start because the GOT entries are
|
||||
- offsets instead of real addresses before __libc_start_main.
|
||||
- __libc_start_main and/or main may be not local, so we rely on the linker to
|
||||
- produce PLT entries for them. GNU ld >= 2.40 supports this. */
|
||||
-# define LA la.pcrel
|
||||
-#else
|
||||
-/* Old GNU ld (< 2.40) cannot handle PC relative address against a non-local
|
||||
- function correctly. We deem these old linkers failing to support static PIE
|
||||
- and load the addresses from GOT. */
|
||||
-# define LA la.got
|
||||
-#endif
|
||||
-
|
||||
- LA a0, t0, main
|
||||
+ la.pcrel a0, t0, main
|
||||
REG_L a1, sp, 0
|
||||
ADDI a2, sp, SZREG
|
||||
|
||||
@@ -84,9 +71,9 @@ ENTRY (ENTRY_POINT)
|
||||
move a4, zero /* used to be fini */
|
||||
or a6, sp, zero /* stack_end */
|
||||
|
||||
- LA ra, t0, __libc_start_main
|
||||
+ la.pcrel ra, t0, __libc_start_main
|
||||
jirl ra, ra, 0
|
||||
|
||||
- LA ra, t0, abort
|
||||
+ la.pcrel ra, t0, abort
|
||||
jirl ra, ra, 0
|
||||
END (ENTRY_POINT)
|
||||
--
|
||||
2.33.0
|
||||
|
28
LoongArch-Replace-deprecated-v0-with-a0-to-eliminate.patch
Normal file
28
LoongArch-Replace-deprecated-v0-with-a0-to-eliminate.patch
Normal file
|
@ -0,0 +1,28 @@
|
|||
From b4b4bb7c9220a0bbdf5aec0ac8c1de1d22329280 Mon Sep 17 00:00:00 2001
|
||||
From: caiyinyu <caiyinyu@loongson.cn>
|
||||
Date: Thu, 14 Sep 2023 19:48:24 +0800
|
||||
Subject: [PATCH 21/29] LoongArch: Replace deprecated $v0 with $a0 to eliminate
|
||||
'as' Warnings.
|
||||
|
||||
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
|
||||
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
|
||||
---
|
||||
sysdeps/loongarch/dl-machine.h | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h
|
||||
index 8a2db9de..57913cef 100644
|
||||
--- a/sysdeps/loongarch/dl-machine.h
|
||||
+++ b/sysdeps/loongarch/dl-machine.h
|
||||
@@ -90,7 +90,7 @@ static inline ElfW (Addr) elf_machine_dynamic (void)
|
||||
or $a0, $sp, $zero \n\
|
||||
bl _dl_start \n\
|
||||
# Stash user entry point in s0. \n\
|
||||
- or $s0, $v0, $zero \n\
|
||||
+ or $s0, $a0, $zero \n\
|
||||
# Load the original argument count. \n\
|
||||
ld.d $a1, $sp, 0 \n\
|
||||
# Call _dl_init (struct link_map *main_map, int argc, \
|
||||
--
|
||||
2.33.0
|
||||
|
81
LoongArch-Unify-Register-Names.patch
Normal file
81
LoongArch-Unify-Register-Names.patch
Normal file
|
@ -0,0 +1,81 @@
|
|||
From 458ab6d5f39cca1cabd83abd2022f67491f6f5ed Mon Sep 17 00:00:00 2001
|
||||
From: caiyinyu <caiyinyu@loongson.cn>
|
||||
Date: Fri, 20 Oct 2023 09:20:02 +0800
|
||||
Subject: [PATCH 27/29] LoongArch: Unify Register Names.
|
||||
|
||||
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
|
||||
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
|
||||
---
|
||||
sysdeps/loongarch/__longjmp.S | 20 ++++++++++----------
|
||||
sysdeps/loongarch/setjmp.S | 18 +++++++++---------
|
||||
2 files changed, 19 insertions(+), 19 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/loongarch/__longjmp.S b/sysdeps/loongarch/__longjmp.S
|
||||
index cbde1946..e87ce311 100644
|
||||
--- a/sysdeps/loongarch/__longjmp.S
|
||||
+++ b/sysdeps/loongarch/__longjmp.S
|
||||
@@ -43,18 +43,18 @@ ENTRY (__longjmp)
|
||||
REG_L s8, a0, 12*SZREG
|
||||
|
||||
#ifndef __loongarch_soft_float
|
||||
- FREG_L $f24, a0, 13*SZREG + 0*SZFREG
|
||||
- FREG_L $f25, a0, 13*SZREG + 1*SZFREG
|
||||
- FREG_L $f26, a0, 13*SZREG + 2*SZFREG
|
||||
- FREG_L $f27, a0, 13*SZREG + 3*SZFREG
|
||||
- FREG_L $f28, a0, 13*SZREG + 4*SZFREG
|
||||
- FREG_L $f29, a0, 13*SZREG + 5*SZFREG
|
||||
- FREG_L $f30, a0, 13*SZREG + 6*SZFREG
|
||||
- FREG_L $f31, a0, 13*SZREG + 7*SZFREG
|
||||
+ FREG_L fs0, a0, 13*SZREG + 0*SZFREG
|
||||
+ FREG_L fs1, a0, 13*SZREG + 1*SZFREG
|
||||
+ FREG_L fs2, a0, 13*SZREG + 2*SZFREG
|
||||
+ FREG_L fs3, a0, 13*SZREG + 3*SZFREG
|
||||
+ FREG_L fs4, a0, 13*SZREG + 4*SZFREG
|
||||
+ FREG_L fs5, a0, 13*SZREG + 5*SZFREG
|
||||
+ FREG_L fs6, a0, 13*SZREG + 6*SZFREG
|
||||
+ FREG_L fs7, a0, 13*SZREG + 7*SZFREG
|
||||
#endif
|
||||
|
||||
- sltui a0,a1,1
|
||||
+ sltui a0, a1, 1
|
||||
ADD a0, a0, a1 # a0 = (a1 == 0) ? 1 : a1
|
||||
- jirl zero,ra,0
|
||||
+ jirl zero, ra, 0
|
||||
|
||||
END (__longjmp)
|
||||
diff --git a/sysdeps/loongarch/setjmp.S b/sysdeps/loongarch/setjmp.S
|
||||
index 6c7065cd..b6e4f727 100644
|
||||
--- a/sysdeps/loongarch/setjmp.S
|
||||
+++ b/sysdeps/loongarch/setjmp.S
|
||||
@@ -52,19 +52,19 @@ ENTRY (__sigsetjmp)
|
||||
REG_S s8, a0, 12*SZREG
|
||||
|
||||
#ifndef __loongarch_soft_float
|
||||
- FREG_S $f24, a0, 13*SZREG + 0*SZFREG
|
||||
- FREG_S $f25, a0, 13*SZREG + 1*SZFREG
|
||||
- FREG_S $f26, a0, 13*SZREG + 2*SZFREG
|
||||
- FREG_S $f27, a0, 13*SZREG + 3*SZFREG
|
||||
- FREG_S $f28, a0, 13*SZREG + 4*SZFREG
|
||||
- FREG_S $f29, a0, 13*SZREG + 5*SZFREG
|
||||
- FREG_S $f30, a0, 13*SZREG + 6*SZFREG
|
||||
- FREG_S $f31, a0, 13*SZREG + 7*SZFREG
|
||||
+ FREG_S fs0, a0, 13*SZREG + 0*SZFREG
|
||||
+ FREG_S fs1, a0, 13*SZREG + 1*SZFREG
|
||||
+ FREG_S fs2, a0, 13*SZREG + 2*SZFREG
|
||||
+ FREG_S fs3, a0, 13*SZREG + 3*SZFREG
|
||||
+ FREG_S fs4, a0, 13*SZREG + 4*SZFREG
|
||||
+ FREG_S fs5, a0, 13*SZREG + 5*SZFREG
|
||||
+ FREG_S fs6, a0, 13*SZREG + 6*SZFREG
|
||||
+ FREG_S fs7, a0, 13*SZREG + 7*SZFREG
|
||||
#endif
|
||||
|
||||
#if !IS_IN (libc) && IS_IN(rtld)
|
||||
li.w v0, 0
|
||||
- jirl zero,ra,0
|
||||
+ jirl zero, ra, 0
|
||||
#else
|
||||
b __sigjmp_save
|
||||
#endif
|
||||
--
|
||||
2.33.0
|
||||
|
24
LoongArch-Update-hwcap.h-to-sync-with-LoongArch-kern.patch
Normal file
24
LoongArch-Update-hwcap.h-to-sync-with-LoongArch-kern.patch
Normal file
|
@ -0,0 +1,24 @@
|
|||
From 4828d1aa0028e819a5fb336d962e8f7cbfedf8b4 Mon Sep 17 00:00:00 2001
|
||||
From: caiyinyu <caiyinyu@loongson.cn>
|
||||
Date: Mon, 23 Oct 2023 15:53:38 +0800
|
||||
Subject: [PATCH 28/29] LoongArch: Update hwcap.h to sync with LoongArch
|
||||
kernel.
|
||||
|
||||
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
|
||||
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
|
||||
---
|
||||
sysdeps/unix/sysv/linux/loongarch/bits/hwcap.h | 1 +
|
||||
1 file changed, 1 insertion(+)
|
||||
|
||||
diff --git a/sysdeps/unix/sysv/linux/loongarch/bits/hwcap.h b/sysdeps/unix/sysv/linux/loongarch/bits/hwcap.h
|
||||
index 5104b69c..7acec23d 100644
|
||||
--- a/sysdeps/unix/sysv/linux/loongarch/bits/hwcap.h
|
||||
+++ b/sysdeps/unix/sysv/linux/loongarch/bits/hwcap.h
|
||||
@@ -35,3 +35,4 @@
|
||||
#define HWCAP_LOONGARCH_LBT_X86 (1 << 10)
|
||||
#define HWCAP_LOONGARCH_LBT_ARM (1 << 11)
|
||||
#define HWCAP_LOONGARCH_LBT_MIPS (1 << 12)
|
||||
+#define HWCAP_LOONGARCH_PTW (1 << 13)
|
||||
--
|
||||
2.33.0
|
||||
|
30
LoongArch-elf-Add-new-LoongArch-reloc-types-109-into.patch
Normal file
30
LoongArch-elf-Add-new-LoongArch-reloc-types-109-into.patch
Normal file
|
@ -0,0 +1,30 @@
|
|||
From 4938840b15ff9734fdcc63cc0744ce3f3bbb0b16 Mon Sep 17 00:00:00 2001
|
||||
From: caiyinyu <caiyinyu@loongson.cn>
|
||||
Date: Mon, 14 Aug 2023 15:34:08 +0800
|
||||
Subject: [PATCH 05/29] LoongArch: elf: Add new LoongArch reloc types 109 into
|
||||
elf.h
|
||||
|
||||
These reloc types are generated by GNU assembler >= 2.41 for relaxation
|
||||
support.
|
||||
|
||||
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
|
||||
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
|
||||
---
|
||||
elf/elf.h | 1 +
|
||||
1 file changed, 1 insertion(+)
|
||||
|
||||
diff --git a/elf/elf.h b/elf/elf.h
|
||||
index d623bdeb..9c51073f 100644
|
||||
--- a/elf/elf.h
|
||||
+++ b/elf/elf.h
|
||||
@@ -4213,6 +4213,7 @@ enum
|
||||
#define R_LARCH_SUB6 106
|
||||
#define R_LARCH_ADD_ULEB128 107
|
||||
#define R_LARCH_SUB_ULEB128 108
|
||||
+#define R_LARCH_64_PCREL 109
|
||||
|
||||
/* ARC specific declarations. */
|
||||
|
||||
--
|
||||
2.33.0
|
||||
|
528
Loongarch-Add-ifunc-support-and-add-different-versio.patch
Normal file
528
Loongarch-Add-ifunc-support-and-add-different-versio.patch
Normal file
|
@ -0,0 +1,528 @@
|
|||
From 43abd8772a143cd96688c081500397dd712e631b Mon Sep 17 00:00:00 2001
|
||||
From: dengjianbo <dengjianbo@loongson.cn>
|
||||
Date: Tue, 8 Aug 2023 14:15:44 +0800
|
||||
Subject: [PATCH 03/29] Loongarch: Add ifunc support and add different versions
|
||||
of strlen
|
||||
|
||||
strlen-lasx is implemeted by LASX simd instructions(256bit)
|
||||
strlen-lsx is implemeted by LSX simd instructions(128bit)
|
||||
strlen-align is implemented by LA basic instructions and never use unaligned memory acess
|
||||
|
||||
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
|
||||
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
|
||||
---
|
||||
sysdeps/loongarch/lp64/multiarch/Makefile | 7 ++
|
||||
.../lp64/multiarch/ifunc-impl-list.c | 41 +++++++
|
||||
.../loongarch/lp64/multiarch/ifunc-strlen.h | 40 +++++++
|
||||
.../loongarch/lp64/multiarch/strlen-aligned.S | 100 ++++++++++++++++++
|
||||
.../loongarch/lp64/multiarch/strlen-lasx.S | 63 +++++++++++
|
||||
sysdeps/loongarch/lp64/multiarch/strlen-lsx.S | 71 +++++++++++++
|
||||
sysdeps/loongarch/lp64/multiarch/strlen.c | 37 +++++++
|
||||
sysdeps/loongarch/sys/regdef.h | 57 ++++++++++
|
||||
.../unix/sysv/linux/loongarch/cpu-features.h | 2 +
|
||||
9 files changed, 418 insertions(+)
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/Makefile
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen-aligned.S
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen-lasx.S
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen-lsx.S
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen.c
|
||||
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile
|
||||
new file mode 100644
|
||||
index 00000000..76c506c9
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/Makefile
|
||||
@@ -0,0 +1,7 @@
|
||||
+ifeq ($(subdir),string)
|
||||
+sysdep_routines += \
|
||||
+ strlen-aligned \
|
||||
+ strlen-lsx \
|
||||
+ strlen-lasx \
|
||||
+# sysdep_routines
|
||||
+endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
|
||||
new file mode 100644
|
||||
index 00000000..1a2a576f
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
|
||||
@@ -0,0 +1,41 @@
|
||||
+/* Enumerate available IFUNC implementations of a function LoongArch64 version.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <http://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <assert.h>
|
||||
+#include <string.h>
|
||||
+#include <wchar.h>
|
||||
+#include <ldsodefs.h>
|
||||
+#include <ifunc-impl-list.h>
|
||||
+#include <stdio.h>
|
||||
+
|
||||
+size_t
|
||||
+__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
+ size_t max)
|
||||
+{
|
||||
+
|
||||
+ size_t i = max;
|
||||
+
|
||||
+ IFUNC_IMPL (i, name, strlen,
|
||||
+#if !defined __loongarch_soft_float
|
||||
+ IFUNC_IMPL_ADD (array, i, strlen, SUPPORT_LASX, __strlen_lasx)
|
||||
+ IFUNC_IMPL_ADD (array, i, strlen, SUPPORT_LSX, __strlen_lsx)
|
||||
+#endif
|
||||
+ IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_aligned)
|
||||
+ )
|
||||
+ return i;
|
||||
+}
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h b/sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h
|
||||
new file mode 100644
|
||||
index 00000000..6258bb76
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h
|
||||
@@ -0,0 +1,40 @@
|
||||
+/* Common definition for strlen ifunc selections.
|
||||
+ All versions must be listed in ifunc-impl-list.c.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <ldsodefs.h>
|
||||
+#include <ifunc-init.h>
|
||||
+
|
||||
+#if !defined __loongarch_soft_float
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lasx) attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden;
|
||||
+#endif
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden;
|
||||
+
|
||||
+static inline void *
|
||||
+IFUNC_SELECTOR (void)
|
||||
+{
|
||||
+#if !defined __loongarch_soft_float
|
||||
+ if (SUPPORT_LASX)
|
||||
+ return OPTIMIZE (lasx);
|
||||
+ else if (SUPPORT_LSX)
|
||||
+ return OPTIMIZE (lsx);
|
||||
+ else
|
||||
+#endif
|
||||
+ return OPTIMIZE (aligned);
|
||||
+}
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S b/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S
|
||||
new file mode 100644
|
||||
index 00000000..e9e1d2fc
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S
|
||||
@@ -0,0 +1,100 @@
|
||||
+/* Optimized strlen implementation using basic Loongarch instructions.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library. If not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdep.h>
|
||||
+#include <sys/regdef.h>
|
||||
+#include <sys/asm.h>
|
||||
+
|
||||
+#if IS_IN (libc)
|
||||
+# define STRLEN __strlen_aligned
|
||||
+#else
|
||||
+# define STRLEN strlen
|
||||
+#endif
|
||||
+
|
||||
+LEAF(STRLEN, 6)
|
||||
+ move a1, a0
|
||||
+ bstrins.d a0, zero, 2, 0
|
||||
+ lu12i.w a2, 0x01010
|
||||
+ li.w t0, -1
|
||||
+
|
||||
+ ld.d t2, a0, 0
|
||||
+ andi t1, a1, 0x7
|
||||
+ ori a2, a2, 0x101
|
||||
+ slli.d t1, t1, 3
|
||||
+
|
||||
+ bstrins.d a2, a2, 63, 32
|
||||
+ sll.d t1, t0, t1
|
||||
+ slli.d t3, a2, 7
|
||||
+ nor a3, zero, t3
|
||||
+
|
||||
+ orn t2, t2, t1
|
||||
+ sub.d t0, t2, a2
|
||||
+ nor t1, t2, a3
|
||||
+ and t0, t0, t1
|
||||
+
|
||||
+
|
||||
+ bnez t0, L(count_pos)
|
||||
+ addi.d a0, a0, 8
|
||||
+L(loop_16_7bit):
|
||||
+ ld.d t2, a0, 0
|
||||
+ sub.d t1, t2, a2
|
||||
+
|
||||
+ and t0, t1, t3
|
||||
+ bnez t0, L(more_check)
|
||||
+ ld.d t2, a0, 8
|
||||
+ sub.d t1, t2, a2
|
||||
+
|
||||
+ and t0, t1, t3
|
||||
+ addi.d a0, a0, 16
|
||||
+ beqz t0, L(loop_16_7bit)
|
||||
+ addi.d a0, a0, -8
|
||||
+
|
||||
+L(more_check):
|
||||
+ nor t0, t2, a3
|
||||
+ and t0, t1, t0
|
||||
+ bnez t0, L(count_pos)
|
||||
+ addi.d a0, a0, 8
|
||||
+
|
||||
+
|
||||
+L(loop_16_8bit):
|
||||
+ ld.d t2, a0, 0
|
||||
+ sub.d t1, t2, a2
|
||||
+ nor t0, t2, a3
|
||||
+ and t0, t0, t1
|
||||
+
|
||||
+ bnez t0, L(count_pos)
|
||||
+ ld.d t2, a0, 8
|
||||
+ addi.d a0, a0, 16
|
||||
+ sub.d t1, t2, a2
|
||||
+
|
||||
+ nor t0, t2, a3
|
||||
+ and t0, t0, t1
|
||||
+ beqz t0, L(loop_16_8bit)
|
||||
+ addi.d a0, a0, -8
|
||||
+
|
||||
+L(count_pos):
|
||||
+ ctz.d t1, t0
|
||||
+ sub.d a0, a0, a1
|
||||
+ srli.d t1, t1, 3
|
||||
+ add.d a0, a0, t1
|
||||
+
|
||||
+ jr ra
|
||||
+END(STRLEN)
|
||||
+
|
||||
+libc_hidden_builtin_def (STRLEN)
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S
|
||||
new file mode 100644
|
||||
index 00000000..258c47ce
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S
|
||||
@@ -0,0 +1,63 @@
|
||||
+/* Optimized strlen implementation using loongarch LASX SIMD instructions.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library. If not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdep.h>
|
||||
+#include <sys/regdef.h>
|
||||
+#include <sys/asm.h>
|
||||
+
|
||||
+#if IS_IN (libc) && !defined __loongarch_soft_float
|
||||
+
|
||||
+# define STRLEN __strlen_lasx
|
||||
+
|
||||
+LEAF(STRLEN, 6)
|
||||
+ move a1, a0
|
||||
+ bstrins.d a0, zero, 4, 0
|
||||
+ li.d t1, -1
|
||||
+ xvld xr0, a0, 0
|
||||
+
|
||||
+ xvmsknz.b xr0, xr0
|
||||
+ xvpickve.w xr1, xr0, 4
|
||||
+ vilvl.h vr0, vr1, vr0
|
||||
+ movfr2gr.s t0, fa0 # sign extend
|
||||
+
|
||||
+ sra.w t0, t0, a1
|
||||
+ beq t0, t1, L(loop)
|
||||
+ cto.w a0, t0
|
||||
+ jr ra
|
||||
+
|
||||
+L(loop):
|
||||
+ xvld xr0, a0, 32
|
||||
+ addi.d a0, a0, 32
|
||||
+ xvsetanyeqz.b fcc0, xr0
|
||||
+ bceqz fcc0, L(loop)
|
||||
+
|
||||
+
|
||||
+ xvmsknz.b xr0, xr0
|
||||
+ sub.d a0, a0, a1
|
||||
+ xvpickve.w xr1, xr0, 4
|
||||
+ vilvl.h vr0, vr1, vr0
|
||||
+
|
||||
+ movfr2gr.s t0, fa0
|
||||
+ cto.w t0, t0
|
||||
+ add.d a0, a0, t0
|
||||
+ jr ra
|
||||
+END(STRLEN)
|
||||
+
|
||||
+libc_hidden_builtin_def (STRLEN)
|
||||
+#endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S
|
||||
new file mode 100644
|
||||
index 00000000..b194355e
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S
|
||||
@@ -0,0 +1,71 @@
|
||||
+/* Optimized strlen implementation using Loongarch LSX SIMD instructions.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library. If not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdep.h>
|
||||
+#include <sys/regdef.h>
|
||||
+#include <sys/asm.h>
|
||||
+
|
||||
+#if IS_IN (libc) && !defined __loongarch_soft_float
|
||||
+
|
||||
+# define STRLEN __strlen_lsx
|
||||
+
|
||||
+LEAF(STRLEN, 6)
|
||||
+ move a1, a0
|
||||
+ bstrins.d a0, zero, 4, 0
|
||||
+ vld vr0, a0, 0
|
||||
+ vld vr1, a0, 16
|
||||
+
|
||||
+ li.d t1, -1
|
||||
+ vmsknz.b vr0, vr0
|
||||
+ vmsknz.b vr1, vr1
|
||||
+ vilvl.h vr0, vr1, vr0
|
||||
+
|
||||
+ movfr2gr.s t0, fa0
|
||||
+ sra.w t0, t0, a1
|
||||
+ beq t0, t1, L(loop)
|
||||
+ cto.w a0, t0
|
||||
+
|
||||
+ jr ra
|
||||
+ nop
|
||||
+ nop
|
||||
+ nop
|
||||
+
|
||||
+
|
||||
+L(loop):
|
||||
+ vld vr0, a0, 32
|
||||
+ vld vr1, a0, 48
|
||||
+ addi.d a0, a0, 32
|
||||
+ vmin.bu vr2, vr0, vr1
|
||||
+
|
||||
+ vsetanyeqz.b fcc0, vr2
|
||||
+ bceqz fcc0, L(loop)
|
||||
+ vmsknz.b vr0, vr0
|
||||
+ vmsknz.b vr1, vr1
|
||||
+
|
||||
+ vilvl.h vr0, vr1, vr0
|
||||
+ sub.d a0, a0, a1
|
||||
+ movfr2gr.s t0, fa0
|
||||
+ cto.w t0, t0
|
||||
+
|
||||
+ add.d a0, a0, t0
|
||||
+ jr ra
|
||||
+END(STRLEN)
|
||||
+
|
||||
+libc_hidden_builtin_def (STRLEN)
|
||||
+#endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strlen.c b/sysdeps/loongarch/lp64/multiarch/strlen.c
|
||||
new file mode 100644
|
||||
index 00000000..381c2daa
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strlen.c
|
||||
@@ -0,0 +1,37 @@
|
||||
+/* Multiple versions of strlen.
|
||||
+ All versions must be listed in ifunc-impl-list.c.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+/* Define multiple versions only for the definition in libc. */
|
||||
+
|
||||
+#if IS_IN (libc)
|
||||
+# define strlen __redirect_strlen
|
||||
+# include <string.h>
|
||||
+# undef strlen
|
||||
+
|
||||
+# define SYMBOL_NAME strlen
|
||||
+# include "ifunc-strlen.h"
|
||||
+
|
||||
+libc_ifunc_redirected (__redirect_strlen, strlen, IFUNC_SELECTOR ());
|
||||
+
|
||||
+# ifdef SHARED
|
||||
+__hidden_ver1 (strlen, __GI_strlen, __redirect_strlen)
|
||||
+ __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strlen);
|
||||
+# endif
|
||||
+
|
||||
+#endif
|
||||
diff --git a/sysdeps/loongarch/sys/regdef.h b/sysdeps/loongarch/sys/regdef.h
|
||||
index 5100f36d..524d2e32 100644
|
||||
--- a/sysdeps/loongarch/sys/regdef.h
|
||||
+++ b/sysdeps/loongarch/sys/regdef.h
|
||||
@@ -89,6 +89,14 @@
|
||||
#define fs5 $f29
|
||||
#define fs6 $f30
|
||||
#define fs7 $f31
|
||||
+#define fcc0 $fcc0
|
||||
+#define fcc1 $fcc1
|
||||
+#define fcc2 $fcc2
|
||||
+#define fcc3 $fcc3
|
||||
+#define fcc4 $fcc4
|
||||
+#define fcc5 $fcc5
|
||||
+#define fcc6 $fcc6
|
||||
+#define fcc7 $fcc7
|
||||
|
||||
#define vr0 $vr0
|
||||
#define vr1 $vr1
|
||||
@@ -98,6 +106,30 @@
|
||||
#define vr5 $vr5
|
||||
#define vr6 $vr6
|
||||
#define vr7 $vr7
|
||||
+#define vr8 $vr8
|
||||
+#define vr9 $vr9
|
||||
+#define vr10 $vr10
|
||||
+#define vr11 $vr11
|
||||
+#define vr12 $vr12
|
||||
+#define vr13 $vr13
|
||||
+#define vr14 $vr14
|
||||
+#define vr15 $vr15
|
||||
+#define vr16 $vr16
|
||||
+#define vr17 $vr17
|
||||
+#define vr18 $vr18
|
||||
+#define vr19 $vr19
|
||||
+#define vr20 $vr20
|
||||
+#define vr21 $vr21
|
||||
+#define vr22 $vr22
|
||||
+#define vr23 $vr23
|
||||
+#define vr24 $vr24
|
||||
+#define vr25 $vr25
|
||||
+#define vr26 $vr26
|
||||
+#define vr27 $vr27
|
||||
+#define vr28 $vr28
|
||||
+#define vr29 $vr29
|
||||
+#define vr30 $vr30
|
||||
+#define vr31 $vr31
|
||||
|
||||
#define xr0 $xr0
|
||||
#define xr1 $xr1
|
||||
@@ -107,5 +139,30 @@
|
||||
#define xr5 $xr5
|
||||
#define xr6 $xr6
|
||||
#define xr7 $xr7
|
||||
+#define xr7 $xr7
|
||||
+#define xr8 $xr8
|
||||
+#define xr9 $xr9
|
||||
+#define xr10 $xr10
|
||||
+#define xr11 $xr11
|
||||
+#define xr12 $xr12
|
||||
+#define xr13 $xr13
|
||||
+#define xr14 $xr14
|
||||
+#define xr15 $xr15
|
||||
+#define xr16 $xr16
|
||||
+#define xr17 $xr17
|
||||
+#define xr18 $xr18
|
||||
+#define xr19 $xr19
|
||||
+#define xr20 $xr20
|
||||
+#define xr21 $xr21
|
||||
+#define xr22 $xr22
|
||||
+#define xr23 $xr23
|
||||
+#define xr24 $xr24
|
||||
+#define xr25 $xr25
|
||||
+#define xr26 $xr26
|
||||
+#define xr27 $xr27
|
||||
+#define xr28 $xr28
|
||||
+#define xr29 $xr29
|
||||
+#define xr30 $xr30
|
||||
+#define xr31 $xr31
|
||||
|
||||
#endif /* _SYS_REGDEF_H */
|
||||
diff --git a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
|
||||
index e371e13b..d1a280a5 100644
|
||||
--- a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
|
||||
+++ b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
|
||||
@@ -25,5 +25,7 @@
|
||||
#define SUPPORT_LSX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LSX)
|
||||
#define SUPPORT_LASX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LASX)
|
||||
|
||||
+#define INIT_ARCH()
|
||||
+
|
||||
#endif /* _CPU_FEATURES_LOONGARCH64_H */
|
||||
|
||||
--
|
||||
2.33.0
|
||||
|
2570
Loongarch-Add-ifunc-support-for-memcpy-aligned-unali.patch
Normal file
2570
Loongarch-Add-ifunc-support-for-memcpy-aligned-unali.patch
Normal file
File diff suppressed because it is too large
Load diff
706
Loongarch-Add-ifunc-support-for-strchr-aligned-lsx-l.patch
Normal file
706
Loongarch-Add-ifunc-support-for-strchr-aligned-lsx-l.patch
Normal file
|
@ -0,0 +1,706 @@
|
|||
From aca7d7f0dde5f56344e8e58e5f6648c96bb1f1cc Mon Sep 17 00:00:00 2001
|
||||
From: dengjianbo <dengjianbo@loongson.cn>
|
||||
Date: Tue, 15 Aug 2023 09:08:11 +0800
|
||||
Subject: [PATCH 06/29] Loongarch: Add ifunc support for strchr{aligned, lsx,
|
||||
lasx} and strchrnul{aligned, lsx, lasx}
|
||||
|
||||
These implementations improve the time to run strchr{nul}
|
||||
microbenchmark in glibc as below:
|
||||
strchr-lasx reduces the runtime about 50%-83%
|
||||
strchr-lsx reduces the runtime about 30%-67%
|
||||
strchr-aligned reduces the runtime about 10%-20%
|
||||
strchrnul-lasx reduces the runtime about 50%-83%
|
||||
strchrnul-lsx reduces the runtime about 36%-65%
|
||||
strchrnul-aligned reduces the runtime about 6%-10%
|
||||
|
||||
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
|
||||
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
|
||||
---
|
||||
sysdeps/loongarch/lp64/multiarch/Makefile | 6 ++
|
||||
.../lp64/multiarch/ifunc-impl-list.c | 16 +++
|
||||
.../loongarch/lp64/multiarch/ifunc-strchr.h | 41 ++++++++
|
||||
.../lp64/multiarch/ifunc-strchrnul.h | 41 ++++++++
|
||||
.../loongarch/lp64/multiarch/strchr-aligned.S | 99 +++++++++++++++++++
|
||||
.../loongarch/lp64/multiarch/strchr-lasx.S | 91 +++++++++++++++++
|
||||
sysdeps/loongarch/lp64/multiarch/strchr-lsx.S | 73 ++++++++++++++
|
||||
sysdeps/loongarch/lp64/multiarch/strchr.c | 36 +++++++
|
||||
.../lp64/multiarch/strchrnul-aligned.S | 95 ++++++++++++++++++
|
||||
.../loongarch/lp64/multiarch/strchrnul-lasx.S | 22 +++++
|
||||
.../loongarch/lp64/multiarch/strchrnul-lsx.S | 22 +++++
|
||||
sysdeps/loongarch/lp64/multiarch/strchrnul.c | 39 ++++++++
|
||||
12 files changed, 581 insertions(+)
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-strchr.h
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-strchrnul.h
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/strchr-aligned.S
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/strchr-lasx.S
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/strchr-lsx.S
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/strchr.c
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/strchrnul-lasx.S
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/strchrnul-lsx.S
|
||||
create mode 100644 sysdeps/loongarch/lp64/multiarch/strchrnul.c
|
||||
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile
|
||||
index 76c506c9..110a8c5c 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/Makefile
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/Makefile
|
||||
@@ -3,5 +3,11 @@ sysdep_routines += \
|
||||
strlen-aligned \
|
||||
strlen-lsx \
|
||||
strlen-lasx \
|
||||
+ strchr-aligned \
|
||||
+ strchr-lsx \
|
||||
+ strchr-lasx \
|
||||
+ strchrnul-aligned \
|
||||
+ strchrnul-lsx \
|
||||
+ strchrnul-lasx \
|
||||
# sysdep_routines
|
||||
endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
|
||||
index 1a2a576f..c7164b45 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
|
||||
@@ -37,5 +37,21 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
#endif
|
||||
IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_aligned)
|
||||
)
|
||||
+
|
||||
+ IFUNC_IMPL (i, name, strchr,
|
||||
+#if !defined __loongarch_soft_float
|
||||
+ IFUNC_IMPL_ADD (array, i, strchr, SUPPORT_LASX, __strchr_lasx)
|
||||
+ IFUNC_IMPL_ADD (array, i, strchr, SUPPORT_LSX, __strchr_lsx)
|
||||
+#endif
|
||||
+ IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_aligned)
|
||||
+ )
|
||||
+
|
||||
+ IFUNC_IMPL (i, name, strchrnul,
|
||||
+#if !defined __loongarch_soft_float
|
||||
+ IFUNC_IMPL_ADD (array, i, strchrnul, SUPPORT_LASX, __strchrnul_lasx)
|
||||
+ IFUNC_IMPL_ADD (array, i, strchrnul, SUPPORT_LSX, __strchrnul_lsx)
|
||||
+#endif
|
||||
+ IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_aligned)
|
||||
+ )
|
||||
return i;
|
||||
}
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-strchr.h b/sysdeps/loongarch/lp64/multiarch/ifunc-strchr.h
|
||||
new file mode 100644
|
||||
index 00000000..4494db79
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-strchr.h
|
||||
@@ -0,0 +1,41 @@
|
||||
+/* Common definition for strchr ifunc selections.
|
||||
+ All versions must be listed in ifunc-impl-list.c.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <ldsodefs.h>
|
||||
+#include <ifunc-init.h>
|
||||
+
|
||||
+#if !defined __loongarch_soft_float
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lasx) attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden;
|
||||
+#endif
|
||||
+
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden;
|
||||
+
|
||||
+static inline void *
|
||||
+IFUNC_SELECTOR (void)
|
||||
+{
|
||||
+#if !defined __loongarch_soft_float
|
||||
+ if (SUPPORT_LASX)
|
||||
+ return OPTIMIZE (lasx);
|
||||
+ else if (SUPPORT_LSX)
|
||||
+ return OPTIMIZE (lsx);
|
||||
+ else
|
||||
+#endif
|
||||
+ return OPTIMIZE (aligned);
|
||||
+}
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-strchrnul.h b/sysdeps/loongarch/lp64/multiarch/ifunc-strchrnul.h
|
||||
new file mode 100644
|
||||
index 00000000..8a925120
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-strchrnul.h
|
||||
@@ -0,0 +1,41 @@
|
||||
+/* Common definition for strchrnul ifunc selections.
|
||||
+ All versions must be listed in ifunc-impl-list.c.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <ldsodefs.h>
|
||||
+#include <ifunc-init.h>
|
||||
+
|
||||
+#if !defined __loongarch_soft_float
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lasx) attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden;
|
||||
+#endif
|
||||
+
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden;
|
||||
+
|
||||
+static inline void *
|
||||
+IFUNC_SELECTOR (void)
|
||||
+{
|
||||
+#if !defined __loongarch_soft_float
|
||||
+ if (SUPPORT_LASX)
|
||||
+ return OPTIMIZE (lasx);
|
||||
+ else if (SUPPORT_LSX)
|
||||
+ return OPTIMIZE (lsx);
|
||||
+ else
|
||||
+#endif
|
||||
+ return OPTIMIZE (aligned);
|
||||
+}
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-aligned.S b/sysdeps/loongarch/lp64/multiarch/strchr-aligned.S
|
||||
new file mode 100644
|
||||
index 00000000..5fb01806
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strchr-aligned.S
|
||||
@@ -0,0 +1,99 @@
|
||||
+/* Optimized strchr implementation using basic Loongarch instructions.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library. If not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdep.h>
|
||||
+#include <sys/regdef.h>
|
||||
+#include <sys/asm.h>
|
||||
+
|
||||
+#if IS_IN (libc)
|
||||
+# define STRCHR_NAME __strchr_aligned
|
||||
+#else
|
||||
+# define STRCHR_NAME strchr
|
||||
+#endif
|
||||
+
|
||||
+LEAF(STRCHR_NAME, 6)
|
||||
+ slli.d t1, a0, 3
|
||||
+ bstrins.d a0, zero, 2, 0
|
||||
+ lu12i.w a2, 0x01010
|
||||
+ ld.d t2, a0, 0
|
||||
+
|
||||
+ ori a2, a2, 0x101
|
||||
+ andi a1, a1, 0xff
|
||||
+ bstrins.d a2, a2, 63, 32
|
||||
+ li.w t0, -1
|
||||
+
|
||||
+ mul.d a1, a1, a2
|
||||
+ sll.d t0, t0, t1
|
||||
+ slli.d a3, a2, 7
|
||||
+ orn t2, t2, t0
|
||||
+
|
||||
+ sll.d t3, a1, t1
|
||||
+ xor t4, t2, t3
|
||||
+ sub.d a4, t2, a2
|
||||
+ sub.d a5, t4, a2
|
||||
+
|
||||
+
|
||||
+ andn a4, a4, t2
|
||||
+ andn a5, a5, t4
|
||||
+ or t0, a4, a5
|
||||
+ and t0, t0, a3
|
||||
+
|
||||
+ bnez t0, L(end)
|
||||
+ addi.d a0, a0, 8
|
||||
+L(loop):
|
||||
+ ld.d t4, a0, 0
|
||||
+ xor t2, t4, a1
|
||||
+
|
||||
+ sub.d a4, t4, a2
|
||||
+ sub.d a5, t2, a2
|
||||
+ andn a4, a4, t4
|
||||
+ andn a5, a5, t2
|
||||
+
|
||||
+ or t0, a4, a5
|
||||
+ and t0, t0, a3
|
||||
+ bnez t0, L(end)
|
||||
+ ld.d t4, a0, 8
|
||||
+
|
||||
+
|
||||
+ addi.d a0, a0, 16
|
||||
+ xor t2, t4, a1
|
||||
+ sub.d a4, t4, a2
|
||||
+ sub.d a5, t2, a2
|
||||
+
|
||||
+ andn a4, a4, t4
|
||||
+ andn a5, a5, t2
|
||||
+ or t0, a4, a5
|
||||
+ and t0, t0, a3
|
||||
+
|
||||
+ beqz t0, L(loop)
|
||||
+ addi.d a0, a0, -8
|
||||
+L(end):
|
||||
+ and t0, a5, a3
|
||||
+ and t1, a4, a3
|
||||
+
|
||||
+ ctz.d t0, t0
|
||||
+ ctz.d t1, t1
|
||||
+ srli.w t2, t0, 3
|
||||
+ sltu t3, t1, t0
|
||||
+
|
||||
+
|
||||
+ add.d a0, a0, t2
|
||||
+ masknez a0, a0, t3
|
||||
+ jr ra
|
||||
+END(STRCHR_NAME)
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S
|
||||
new file mode 100644
|
||||
index 00000000..254402da
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S
|
||||
@@ -0,0 +1,91 @@
|
||||
+/* Optimized strchr implementation using loongarch LASX SIMD instructions.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library. If not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdep.h>
|
||||
+#include <sys/regdef.h>
|
||||
+#include <sys/asm.h>
|
||||
+
|
||||
+#if IS_IN (libc) && !defined __loongarch_soft_float
|
||||
+
|
||||
+#ifndef AS_STRCHRNUL
|
||||
+# define STRCHR __strchr_lasx
|
||||
+#endif
|
||||
+
|
||||
+LEAF(STRCHR, 6)
|
||||
+ andi t1, a0, 0x1f
|
||||
+ bstrins.d a0, zero, 4, 0
|
||||
+ xvld xr0, a0, 0
|
||||
+ li.d t2, -1
|
||||
+
|
||||
+ xvreplgr2vr.b xr1, a1
|
||||
+ sll.d t1, t2, t1
|
||||
+ xvxor.v xr2, xr0, xr1
|
||||
+ xvmin.bu xr0, xr0, xr2
|
||||
+
|
||||
+ xvmsknz.b xr0, xr0
|
||||
+ xvpickve.w xr3, xr0, 4
|
||||
+ vilvl.h vr0, vr3, vr0
|
||||
+ movfr2gr.s t0, fa0
|
||||
+
|
||||
+ orn t0, t0, t1
|
||||
+ bne t0, t2, L(end)
|
||||
+ addi.d a0, a0, 32
|
||||
+ nop
|
||||
+
|
||||
+
|
||||
+L(loop):
|
||||
+ xvld xr0, a0, 0
|
||||
+ xvxor.v xr2, xr0, xr1
|
||||
+ xvmin.bu xr0, xr0, xr2
|
||||
+ xvsetanyeqz.b fcc0, xr0
|
||||
+
|
||||
+ bcnez fcc0, L(loop_end)
|
||||
+ xvld xr0, a0, 32
|
||||
+ addi.d a0, a0, 64
|
||||
+ xvxor.v xr2, xr0, xr1
|
||||
+
|
||||
+ xvmin.bu xr0, xr0, xr2
|
||||
+ xvsetanyeqz.b fcc0, xr0
|
||||
+ bceqz fcc0, L(loop)
|
||||
+ addi.d a0, a0, -32
|
||||
+
|
||||
+L(loop_end):
|
||||
+ xvmsknz.b xr0, xr0
|
||||
+ xvpickve.w xr1, xr0, 4
|
||||
+ vilvl.h vr0, vr1, vr0
|
||||
+ movfr2gr.s t0, fa0
|
||||
+
|
||||
+
|
||||
+L(end):
|
||||
+ cto.w t0, t0
|
||||
+ add.d a0, a0, t0
|
||||
+#ifndef AS_STRCHRNUL
|
||||
+ vreplgr2vr.b vr0, t0
|
||||
+ xvpermi.q xr3, xr2, 1
|
||||
+
|
||||
+ vshuf.b vr0, vr3, vr2, vr0
|
||||
+ vpickve2gr.bu t0, vr0, 0
|
||||
+ masknez a0, a0, t0
|
||||
+#endif
|
||||
+ jr ra
|
||||
+
|
||||
+END(STRCHR)
|
||||
+
|
||||
+libc_hidden_builtin_def(STRCHR)
|
||||
+#endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S
|
||||
new file mode 100644
|
||||
index 00000000..dae98b0a
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S
|
||||
@@ -0,0 +1,73 @@
|
||||
+/* Optimized strlen implementation using loongarch LSX SIMD instructions.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library. If not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdep.h>
|
||||
+#include <sys/regdef.h>
|
||||
+#include <sys/asm.h>
|
||||
+
|
||||
+#if IS_IN (libc) && !defined __loongarch_soft_float
|
||||
+
|
||||
+#ifndef AS_STRCHRNUL
|
||||
+# define STRCHR __strchr_lsx
|
||||
+#endif
|
||||
+
|
||||
+LEAF(STRCHR, 6)
|
||||
+ andi t1, a0, 0xf
|
||||
+ bstrins.d a0, zero, 3, 0
|
||||
+ vld vr0, a0, 0
|
||||
+ li.d t2, -1
|
||||
+
|
||||
+ vreplgr2vr.b vr1, a1
|
||||
+ sll.d t3, t2, t1
|
||||
+ vxor.v vr2, vr0, vr1
|
||||
+ vmin.bu vr0, vr0, vr2
|
||||
+
|
||||
+ vmsknz.b vr0, vr0
|
||||
+ movfr2gr.s t0, fa0
|
||||
+ ext.w.h t0, t0
|
||||
+ orn t0, t0, t3
|
||||
+
|
||||
+ beq t0, t2, L(loop)
|
||||
+L(found):
|
||||
+ cto.w t0, t0
|
||||
+ add.d a0, a0, t0
|
||||
+#ifndef AS_STRCHRNUL
|
||||
+ vreplve.b vr2, vr2, t0
|
||||
+ vpickve2gr.bu t1, vr2, 0
|
||||
+ masknez a0, a0, t1
|
||||
+#endif
|
||||
+ jr ra
|
||||
+
|
||||
+
|
||||
+L(loop):
|
||||
+ vld vr0, a0, 16
|
||||
+ addi.d a0, a0, 16
|
||||
+ vxor.v vr2, vr0, vr1
|
||||
+ vmin.bu vr0, vr0, vr2
|
||||
+
|
||||
+ vsetanyeqz.b fcc0, vr0
|
||||
+ bceqz fcc0, L(loop)
|
||||
+ vmsknz.b vr0, vr0
|
||||
+ movfr2gr.s t0, fa0
|
||||
+
|
||||
+ b L(found)
|
||||
+END(STRCHR)
|
||||
+
|
||||
+libc_hidden_builtin_def (STRCHR)
|
||||
+#endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strchr.c b/sysdeps/loongarch/lp64/multiarch/strchr.c
|
||||
new file mode 100644
|
||||
index 00000000..404e97bd
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strchr.c
|
||||
@@ -0,0 +1,36 @@
|
||||
+/* Multiple versions of strchr.
|
||||
+ All versions must be listed in ifunc-impl-list.c.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+/* Define multiple versions only for the definition in libc. */
|
||||
+#if IS_IN (libc)
|
||||
+# define strchr __redirect_strchr
|
||||
+# include <string.h>
|
||||
+# undef strchr
|
||||
+
|
||||
+# define SYMBOL_NAME strchr
|
||||
+# include "ifunc-strchr.h"
|
||||
+
|
||||
+libc_ifunc_redirected (__redirect_strchr, strchr, IFUNC_SELECTOR ());
|
||||
+weak_alias(strchr, index)
|
||||
+# ifdef SHARED
|
||||
+__hidden_ver1 (strchr, __GI_strchr, __redirect_strchr)
|
||||
+ __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strchr);
|
||||
+# endif
|
||||
+
|
||||
+#endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S b/sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S
|
||||
new file mode 100644
|
||||
index 00000000..1c01a023
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S
|
||||
@@ -0,0 +1,95 @@
|
||||
+/* Optimized strchrnul implementation using basic Loongarch instructions.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library. If not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdep.h>
|
||||
+#include <sys/regdef.h>
|
||||
+#include <sys/asm.h>
|
||||
+
|
||||
+#if IS_IN (libc)
|
||||
+# define STRCHRNUL_NAME __strchrnul_aligned
|
||||
+#else
|
||||
+# define STRCHRNUL_NAME __strchrnul
|
||||
+#endif
|
||||
+
|
||||
+LEAF(STRCHRNUL_NAME, 6)
|
||||
+ slli.d t1, a0, 3
|
||||
+ bstrins.d a0, zero, 2, 0
|
||||
+ lu12i.w a2, 0x01010
|
||||
+ ld.d t2, a0, 0
|
||||
+
|
||||
+ ori a2, a2, 0x101
|
||||
+ andi a1, a1, 0xff
|
||||
+ bstrins.d a2, a2, 63, 32
|
||||
+ li.w t0, -1
|
||||
+
|
||||
+ mul.d a1, a1, a2
|
||||
+ sll.d t0, t0, t1
|
||||
+ slli.d a3, a2, 7
|
||||
+ orn t2, t2, t0
|
||||
+
|
||||
+ sll.d t3, a1, t1
|
||||
+ xor t4, t2, t3
|
||||
+ sub.d a4, t2, a2
|
||||
+ sub.d a5, t4, a2
|
||||
+
|
||||
+
|
||||
+ andn a4, a4, t2
|
||||
+ andn a5, a5, t4
|
||||
+ or t0, a4, a5
|
||||
+ and t0, t0, a3
|
||||
+
|
||||
+ bnez t0, L(end)
|
||||
+ addi.d a0, a0, 8
|
||||
+L(loop):
|
||||
+ ld.d t4, a0, 0
|
||||
+ xor t2, t4, a1
|
||||
+
|
||||
+ sub.d a4, t4, a2
|
||||
+ sub.d a5, t2, a2
|
||||
+ andn a4, a4, t4
|
||||
+ andn a5, a5, t2
|
||||
+
|
||||
+ or t0, a4, a5
|
||||
+ and t0, t0, a3
|
||||
+ bnez t0, L(end)
|
||||
+ ld.d t4, a0, 8
|
||||
+
|
||||
+
|
||||
+ addi.d a0, a0, 16
|
||||
+ xor t2, t4, a1
|
||||
+ sub.d a4, t4, a2
|
||||
+ sub.d a5, t2, a2
|
||||
+
|
||||
+ andn a4, a4, t4
|
||||
+ andn a5, a5, t2
|
||||
+ or t0, a4, a5
|
||||
+ and t0, t0, a3
|
||||
+
|
||||
+ beqz t0, L(loop)
|
||||
+ addi.d a0, a0, -8
|
||||
+L(end):
|
||||
+ ctz.d t0, t0
|
||||
+ srli.w t0, t0, 3
|
||||
+
|
||||
+
|
||||
+ add.d a0, a0, t0
|
||||
+ jr ra
|
||||
+END(STRCHRNUL_NAME)
|
||||
+
|
||||
+libc_hidden_builtin_def (STRCHRNUL_NAME)
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strchrnul-lasx.S b/sysdeps/loongarch/lp64/multiarch/strchrnul-lasx.S
|
||||
new file mode 100644
|
||||
index 00000000..d45495e4
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strchrnul-lasx.S
|
||||
@@ -0,0 +1,22 @@
|
||||
+/* Optimized strchrnul implementation using loongarch LASX SIMD instructions.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library. If not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#define STRCHR __strchrnul_lasx
|
||||
+#define AS_STRCHRNUL
|
||||
+#include "strchr-lasx.S"
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strchrnul-lsx.S b/sysdeps/loongarch/lp64/multiarch/strchrnul-lsx.S
|
||||
new file mode 100644
|
||||
index 00000000..07d793ae
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strchrnul-lsx.S
|
||||
@@ -0,0 +1,22 @@
|
||||
+/* Optimized strchrnul implementation using loongarch LSX SIMD instructions.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library. If not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#define STRCHR __strchrnul_lsx
|
||||
+#define AS_STRCHRNUL
|
||||
+#include "strchr-lsx.S"
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strchrnul.c b/sysdeps/loongarch/lp64/multiarch/strchrnul.c
|
||||
new file mode 100644
|
||||
index 00000000..f3b8296e
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strchrnul.c
|
||||
@@ -0,0 +1,39 @@
|
||||
+/* Multiple versions of strchrnul.
|
||||
+ All versions must be listed in ifunc-impl-list.c.
|
||||
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <http://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+/* Define multiple versions only for the definition in libc. */
|
||||
+
|
||||
+#if IS_IN (libc)
|
||||
+# define strchrnul __redirect_strchrnul
|
||||
+# define __strchrnul __redirect___strchrnul
|
||||
+# include <string.h>
|
||||
+# undef __strchrnul
|
||||
+# undef strchrnul
|
||||
+
|
||||
+# define SYMBOL_NAME strchrnul
|
||||
+# include "ifunc-strchrnul.h"
|
||||
+
|
||||
+libc_ifunc_redirected (__redirect_strchrnul, __strchrnul,
|
||||
+ IFUNC_SELECTOR ());
|
||||
+weak_alias (__strchrnul, strchrnul)
|
||||
+# ifdef SHARED
|
||||
+__hidden_ver1 (__strchrnul, __GI___strchrnul, __redirect_strchrnul)
|
||||
+ __attribute__((visibility ("hidden"))) __attribute_copy__ (strchrnul);
|
||||
+# endif
|
||||
+#endif
|
||||
--
|
||||
2.33.0
|
||||
|
11
README.md
Normal file
11
README.md
Normal file
|
@ -0,0 +1,11 @@
|
|||
Anolis OS
|
||||
=======================================
|
||||
# 代码仓库说明
|
||||
## 分支说明
|
||||
>进行代码开发工作时,请注意选择当前版本对应的分支
|
||||
* aX分支为对应大版本的主分支,如a8分支对应当前最新版本
|
||||
* aX.Y分支为对应小版本的维护分支,如a8.2分支对应8.2版本
|
||||
## 开发流程
|
||||
1. 首先fork目标分支到自己的namespace
|
||||
2. 在自己的fork分支上做出修改
|
||||
3. 向对应的仓库中提交merge request,源分支为fork分支
|
478
Revert-LoongArch-Add-glibc.cpu.hwcap-support.patch
Normal file
478
Revert-LoongArch-Add-glibc.cpu.hwcap-support.patch
Normal file
|
@ -0,0 +1,478 @@
|
|||
From c0f3b0a8c71c26d5351e8ddabe3e8a323803e683 Mon Sep 17 00:00:00 2001
|
||||
From: caiyinyu <caiyinyu@loongson.cn>
|
||||
Date: Thu, 21 Sep 2023 09:10:11 +0800
|
||||
Subject: [PATCH 26/29] Revert "LoongArch: Add glibc.cpu.hwcap support."
|
||||
|
||||
This reverts commit a53451559dc9cce765ea5bcbb92c4007e058e92b.
|
||||
|
||||
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
|
||||
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
|
||||
---
|
||||
sysdeps/loongarch/Makefile | 4 -
|
||||
sysdeps/loongarch/Versions | 5 --
|
||||
sysdeps/loongarch/cpu-tunables.c | 89 -------------------
|
||||
sysdeps/loongarch/dl-get-cpu-features.c | 25 ------
|
||||
sysdeps/loongarch/dl-machine.h | 27 +-----
|
||||
sysdeps/loongarch/dl-tunables.list | 25 ------
|
||||
.../unix/sysv/linux/loongarch/cpu-features.c | 29 ------
|
||||
.../unix/sysv/linux/loongarch/cpu-features.h | 18 +---
|
||||
.../unix/sysv/linux/loongarch/dl-procinfo.c | 60 -------------
|
||||
sysdeps/unix/sysv/linux/loongarch/dl-sysdep.c | 21 -----
|
||||
.../unix/sysv/linux/loongarch/libc-start.c | 34 -------
|
||||
11 files changed, 8 insertions(+), 329 deletions(-)
|
||||
delete mode 100644 sysdeps/loongarch/Versions
|
||||
delete mode 100644 sysdeps/loongarch/cpu-tunables.c
|
||||
delete mode 100644 sysdeps/loongarch/dl-get-cpu-features.c
|
||||
delete mode 100644 sysdeps/loongarch/dl-tunables.list
|
||||
delete mode 100644 sysdeps/unix/sysv/linux/loongarch/cpu-features.c
|
||||
delete mode 100644 sysdeps/unix/sysv/linux/loongarch/dl-procinfo.c
|
||||
delete mode 100644 sysdeps/unix/sysv/linux/loongarch/dl-sysdep.c
|
||||
delete mode 100644 sysdeps/unix/sysv/linux/loongarch/libc-start.c
|
||||
|
||||
diff --git a/sysdeps/loongarch/Makefile b/sysdeps/loongarch/Makefile
|
||||
index 30a1f4a8..43d2f583 100644
|
||||
--- a/sysdeps/loongarch/Makefile
|
||||
+++ b/sysdeps/loongarch/Makefile
|
||||
@@ -6,10 +6,6 @@ ifeq ($(subdir),elf)
|
||||
gen-as-const-headers += dl-link.sym
|
||||
endif
|
||||
|
||||
-ifeq ($(subdir),elf)
|
||||
- sysdep-dl-routines += dl-get-cpu-features
|
||||
-endif
|
||||
-
|
||||
# LoongArch's assembler also needs to know about PIC as it changes the
|
||||
# definition of some assembler macros.
|
||||
ASFLAGS-.os += $(pic-ccflag)
|
||||
diff --git a/sysdeps/loongarch/Versions b/sysdeps/loongarch/Versions
|
||||
deleted file mode 100644
|
||||
index 33ae2cc0..00000000
|
||||
--- a/sysdeps/loongarch/Versions
|
||||
+++ /dev/null
|
||||
@@ -1,5 +0,0 @@
|
||||
-ld {
|
||||
- GLIBC_PRIVATE {
|
||||
- _dl_larch_get_cpu_features;
|
||||
- }
|
||||
-}
|
||||
diff --git a/sysdeps/loongarch/cpu-tunables.c b/sysdeps/loongarch/cpu-tunables.c
|
||||
deleted file mode 100644
|
||||
index 8e9fab93..00000000
|
||||
--- a/sysdeps/loongarch/cpu-tunables.c
|
||||
+++ /dev/null
|
||||
@@ -1,89 +0,0 @@
|
||||
-/* LoongArch CPU feature tuning.
|
||||
- This file is part of the GNU C Library.
|
||||
- Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
-
|
||||
- The GNU C Library is free software; you can redistribute it and/or
|
||||
- modify it under the terms of the GNU Lesser General Public
|
||||
- License as published by the Free Software Foundation; either
|
||||
- version 2.1 of the License, or (at your option) any later version.
|
||||
-
|
||||
- The GNU C Library is distributed in the hope that it will be useful,
|
||||
- but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
- Lesser General Public License for more details.
|
||||
-
|
||||
- You should have received a copy of the GNU Lesser General Public
|
||||
- License along with the GNU C Library; if not, see
|
||||
- <http://www.gnu.org/licenses/>. */
|
||||
-
|
||||
-# include <stdbool.h>
|
||||
-# include <stdint.h>
|
||||
-# include <unistd.h> /* Get STDOUT_FILENO for _dl_printf. */
|
||||
-# include <elf/dl-tunables.h>
|
||||
-# include <string.h>
|
||||
-# include <cpu-features.h>
|
||||
-# include <ldsodefs.h>
|
||||
-# include <sys/auxv.h>
|
||||
-
|
||||
-# define HWCAP_LOONGARCH_IFUNC \
|
||||
- (HWCAP_LOONGARCH_UAL | HWCAP_LOONGARCH_LSX | HWCAP_LOONGARCH_LASX)
|
||||
-
|
||||
-# define CHECK_GLIBC_IFUNC_CPU_OFF(f, name, len) \
|
||||
- _Static_assert (sizeof (#name) - 1 == len, #name " != " #len); \
|
||||
- if (!memcmp (f, #name, len) && \
|
||||
- (GLRO (dl_hwcap) & HWCAP_LOONGARCH_##name)) \
|
||||
- { \
|
||||
- hwcap |= (HWCAP_LOONGARCH_##name | (~HWCAP_LOONGARCH_IFUNC)); \
|
||||
- break; \
|
||||
- } \
|
||||
-
|
||||
-attribute_hidden
|
||||
-void
|
||||
-TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
|
||||
-{
|
||||
- const char *p = valp->strval;
|
||||
- size_t len;
|
||||
- unsigned long hwcap = 0;
|
||||
- const char *c;
|
||||
-
|
||||
- do {
|
||||
- for (c = p; *c != ','; c++)
|
||||
- if (*c == '\0')
|
||||
- break;
|
||||
-
|
||||
- len = c - p;
|
||||
-
|
||||
- switch(len)
|
||||
- {
|
||||
- default:
|
||||
- _dl_fatal_printf (
|
||||
- "The valid values of glibc.cpu.hwcaps is UAL, LASX, LSX!!\n"
|
||||
- );
|
||||
- break;
|
||||
- case 3:
|
||||
- {
|
||||
- CHECK_GLIBC_IFUNC_CPU_OFF (p, LSX, 3);
|
||||
- CHECK_GLIBC_IFUNC_CPU_OFF (p, UAL, 3);
|
||||
- _dl_fatal_printf (
|
||||
- "Some features are invalid or not supported on this machine!!\n"
|
||||
- "The valid values of glibc.cpu.hwcaps is UAL, LASX, LSX!!\n"
|
||||
- );
|
||||
- }
|
||||
- break;
|
||||
- case 4:
|
||||
- {
|
||||
- CHECK_GLIBC_IFUNC_CPU_OFF (p, LASX, 4);
|
||||
- _dl_fatal_printf (
|
||||
- "Some features are invalid or not supported on this machine!!\n"
|
||||
- "The valid values of glibc.cpu.hwcaps is UAL, LASX, LSX!!\n"
|
||||
- );
|
||||
- }
|
||||
- break;
|
||||
- }
|
||||
-
|
||||
- p += len + 1;
|
||||
- }
|
||||
- while (*c != '\0');
|
||||
-
|
||||
- GLRO (dl_larch_cpu_features).hwcap &= hwcap;
|
||||
-}
|
||||
diff --git a/sysdeps/loongarch/dl-get-cpu-features.c b/sysdeps/loongarch/dl-get-cpu-features.c
|
||||
deleted file mode 100644
|
||||
index 7cd9bc15..00000000
|
||||
--- a/sysdeps/loongarch/dl-get-cpu-features.c
|
||||
+++ /dev/null
|
||||
@@ -1,25 +0,0 @@
|
||||
-/* Define _dl_larch_get_cpu_features.
|
||||
- Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
-
|
||||
- The GNU C Library is free software; you can redistribute it and/or
|
||||
- modify it under the terms of the GNU Lesser General Public
|
||||
- License as published by the Free Software Foundation; either
|
||||
- version 2.1 of the License, or (at your option) any later version.
|
||||
-
|
||||
- The GNU C Library is distributed in the hope that it will be useful,
|
||||
- but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
- Lesser General Public License for more details.
|
||||
-
|
||||
- You should have received a copy of the GNU Lesser General Public
|
||||
- License along with the GNU C Library; if not, see
|
||||
- <https://www.gnu.org/licenses/>. */
|
||||
-
|
||||
-
|
||||
-#include <ldsodefs.h>
|
||||
-
|
||||
-const struct cpu_features *
|
||||
-_dl_larch_get_cpu_features (void)
|
||||
-{
|
||||
- return &GLRO(dl_larch_cpu_features);
|
||||
-}
|
||||
diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h
|
||||
index b395a928..57913cef 100644
|
||||
--- a/sysdeps/loongarch/dl-machine.h
|
||||
+++ b/sysdeps/loongarch/dl-machine.h
|
||||
@@ -29,8 +29,6 @@
|
||||
#include <dl-static-tls.h>
|
||||
#include <dl-machine-rel.h>
|
||||
|
||||
-#include <cpu-features.c>
|
||||
-
|
||||
#ifndef _RTLD_PROLOGUE
|
||||
# define _RTLD_PROLOGUE(entry) \
|
||||
".globl\t" __STRING (entry) "\n\t" \
|
||||
@@ -55,23 +53,6 @@
|
||||
#define ELF_MACHINE_NO_REL 1
|
||||
#define ELF_MACHINE_NO_RELA 0
|
||||
|
||||
-#define DL_PLATFORM_INIT dl_platform_init ()
|
||||
-
|
||||
-static inline void __attribute__ ((unused))
|
||||
-dl_platform_init (void)
|
||||
-{
|
||||
- if (GLRO(dl_platform) != NULL && *GLRO(dl_platform) == '\0')
|
||||
- /* Avoid an empty string which would disturb us. */
|
||||
- GLRO(dl_platform) = NULL;
|
||||
-
|
||||
-#ifdef SHARED
|
||||
- /* init_cpu_features has been called early from __libc_start_main in
|
||||
- static executable. */
|
||||
- init_cpu_features (&GLRO(dl_larch_cpu_features));
|
||||
-#endif
|
||||
-}
|
||||
-
|
||||
-
|
||||
/* Return nonzero iff ELF header is compatible with the running host. */
|
||||
static inline int
|
||||
elf_machine_matches_host (const ElfW (Ehdr) *ehdr)
|
||||
@@ -309,9 +290,9 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
|
||||
if (profile != 0)
|
||||
{
|
||||
#if !defined __loongarch_soft_float
|
||||
- if (RTLD_SUPPORT_LASX)
|
||||
+ if (SUPPORT_LASX)
|
||||
gotplt[0] = (ElfW(Addr)) &_dl_runtime_profile_lasx;
|
||||
- else if (RTLD_SUPPORT_LSX)
|
||||
+ else if (SUPPORT_LSX)
|
||||
gotplt[0] = (ElfW(Addr)) &_dl_runtime_profile_lsx;
|
||||
else
|
||||
#endif
|
||||
@@ -329,9 +310,9 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
|
||||
indicated by the offset on the stack, and then jump to
|
||||
the resolved address. */
|
||||
#if !defined __loongarch_soft_float
|
||||
- if (RTLD_SUPPORT_LASX)
|
||||
+ if (SUPPORT_LASX)
|
||||
gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve_lasx;
|
||||
- else if (RTLD_SUPPORT_LSX)
|
||||
+ else if (SUPPORT_LSX)
|
||||
gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve_lsx;
|
||||
else
|
||||
#endif
|
||||
diff --git a/sysdeps/loongarch/dl-tunables.list b/sysdeps/loongarch/dl-tunables.list
|
||||
deleted file mode 100644
|
||||
index 66b34275..00000000
|
||||
--- a/sysdeps/loongarch/dl-tunables.list
|
||||
+++ /dev/null
|
||||
@@ -1,25 +0,0 @@
|
||||
-# LoongArch specific tunables.
|
||||
-# Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
-# This file is part of the GNU C Library.
|
||||
-
|
||||
-# The GNU C Library is free software; you can redistribute it and/or
|
||||
-# modify it under the terms of the GNU Lesser General Public
|
||||
-# License as published by the Free Software Foundation; either
|
||||
-# version 2.1 of the License, or (at your option) any later version.
|
||||
-
|
||||
-# The GNU C Library is distributed in the hope that it will be useful,
|
||||
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
-# Lesser General Public License for more details.
|
||||
-
|
||||
-# You should have received a copy of the GNU Lesser General Public
|
||||
-# License along with the GNU C Library; if not, see
|
||||
-# <http://www.gnu.org/licenses/>.
|
||||
-
|
||||
-glibc {
|
||||
- cpu {
|
||||
- hwcaps {
|
||||
- type: STRING
|
||||
- }
|
||||
- }
|
||||
-}
|
||||
diff --git a/sysdeps/unix/sysv/linux/loongarch/cpu-features.c b/sysdeps/unix/sysv/linux/loongarch/cpu-features.c
|
||||
deleted file mode 100644
|
||||
index 1290c4ce..00000000
|
||||
--- a/sysdeps/unix/sysv/linux/loongarch/cpu-features.c
|
||||
+++ /dev/null
|
||||
@@ -1,29 +0,0 @@
|
||||
-/* Initialize CPU feature data. LoongArch64 version.
|
||||
- This file is part of the GNU C Library.
|
||||
- Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
-
|
||||
- The GNU C Library is free software; you can redistribute it and/or
|
||||
- modify it under the terms of the GNU Lesser General Public
|
||||
- License as published by the Free Software Foundation; either
|
||||
- version 2.1 of the License, or (at your option) any later version.
|
||||
-
|
||||
- The GNU C Library is distributed in the hope that it will be useful,
|
||||
- but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
- Lesser General Public License for more details.
|
||||
-
|
||||
- You should have received a copy of the GNU Lesser General Public
|
||||
- License along with the GNU C Library; if not, see
|
||||
- <http://www.gnu.org/licenses/>. */
|
||||
-
|
||||
-#include <cpu-features.h>
|
||||
-#include <elf/dl-hwcaps.h>
|
||||
-#include <elf/dl-tunables.h>
|
||||
-extern void TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *) attribute_hidden;
|
||||
-
|
||||
-static inline void
|
||||
-init_cpu_features (struct cpu_features *cpu_features)
|
||||
-{
|
||||
- GLRO (dl_larch_cpu_features).hwcap = GLRO (dl_hwcap);
|
||||
- TUNABLE_GET (glibc, cpu, hwcaps, tunable_val_t *, TUNABLE_CALLBACK (set_hwcaps));
|
||||
-}
|
||||
diff --git a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
|
||||
index 450963ce..d1a280a5 100644
|
||||
--- a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
|
||||
+++ b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
|
||||
@@ -19,23 +19,13 @@
|
||||
#ifndef _CPU_FEATURES_LOONGARCH64_H
|
||||
#define _CPU_FEATURES_LOONGARCH64_H
|
||||
|
||||
-#include <stdint.h>
|
||||
#include <sys/auxv.h>
|
||||
|
||||
-struct cpu_features
|
||||
- {
|
||||
- uint64_t hwcap;
|
||||
- };
|
||||
+#define SUPPORT_UAL (GLRO (dl_hwcap) & HWCAP_LOONGARCH_UAL)
|
||||
+#define SUPPORT_LSX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LSX)
|
||||
+#define SUPPORT_LASX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LASX)
|
||||
|
||||
-/* Get a pointer to the CPU features structure. */
|
||||
-extern const struct cpu_features *_dl_larch_get_cpu_features (void)
|
||||
- __attribute__ ((pure));
|
||||
-
|
||||
-#define SUPPORT_UAL (GLRO (dl_larch_cpu_features).hwcap & HWCAP_LOONGARCH_UAL)
|
||||
-#define SUPPORT_LSX (GLRO (dl_larch_cpu_features).hwcap & HWCAP_LOONGARCH_LSX)
|
||||
-#define SUPPORT_LASX (GLRO (dl_larch_cpu_features).hwcap & HWCAP_LOONGARCH_LASX)
|
||||
-#define RTLD_SUPPORT_LSX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LSX)
|
||||
-#define RTLD_SUPPORT_LASX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LASX)
|
||||
#define INIT_ARCH()
|
||||
|
||||
#endif /* _CPU_FEATURES_LOONGARCH64_H */
|
||||
+
|
||||
diff --git a/sysdeps/unix/sysv/linux/loongarch/dl-procinfo.c b/sysdeps/unix/sysv/linux/loongarch/dl-procinfo.c
|
||||
deleted file mode 100644
|
||||
index 6217fda9..00000000
|
||||
--- a/sysdeps/unix/sysv/linux/loongarch/dl-procinfo.c
|
||||
+++ /dev/null
|
||||
@@ -1,60 +0,0 @@
|
||||
-/* Data for LoongArch64 version of processor capability information.
|
||||
- Linux version.
|
||||
- Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
- This file is part of the GNU C Library.
|
||||
-
|
||||
- The GNU C Library is free software; you can redistribute it and/or
|
||||
- modify it under the terms of the GNU Lesser General Public
|
||||
- License as published by the Free Software Foundation; either
|
||||
- version 2.1 of the License, or (at your option) any later version.
|
||||
-
|
||||
- The GNU C Library is distributed in the hope that it will be useful,
|
||||
- but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
- Lesser General Public License for more details.
|
||||
-
|
||||
- You should have received a copy of the GNU Lesser General Public
|
||||
- License along with the GNU C Library; if not, see
|
||||
- <http://www.gnu.org/licenses/>. */
|
||||
-
|
||||
-/* If anything should be added here check whether the size of each string
|
||||
- is still ok with the given array size.
|
||||
-
|
||||
- All the #ifdefs in the definitions are quite irritating but
|
||||
- necessary if we want to avoid duplicating the information. There
|
||||
- are three different modes:
|
||||
-
|
||||
- - PROCINFO_DECL is defined. This means we are only interested in
|
||||
- declarations.
|
||||
-
|
||||
- - PROCINFO_DECL is not defined:
|
||||
-
|
||||
- + if SHARED is defined the file is included in an array
|
||||
- initializer. The .element = { ... } syntax is needed.
|
||||
-
|
||||
- + if SHARED is not defined a normal array initialization is
|
||||
- needed.
|
||||
- */
|
||||
-
|
||||
-#ifndef PROCINFO_CLASS
|
||||
-# define PROCINFO_CLASS
|
||||
-#endif
|
||||
-
|
||||
-#if !IS_IN (ldconfig)
|
||||
-# if !defined PROCINFO_DECL && defined SHARED
|
||||
- ._dl_larch_cpu_features
|
||||
-# else
|
||||
-PROCINFO_CLASS struct cpu_features _dl_larch_cpu_features
|
||||
-# endif
|
||||
-# ifndef PROCINFO_DECL
|
||||
-= { }
|
||||
-# endif
|
||||
-# if !defined SHARED || defined PROCINFO_DECL
|
||||
-;
|
||||
-# else
|
||||
-,
|
||||
-# endif
|
||||
-#endif
|
||||
-
|
||||
-#undef PROCINFO_DECL
|
||||
-#undef PROCINFO_CLASS
|
||||
diff --git a/sysdeps/unix/sysv/linux/loongarch/dl-sysdep.c b/sysdeps/unix/sysv/linux/loongarch/dl-sysdep.c
|
||||
deleted file mode 100644
|
||||
index 455fd71a..00000000
|
||||
--- a/sysdeps/unix/sysv/linux/loongarch/dl-sysdep.c
|
||||
+++ /dev/null
|
||||
@@ -1,21 +0,0 @@
|
||||
-/* Operating system support for run-time dynamic linker. LoongArch version.
|
||||
- Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
- This file is part of the GNU C Library.
|
||||
-
|
||||
- The GNU C Library is free software; you can redistribute it and/or
|
||||
- modify it under the terms of the GNU Lesser General Public
|
||||
- License as published by the Free Software Foundation; either
|
||||
- version 2.1 of the License, or (at your option) any later version.
|
||||
-
|
||||
- The GNU C Library is distributed in the hope that it will be useful,
|
||||
- but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
- Lesser General Public License for more details.
|
||||
-
|
||||
- You should have received a copy of the GNU Lesser General Public
|
||||
- License along with the GNU C Library; if not, see
|
||||
- <http://www.gnu.org/licenses/>. */
|
||||
-
|
||||
-#include <config.h>
|
||||
-#include <sysdeps/loongarch/cpu-tunables.c>
|
||||
-#include <sysdeps/unix/sysv/linux/dl-sysdep.c>
|
||||
diff --git a/sysdeps/unix/sysv/linux/loongarch/libc-start.c b/sysdeps/unix/sysv/linux/loongarch/libc-start.c
|
||||
deleted file mode 100644
|
||||
index f1346ece..00000000
|
||||
--- a/sysdeps/unix/sysv/linux/loongarch/libc-start.c
|
||||
+++ /dev/null
|
||||
@@ -1,34 +0,0 @@
|
||||
-/* Override csu/libc-start.c on LoongArch64.
|
||||
- Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
- This file is part of the GNU C Library.
|
||||
-
|
||||
- The GNU C Library is free software; you can redistribute it and/or
|
||||
- modify it under the terms of the GNU Lesser General Public
|
||||
- License as published by the Free Software Foundation; either
|
||||
- version 2.1 of the License, or (at your option) any later version.
|
||||
-
|
||||
- The GNU C Library is distributed in the hope that it will be useful,
|
||||
- but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
- Lesser General Public License for more details.
|
||||
-
|
||||
- You should have received a copy of the GNU Lesser General Public
|
||||
- License along with the GNU C Library; if not, see
|
||||
- <http://www.gnu.org/licenses/>. */
|
||||
-
|
||||
-#ifndef SHARED
|
||||
-
|
||||
-/* Mark symbols hidden in static PIE for early self relocation to work. */
|
||||
-# if BUILD_PIE_DEFAULT
|
||||
-# pragma GCC visibility push(hidden)
|
||||
-# endif
|
||||
-
|
||||
-# include <ldsodefs.h>
|
||||
-# include <cpu-features.c>
|
||||
-
|
||||
-extern struct cpu_features _dl_larch_cpu_features;
|
||||
-
|
||||
-# define ARCH_INIT_CPU_FEATURES() init_cpu_features (&_dl_larch_cpu_features)
|
||||
-
|
||||
-#endif
|
||||
-#include <csu/libc-start.c>
|
||||
--
|
||||
2.33.0
|
||||
|
496
SUPPORTED
496
SUPPORTED
|
@ -1,496 +0,0 @@
|
|||
# This file names the currently supported and somewhat tested locales.
|
||||
# If you have any additions please file a glibc bug report.
|
||||
SUPPORTED-LOCALES=\
|
||||
C.UTF-8/UTF-8 \
|
||||
aa_DJ.UTF-8/UTF-8 \
|
||||
aa_DJ/ISO-8859-1 \
|
||||
aa_ER/UTF-8 \
|
||||
aa_ER@saaho/UTF-8 \
|
||||
aa_ET/UTF-8 \
|
||||
af_ZA.UTF-8/UTF-8 \
|
||||
af_ZA/ISO-8859-1 \
|
||||
agr_PE/UTF-8 \
|
||||
ak_GH/UTF-8 \
|
||||
am_ET/UTF-8 \
|
||||
an_ES.UTF-8/UTF-8 \
|
||||
an_ES/ISO-8859-15 \
|
||||
anp_IN/UTF-8 \
|
||||
ar_AE.UTF-8/UTF-8 \
|
||||
ar_AE/ISO-8859-6 \
|
||||
ar_BH.UTF-8/UTF-8 \
|
||||
ar_BH/ISO-8859-6 \
|
||||
ar_DZ.UTF-8/UTF-8 \
|
||||
ar_DZ/ISO-8859-6 \
|
||||
ar_EG.UTF-8/UTF-8 \
|
||||
ar_EG/ISO-8859-6 \
|
||||
ar_IN/UTF-8 \
|
||||
ar_IQ.UTF-8/UTF-8 \
|
||||
ar_IQ/ISO-8859-6 \
|
||||
ar_JO.UTF-8/UTF-8 \
|
||||
ar_JO/ISO-8859-6 \
|
||||
ar_KW.UTF-8/UTF-8 \
|
||||
ar_KW/ISO-8859-6 \
|
||||
ar_LB.UTF-8/UTF-8 \
|
||||
ar_LB/ISO-8859-6 \
|
||||
ar_LY.UTF-8/UTF-8 \
|
||||
ar_LY/ISO-8859-6 \
|
||||
ar_MA.UTF-8/UTF-8 \
|
||||
ar_MA/ISO-8859-6 \
|
||||
ar_OM.UTF-8/UTF-8 \
|
||||
ar_OM/ISO-8859-6 \
|
||||
ar_QA.UTF-8/UTF-8 \
|
||||
ar_QA/ISO-8859-6 \
|
||||
ar_SA.UTF-8/UTF-8 \
|
||||
ar_SA/ISO-8859-6 \
|
||||
ar_SD.UTF-8/UTF-8 \
|
||||
ar_SD/ISO-8859-6 \
|
||||
ar_SS/UTF-8 \
|
||||
ar_SY.UTF-8/UTF-8 \
|
||||
ar_SY/ISO-8859-6 \
|
||||
ar_TN.UTF-8/UTF-8 \
|
||||
ar_TN/ISO-8859-6 \
|
||||
ar_YE.UTF-8/UTF-8 \
|
||||
ar_YE/ISO-8859-6 \
|
||||
ayc_PE/UTF-8 \
|
||||
az_AZ/UTF-8 \
|
||||
az_IR/UTF-8 \
|
||||
as_IN/UTF-8 \
|
||||
ast_ES.UTF-8/UTF-8 \
|
||||
ast_ES/ISO-8859-15 \
|
||||
be_BY.UTF-8/UTF-8 \
|
||||
be_BY/CP1251 \
|
||||
be_BY@latin/UTF-8 \
|
||||
bem_ZM/UTF-8 \
|
||||
ber_DZ/UTF-8 \
|
||||
ber_MA/UTF-8 \
|
||||
bg_BG.UTF-8/UTF-8 \
|
||||
bg_BG/CP1251 \
|
||||
bhb_IN.UTF-8/UTF-8 \
|
||||
bho_IN/UTF-8 \
|
||||
bho_NP/UTF-8 \
|
||||
bi_VU/UTF-8 \
|
||||
bn_BD/UTF-8 \
|
||||
bn_IN/UTF-8 \
|
||||
bo_CN/UTF-8 \
|
||||
bo_IN/UTF-8 \
|
||||
br_FR.UTF-8/UTF-8 \
|
||||
br_FR/ISO-8859-1 \
|
||||
br_FR@euro/ISO-8859-15 \
|
||||
brx_IN/UTF-8 \
|
||||
bs_BA.UTF-8/UTF-8 \
|
||||
bs_BA/ISO-8859-2 \
|
||||
byn_ER/UTF-8 \
|
||||
ca_AD.UTF-8/UTF-8 \
|
||||
ca_AD/ISO-8859-15 \
|
||||
ca_ES.UTF-8/UTF-8 \
|
||||
ca_ES/ISO-8859-1 \
|
||||
ca_ES@euro/ISO-8859-15 \
|
||||
ca_ES@valencia/UTF-8 \
|
||||
ca_FR.UTF-8/UTF-8 \
|
||||
ca_FR/ISO-8859-15 \
|
||||
ca_IT.UTF-8/UTF-8 \
|
||||
ca_IT/ISO-8859-15 \
|
||||
ce_RU/UTF-8 \
|
||||
chr_US/UTF-8 \
|
||||
cmn_TW/UTF-8 \
|
||||
crh_UA/UTF-8 \
|
||||
cs_CZ.UTF-8/UTF-8 \
|
||||
cs_CZ/ISO-8859-2 \
|
||||
csb_PL/UTF-8 \
|
||||
cv_RU/UTF-8 \
|
||||
cy_GB.UTF-8/UTF-8 \
|
||||
cy_GB/ISO-8859-14 \
|
||||
da_DK.UTF-8/UTF-8 \
|
||||
da_DK/ISO-8859-1 \
|
||||
da_DK.ISO-8859-15/ISO-8859-15 \
|
||||
de_AT.UTF-8/UTF-8 \
|
||||
de_AT/ISO-8859-1 \
|
||||
de_AT@euro/ISO-8859-15 \
|
||||
de_BE.UTF-8/UTF-8 \
|
||||
de_BE/ISO-8859-1 \
|
||||
de_BE@euro/ISO-8859-15 \
|
||||
de_CH.UTF-8/UTF-8 \
|
||||
de_CH/ISO-8859-1 \
|
||||
de_DE.UTF-8/UTF-8 \
|
||||
de_DE/ISO-8859-1 \
|
||||
de_DE@euro/ISO-8859-15 \
|
||||
de_IT.UTF-8/UTF-8 \
|
||||
de_IT/ISO-8859-1 \
|
||||
de_LI.UTF-8/UTF-8 \
|
||||
de_LU.UTF-8/UTF-8 \
|
||||
de_LU/ISO-8859-1 \
|
||||
de_LU@euro/ISO-8859-15 \
|
||||
doi_IN/UTF-8 \
|
||||
dsb_DE/UTF-8 \
|
||||
dv_MV/UTF-8 \
|
||||
dz_BT/UTF-8 \
|
||||
el_GR.UTF-8/UTF-8 \
|
||||
el_GR/ISO-8859-7 \
|
||||
el_GR@euro/ISO-8859-7 \
|
||||
el_CY.UTF-8/UTF-8 \
|
||||
el_CY/ISO-8859-7 \
|
||||
en_AG/UTF-8 \
|
||||
en_AU.UTF-8/UTF-8 \
|
||||
en_AU/ISO-8859-1 \
|
||||
en_BW.UTF-8/UTF-8 \
|
||||
en_BW/ISO-8859-1 \
|
||||
en_CA.UTF-8/UTF-8 \
|
||||
en_CA/ISO-8859-1 \
|
||||
en_DK.UTF-8/UTF-8 \
|
||||
en_DK/ISO-8859-1 \
|
||||
en_GB.UTF-8/UTF-8 \
|
||||
en_GB/ISO-8859-1 \
|
||||
en_GB.ISO-8859-15/ISO-8859-15 \
|
||||
en_HK.UTF-8/UTF-8 \
|
||||
en_HK/ISO-8859-1 \
|
||||
en_IE.UTF-8/UTF-8 \
|
||||
en_IE/ISO-8859-1 \
|
||||
en_IE@euro/ISO-8859-15 \
|
||||
en_IL/UTF-8 \
|
||||
en_IN/UTF-8 \
|
||||
en_NG/UTF-8 \
|
||||
en_NZ.UTF-8/UTF-8 \
|
||||
en_NZ/ISO-8859-1 \
|
||||
en_PH.UTF-8/UTF-8 \
|
||||
en_PH/ISO-8859-1 \
|
||||
en_SC.UTF-8/UTF-8 \
|
||||
en_SG.UTF-8/UTF-8 \
|
||||
en_SG/ISO-8859-1 \
|
||||
en_US.UTF-8/UTF-8 \
|
||||
en_US/ISO-8859-1 \
|
||||
en_US.ISO-8859-15/ISO-8859-15 \
|
||||
en_US@ampm/UTF-8 \
|
||||
en_US.UTF-8@ampm/UTF-8 \
|
||||
en_ZA.UTF-8/UTF-8 \
|
||||
en_ZA/ISO-8859-1 \
|
||||
en_ZM/UTF-8 \
|
||||
en_ZW.UTF-8/UTF-8 \
|
||||
en_ZW/ISO-8859-1 \
|
||||
eo/UTF-8 \
|
||||
es_AR.UTF-8/UTF-8 \
|
||||
es_AR/ISO-8859-1 \
|
||||
es_BO.UTF-8/UTF-8 \
|
||||
es_BO/ISO-8859-1 \
|
||||
es_CL.UTF-8/UTF-8 \
|
||||
es_CL/ISO-8859-1 \
|
||||
es_CO.UTF-8/UTF-8 \
|
||||
es_CO/ISO-8859-1 \
|
||||
es_CR.UTF-8/UTF-8 \
|
||||
es_CR/ISO-8859-1 \
|
||||
es_CU/UTF-8 \
|
||||
es_DO.UTF-8/UTF-8 \
|
||||
es_DO/ISO-8859-1 \
|
||||
es_EC.UTF-8/UTF-8 \
|
||||
es_EC/ISO-8859-1 \
|
||||
es_ES.UTF-8/UTF-8 \
|
||||
es_ES/ISO-8859-1 \
|
||||
es_ES@euro/ISO-8859-15 \
|
||||
es_GT.UTF-8/UTF-8 \
|
||||
es_GT/ISO-8859-1 \
|
||||
es_HN.UTF-8/UTF-8 \
|
||||
es_HN/ISO-8859-1 \
|
||||
es_MX.UTF-8/UTF-8 \
|
||||
es_MX/ISO-8859-1 \
|
||||
es_NI.UTF-8/UTF-8 \
|
||||
es_NI/ISO-8859-1 \
|
||||
es_PA.UTF-8/UTF-8 \
|
||||
es_PA/ISO-8859-1 \
|
||||
es_PE.UTF-8/UTF-8 \
|
||||
es_PE/ISO-8859-1 \
|
||||
es_PR.UTF-8/UTF-8 \
|
||||
es_PR/ISO-8859-1 \
|
||||
es_PY.UTF-8/UTF-8 \
|
||||
es_PY/ISO-8859-1 \
|
||||
es_SV.UTF-8/UTF-8 \
|
||||
es_SV/ISO-8859-1 \
|
||||
es_US.UTF-8/UTF-8 \
|
||||
es_US/ISO-8859-1 \
|
||||
es_UY.UTF-8/UTF-8 \
|
||||
es_UY/ISO-8859-1 \
|
||||
es_VE.UTF-8/UTF-8 \
|
||||
es_VE/ISO-8859-1 \
|
||||
et_EE.UTF-8/UTF-8 \
|
||||
et_EE/ISO-8859-1 \
|
||||
et_EE.ISO-8859-15/ISO-8859-15 \
|
||||
eu_ES.UTF-8/UTF-8 \
|
||||
eu_ES/ISO-8859-1 \
|
||||
eu_ES@euro/ISO-8859-15 \
|
||||
fa_IR/UTF-8 \
|
||||
ff_SN/UTF-8 \
|
||||
fi_FI.UTF-8/UTF-8 \
|
||||
fi_FI/ISO-8859-1 \
|
||||
fi_FI@euro/ISO-8859-15 \
|
||||
fil_PH/UTF-8 \
|
||||
fo_FO.UTF-8/UTF-8 \
|
||||
fo_FO/ISO-8859-1 \
|
||||
fr_BE.UTF-8/UTF-8 \
|
||||
fr_BE/ISO-8859-1 \
|
||||
fr_BE@euro/ISO-8859-15 \
|
||||
fr_CA.UTF-8/UTF-8 \
|
||||
fr_CA/ISO-8859-1 \
|
||||
fr_CH.UTF-8/UTF-8 \
|
||||
fr_CH/ISO-8859-1 \
|
||||
fr_FR.UTF-8/UTF-8 \
|
||||
fr_FR/ISO-8859-1 \
|
||||
fr_FR@euro/ISO-8859-15 \
|
||||
fr_LU.UTF-8/UTF-8 \
|
||||
fr_LU/ISO-8859-1 \
|
||||
fr_LU@euro/ISO-8859-15 \
|
||||
fur_IT/UTF-8 \
|
||||
fy_NL/UTF-8 \
|
||||
fy_DE/UTF-8 \
|
||||
ga_IE.UTF-8/UTF-8 \
|
||||
ga_IE/ISO-8859-1 \
|
||||
ga_IE@euro/ISO-8859-15 \
|
||||
gd_GB.UTF-8/UTF-8 \
|
||||
gd_GB/ISO-8859-15 \
|
||||
gez_ER/UTF-8 \
|
||||
gez_ER@abegede/UTF-8 \
|
||||
gez_ET/UTF-8 \
|
||||
gez_ET@abegede/UTF-8 \
|
||||
gl_ES.UTF-8/UTF-8 \
|
||||
gl_ES/ISO-8859-1 \
|
||||
gl_ES@euro/ISO-8859-15 \
|
||||
gu_IN/UTF-8 \
|
||||
gv_GB.UTF-8/UTF-8 \
|
||||
gv_GB/ISO-8859-1 \
|
||||
ha_NG/UTF-8 \
|
||||
hak_TW/UTF-8 \
|
||||
he_IL.UTF-8/UTF-8 \
|
||||
he_IL/ISO-8859-8 \
|
||||
hi_IN/UTF-8 \
|
||||
hif_FJ/UTF-8 \
|
||||
hne_IN/UTF-8 \
|
||||
hr_HR.UTF-8/UTF-8 \
|
||||
hr_HR/ISO-8859-2 \
|
||||
hsb_DE/ISO-8859-2 \
|
||||
hsb_DE.UTF-8/UTF-8 \
|
||||
ht_HT/UTF-8 \
|
||||
hu_HU.UTF-8/UTF-8 \
|
||||
hu_HU/ISO-8859-2 \
|
||||
hy_AM/UTF-8 \
|
||||
hy_AM.ARMSCII-8/ARMSCII-8 \
|
||||
ia_FR/UTF-8 \
|
||||
id_ID.UTF-8/UTF-8 \
|
||||
id_ID/ISO-8859-1 \
|
||||
ig_NG/UTF-8 \
|
||||
ik_CA/UTF-8 \
|
||||
is_IS.UTF-8/UTF-8 \
|
||||
is_IS/ISO-8859-1 \
|
||||
it_CH.UTF-8/UTF-8 \
|
||||
it_CH/ISO-8859-1 \
|
||||
it_IT.UTF-8/UTF-8 \
|
||||
it_IT/ISO-8859-1 \
|
||||
it_IT@euro/ISO-8859-15 \
|
||||
iu_CA/UTF-8 \
|
||||
ja_JP.EUC-JP/EUC-JP \
|
||||
ja_JP.UTF-8/UTF-8 \
|
||||
ka_GE.UTF-8/UTF-8 \
|
||||
ka_GE/GEORGIAN-PS \
|
||||
kab_DZ/UTF-8 \
|
||||
kk_KZ.UTF-8/UTF-8 \
|
||||
kk_KZ/PT154 \
|
||||
kl_GL.UTF-8/UTF-8 \
|
||||
kl_GL/ISO-8859-1 \
|
||||
km_KH/UTF-8 \
|
||||
kn_IN/UTF-8 \
|
||||
ko_KR.EUC-KR/EUC-KR \
|
||||
ko_KR.UTF-8/UTF-8 \
|
||||
kok_IN/UTF-8 \
|
||||
ks_IN/UTF-8 \
|
||||
ks_IN@devanagari/UTF-8 \
|
||||
ku_TR.UTF-8/UTF-8 \
|
||||
ku_TR/ISO-8859-9 \
|
||||
kw_GB.UTF-8/UTF-8 \
|
||||
kw_GB/ISO-8859-1 \
|
||||
ky_KG/UTF-8 \
|
||||
lb_LU/UTF-8 \
|
||||
lg_UG.UTF-8/UTF-8 \
|
||||
lg_UG/ISO-8859-10 \
|
||||
li_BE/UTF-8 \
|
||||
li_NL/UTF-8 \
|
||||
lij_IT/UTF-8 \
|
||||
ln_CD/UTF-8 \
|
||||
lo_LA/UTF-8 \
|
||||
lt_LT.UTF-8/UTF-8 \
|
||||
lt_LT/ISO-8859-13 \
|
||||
lv_LV.UTF-8/UTF-8 \
|
||||
lv_LV/ISO-8859-13 \
|
||||
lzh_TW/UTF-8 \
|
||||
mag_IN/UTF-8 \
|
||||
mai_IN/UTF-8 \
|
||||
mai_NP/UTF-8 \
|
||||
mfe_MU/UTF-8 \
|
||||
mg_MG.UTF-8/UTF-8 \
|
||||
mg_MG/ISO-8859-15 \
|
||||
mhr_RU/UTF-8 \
|
||||
mi_NZ.UTF-8/UTF-8 \
|
||||
mi_NZ/ISO-8859-13 \
|
||||
miq_NI/UTF-8 \
|
||||
mjw_IN/UTF-8 \
|
||||
mk_MK.UTF-8/UTF-8 \
|
||||
mk_MK/ISO-8859-5 \
|
||||
ml_IN/UTF-8 \
|
||||
mn_MN/UTF-8 \
|
||||
mni_IN/UTF-8 \
|
||||
mr_IN/UTF-8 \
|
||||
ms_MY.UTF-8/UTF-8 \
|
||||
ms_MY/ISO-8859-1 \
|
||||
mt_MT.UTF-8/UTF-8 \
|
||||
mt_MT/ISO-8859-3 \
|
||||
my_MM/UTF-8 \
|
||||
nan_TW/UTF-8 \
|
||||
nan_TW@latin/UTF-8 \
|
||||
nb_NO.UTF-8/UTF-8 \
|
||||
nb_NO/ISO-8859-1 \
|
||||
nds_DE/UTF-8 \
|
||||
nds_NL/UTF-8 \
|
||||
ne_NP/UTF-8 \
|
||||
nhn_MX/UTF-8 \
|
||||
niu_NU/UTF-8 \
|
||||
niu_NZ/UTF-8 \
|
||||
nl_AW/UTF-8 \
|
||||
nl_BE.UTF-8/UTF-8 \
|
||||
nl_BE/ISO-8859-1 \
|
||||
nl_BE@euro/ISO-8859-15 \
|
||||
nl_NL.UTF-8/UTF-8 \
|
||||
nl_NL/ISO-8859-1 \
|
||||
nl_NL@euro/ISO-8859-15 \
|
||||
nn_NO.UTF-8/UTF-8 \
|
||||
nn_NO/ISO-8859-1 \
|
||||
nr_ZA/UTF-8 \
|
||||
nso_ZA/UTF-8 \
|
||||
oc_FR.UTF-8/UTF-8 \
|
||||
oc_FR/ISO-8859-1 \
|
||||
om_ET/UTF-8 \
|
||||
om_KE.UTF-8/UTF-8 \
|
||||
om_KE/ISO-8859-1 \
|
||||
or_IN/UTF-8 \
|
||||
os_RU/UTF-8 \
|
||||
pa_IN/UTF-8 \
|
||||
pa_PK/UTF-8 \
|
||||
pap_AW/UTF-8 \
|
||||
pap_CW/UTF-8 \
|
||||
pl_PL.UTF-8/UTF-8 \
|
||||
pl_PL/ISO-8859-2 \
|
||||
ps_AF/UTF-8 \
|
||||
pt_BR.UTF-8/UTF-8 \
|
||||
pt_BR/ISO-8859-1 \
|
||||
pt_PT.UTF-8/UTF-8 \
|
||||
pt_PT/ISO-8859-1 \
|
||||
pt_PT@euro/ISO-8859-15 \
|
||||
quz_PE/UTF-8 \
|
||||
raj_IN/UTF-8 \
|
||||
ro_RO.UTF-8/UTF-8 \
|
||||
ro_RO/ISO-8859-2 \
|
||||
ru_RU.KOI8-R/KOI8-R \
|
||||
ru_RU.UTF-8/UTF-8 \
|
||||
ru_RU/ISO-8859-5 \
|
||||
ru_UA.UTF-8/UTF-8 \
|
||||
ru_UA/KOI8-U \
|
||||
rw_RW/UTF-8 \
|
||||
sa_IN/UTF-8 \
|
||||
sah_RU/UTF-8 \
|
||||
sat_IN/UTF-8 \
|
||||
sc_IT/UTF-8 \
|
||||
sd_IN/UTF-8 \
|
||||
sd_IN@devanagari/UTF-8 \
|
||||
se_NO/UTF-8 \
|
||||
sgs_LT/UTF-8 \
|
||||
shn_MM/UTF-8 \
|
||||
shs_CA/UTF-8 \
|
||||
si_LK/UTF-8 \
|
||||
sid_ET/UTF-8 \
|
||||
sk_SK.UTF-8/UTF-8 \
|
||||
sk_SK/ISO-8859-2 \
|
||||
sl_SI.UTF-8/UTF-8 \
|
||||
sl_SI/ISO-8859-2 \
|
||||
sm_WS/UTF-8 \
|
||||
so_DJ.UTF-8/UTF-8 \
|
||||
so_DJ/ISO-8859-1 \
|
||||
so_ET/UTF-8 \
|
||||
so_KE.UTF-8/UTF-8 \
|
||||
so_KE/ISO-8859-1 \
|
||||
so_SO.UTF-8/UTF-8 \
|
||||
so_SO/ISO-8859-1 \
|
||||
sq_AL.UTF-8/UTF-8 \
|
||||
sq_AL/ISO-8859-1 \
|
||||
sq_MK/UTF-8 \
|
||||
sr_ME/UTF-8 \
|
||||
sr_RS/UTF-8 \
|
||||
sr_RS@latin/UTF-8 \
|
||||
ss_ZA/UTF-8 \
|
||||
st_ZA.UTF-8/UTF-8 \
|
||||
st_ZA/ISO-8859-1 \
|
||||
sv_FI.UTF-8/UTF-8 \
|
||||
sv_FI/ISO-8859-1 \
|
||||
sv_FI@euro/ISO-8859-15 \
|
||||
sv_SE.UTF-8/UTF-8 \
|
||||
sv_SE/ISO-8859-1 \
|
||||
sv_SE.ISO-8859-15/ISO-8859-15 \
|
||||
sw_KE/UTF-8 \
|
||||
sw_TZ/UTF-8 \
|
||||
szl_PL/UTF-8 \
|
||||
ta_IN/UTF-8 \
|
||||
ta_LK/UTF-8 \
|
||||
tcy_IN.UTF-8/UTF-8 \
|
||||
te_IN/UTF-8 \
|
||||
tg_TJ.UTF-8/UTF-8 \
|
||||
tg_TJ/KOI8-T \
|
||||
th_TH.UTF-8/UTF-8 \
|
||||
th_TH/TIS-620 \
|
||||
the_NP/UTF-8 \
|
||||
ti_ER/UTF-8 \
|
||||
ti_ET/UTF-8 \
|
||||
tig_ER/UTF-8 \
|
||||
tk_TM/UTF-8 \
|
||||
tl_PH.UTF-8/UTF-8 \
|
||||
tl_PH/ISO-8859-1 \
|
||||
tn_ZA/UTF-8 \
|
||||
to_TO/UTF-8 \
|
||||
tpi_PG/UTF-8 \
|
||||
tr_CY.UTF-8/UTF-8 \
|
||||
tr_CY/ISO-8859-9 \
|
||||
tr_TR.UTF-8/UTF-8 \
|
||||
tr_TR/ISO-8859-9 \
|
||||
ts_ZA/UTF-8 \
|
||||
tt_RU/UTF-8 \
|
||||
tt_RU@iqtelif/UTF-8 \
|
||||
ug_CN/UTF-8 \
|
||||
uk_UA.UTF-8/UTF-8 \
|
||||
uk_UA/KOI8-U \
|
||||
unm_US/UTF-8 \
|
||||
ur_IN/UTF-8 \
|
||||
ur_PK/UTF-8 \
|
||||
uz_UZ.UTF-8/UTF-8 \
|
||||
uz_UZ/ISO-8859-1 \
|
||||
uz_UZ@cyrillic/UTF-8 \
|
||||
ve_ZA/UTF-8 \
|
||||
vi_VN/UTF-8 \
|
||||
wa_BE/ISO-8859-1 \
|
||||
wa_BE@euro/ISO-8859-15 \
|
||||
wa_BE.UTF-8/UTF-8 \
|
||||
wae_CH/UTF-8 \
|
||||
wal_ET/UTF-8 \
|
||||
wo_SN/UTF-8 \
|
||||
xh_ZA.UTF-8/UTF-8 \
|
||||
xh_ZA/ISO-8859-1 \
|
||||
yi_US.UTF-8/UTF-8 \
|
||||
yi_US/CP1255 \
|
||||
yo_NG/UTF-8 \
|
||||
yue_HK/UTF-8 \
|
||||
yuw_PG/UTF-8 \
|
||||
zh_CN.GB18030/GB18030 \
|
||||
zh_CN.GBK/GBK \
|
||||
zh_CN.UTF-8/UTF-8 \
|
||||
zh_CN/GB2312 \
|
||||
zh_HK.UTF-8/UTF-8 \
|
||||
zh_HK/BIG5-HKSCS \
|
||||
zh_SG.UTF-8/UTF-8 \
|
||||
zh_SG.GBK/GBK \
|
||||
zh_SG/GB2312 \
|
||||
zh_TW.EUC-TW/EUC-TW \
|
||||
zh_TW.UTF-8/UTF-8 \
|
||||
zh_TW/BIG5 \
|
||||
zu_ZA.UTF-8/UTF-8 \
|
||||
zu_ZA/ISO-8859-1 \
|
|
@ -1,862 +0,0 @@
|
|||
#define _GNU_SOURCE
|
||||
#include <assert.h>
|
||||
#include <dirent.h>
|
||||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
#include <locale.h>
|
||||
#include <stdarg.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <getopt.h>
|
||||
#include <string.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
#include "../locale/hashval.h"
|
||||
#define __LC_LAST 13
|
||||
#include "../locale/locarchive.h"
|
||||
#include "../crypt/md5.h"
|
||||
|
||||
const char *alias_file = DATADIR "/locale/locale.alias";
|
||||
const char *locar_file = PREFIX "/lib/locale/locale-archive";
|
||||
const char *tmpl_file = PREFIX "/lib/locale/locale-archive.tmpl";
|
||||
const char *loc_path = PREFIX "/lib/locale/";
|
||||
/* Flags set by `--verbose` option. */
|
||||
int be_quiet = 1;
|
||||
int verbose = 0;
|
||||
int max_locarchive_open_retry = 10;
|
||||
const char *output_prefix;
|
||||
|
||||
/* Endianness should have been taken care of by localedef. We don't need to do
|
||||
additional swapping. We need this variable exported however, since
|
||||
locarchive.c uses it to determine if it needs to swap endianness of a value
|
||||
before writing to or reading from the archive. */
|
||||
bool swap_endianness_p = false;
|
||||
|
||||
static const char *locnames[] =
|
||||
{
|
||||
#define DEFINE_CATEGORY(category, category_name, items, a) \
|
||||
[category] = category_name,
|
||||
#include "../locale/categories.def"
|
||||
#undef DEFINE_CATEGORY
|
||||
};
|
||||
|
||||
static int
|
||||
is_prime (unsigned long candidate)
|
||||
{
|
||||
/* No even number and none less than 10 will be passed here. */
|
||||
unsigned long int divn = 3;
|
||||
unsigned long int sq = divn * divn;
|
||||
|
||||
while (sq < candidate && candidate % divn != 0)
|
||||
{
|
||||
++divn;
|
||||
sq += 4 * divn;
|
||||
++divn;
|
||||
}
|
||||
|
||||
return candidate % divn != 0;
|
||||
}
|
||||
|
||||
unsigned long
|
||||
next_prime (unsigned long seed)
|
||||
{
|
||||
/* Make it definitely odd. */
|
||||
seed |= 1;
|
||||
|
||||
while (!is_prime (seed))
|
||||
seed += 2;
|
||||
|
||||
return seed;
|
||||
}
|
||||
|
||||
void
|
||||
error (int status, int errnum, const char *message, ...)
|
||||
{
|
||||
va_list args;
|
||||
|
||||
va_start (args, message);
|
||||
fflush (stdout);
|
||||
fprintf (stderr, "%s: ", program_invocation_name);
|
||||
vfprintf (stderr, message, args);
|
||||
va_end (args);
|
||||
if (errnum)
|
||||
fprintf (stderr, ": %s", strerror (errnum));
|
||||
putc ('\n', stderr);
|
||||
fflush (stderr);
|
||||
if (status)
|
||||
exit (errnum == EROFS ? 0 : status);
|
||||
}
|
||||
|
||||
void *
|
||||
xmalloc (size_t size)
|
||||
{
|
||||
void *p = malloc (size);
|
||||
if (p == NULL)
|
||||
error (EXIT_FAILURE, errno, "could not allocate %zd bytes of memory", size);
|
||||
return p;
|
||||
}
|
||||
|
||||
static void
|
||||
open_tmpl_archive (struct locarhandle *ah)
|
||||
{
|
||||
struct stat64 st;
|
||||
int fd;
|
||||
struct locarhead head;
|
||||
const char *archivefname = ah->fname == NULL ? tmpl_file : ah->fname;
|
||||
|
||||
/* Open the archive. We must have exclusive write access. */
|
||||
fd = open64 (archivefname, O_RDONLY);
|
||||
if (fd == -1)
|
||||
error (EXIT_FAILURE, errno, "cannot open locale archive template file \"%s\"",
|
||||
archivefname);
|
||||
|
||||
if (fstat64 (fd, &st) < 0)
|
||||
error (EXIT_FAILURE, errno, "cannot stat locale archive template file \"%s\"",
|
||||
archivefname);
|
||||
|
||||
/* Read the header. */
|
||||
if (TEMP_FAILURE_RETRY (read (fd, &head, sizeof (head))) != sizeof (head))
|
||||
error (EXIT_FAILURE, errno, "cannot read archive header");
|
||||
|
||||
ah->fd = fd;
|
||||
ah->mmaped = (head.sumhash_offset
|
||||
+ head.sumhash_size * sizeof (struct sumhashent));
|
||||
if (ah->mmaped > (unsigned long) st.st_size)
|
||||
error (EXIT_FAILURE, 0, "locale archive template file truncated");
|
||||
ah->mmaped = st.st_size;
|
||||
ah->reserved = st.st_size;
|
||||
|
||||
/* Now we know how large the administrative information part is.
|
||||
Map all of it. */
|
||||
ah->addr = mmap64 (NULL, ah->mmaped, PROT_READ, MAP_SHARED, fd, 0);
|
||||
if (ah->addr == MAP_FAILED)
|
||||
error (EXIT_FAILURE, errno, "cannot map archive header");
|
||||
}
|
||||
|
||||
/* Open the locale archive. */
|
||||
extern void open_archive (struct locarhandle *ah, bool readonly);
|
||||
|
||||
/* Close the locale archive. */
|
||||
extern void close_archive (struct locarhandle *ah);
|
||||
|
||||
/* Add given locale data to the archive. */
|
||||
extern int add_locale_to_archive (struct locarhandle *ah, const char *name,
|
||||
locale_data_t data, bool replace);
|
||||
|
||||
extern void add_alias (struct locarhandle *ah, const char *alias,
|
||||
bool replace, const char *oldname,
|
||||
uint32_t *locrec_offset_p);
|
||||
|
||||
extern struct namehashent *
|
||||
insert_name (struct locarhandle *ah,
|
||||
const char *name, size_t name_len, bool replace);
|
||||
|
||||
struct nameent
|
||||
{
|
||||
char *name;
|
||||
struct locrecent *locrec;
|
||||
};
|
||||
|
||||
struct dataent
|
||||
{
|
||||
const unsigned char *sum;
|
||||
uint32_t file_offset;
|
||||
};
|
||||
|
||||
static int
|
||||
nameentcmp (const void *a, const void *b)
|
||||
{
|
||||
struct locrecent *la = ((const struct nameent *) a)->locrec;
|
||||
struct locrecent *lb = ((const struct nameent *) b)->locrec;
|
||||
uint32_t start_a = -1, end_a = 0;
|
||||
uint32_t start_b = -1, end_b = 0;
|
||||
int cnt;
|
||||
|
||||
for (cnt = 0; cnt < __LC_LAST; ++cnt)
|
||||
if (cnt != LC_ALL)
|
||||
{
|
||||
if (la->record[cnt].offset < start_a)
|
||||
start_a = la->record[cnt].offset;
|
||||
if (la->record[cnt].offset + la->record[cnt].len > end_a)
|
||||
end_a = la->record[cnt].offset + la->record[cnt].len;
|
||||
}
|
||||
assert (start_a != (uint32_t)-1);
|
||||
assert (end_a != 0);
|
||||
|
||||
for (cnt = 0; cnt < __LC_LAST; ++cnt)
|
||||
if (cnt != LC_ALL)
|
||||
{
|
||||
if (lb->record[cnt].offset < start_b)
|
||||
start_b = lb->record[cnt].offset;
|
||||
if (lb->record[cnt].offset + lb->record[cnt].len > end_b)
|
||||
end_b = lb->record[cnt].offset + lb->record[cnt].len;
|
||||
}
|
||||
assert (start_b != (uint32_t)-1);
|
||||
assert (end_b != 0);
|
||||
|
||||
if (start_a != start_b)
|
||||
return (int)start_a - (int)start_b;
|
||||
return (int)end_a - (int)end_b;
|
||||
}
|
||||
|
||||
static int
|
||||
dataentcmp (const void *a, const void *b)
|
||||
{
|
||||
if (((const struct dataent *) a)->file_offset
|
||||
< ((const struct dataent *) b)->file_offset)
|
||||
return -1;
|
||||
|
||||
if (((const struct dataent *) a)->file_offset
|
||||
> ((const struct dataent *) b)->file_offset)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
sumsearchfn (const void *key, const void *ent)
|
||||
{
|
||||
uint32_t keyn = *(uint32_t *)key;
|
||||
uint32_t entn = ((struct dataent *)ent)->file_offset;
|
||||
|
||||
if (keyn < entn)
|
||||
return -1;
|
||||
if (keyn > entn)
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void
|
||||
compute_data (struct locarhandle *ah, struct nameent *name, size_t sumused,
|
||||
struct dataent *files, locale_data_t data)
|
||||
{
|
||||
int cnt;
|
||||
struct locrecent *locrec = name->locrec;
|
||||
struct dataent *file;
|
||||
data[LC_ALL].addr = ((char *) ah->addr) + locrec->record[LC_ALL].offset;
|
||||
data[LC_ALL].size = locrec->record[LC_ALL].len;
|
||||
for (cnt = 0; cnt < __LC_LAST; ++cnt)
|
||||
if (cnt != LC_ALL)
|
||||
{
|
||||
data[cnt].addr = ((char *) ah->addr) + locrec->record[cnt].offset;
|
||||
data[cnt].size = locrec->record[cnt].len;
|
||||
if (data[cnt].addr >= data[LC_ALL].addr
|
||||
&& data[cnt].addr + data[cnt].size
|
||||
<= data[LC_ALL].addr + data[LC_ALL].size)
|
||||
__md5_buffer (data[cnt].addr, data[cnt].size, data[cnt].sum);
|
||||
else
|
||||
{
|
||||
file = bsearch (&locrec->record[cnt].offset, files, sumused,
|
||||
sizeof (*files), sumsearchfn);
|
||||
if (file == NULL)
|
||||
error (EXIT_FAILURE, 0, "inconsistent template file");
|
||||
memcpy (data[cnt].sum, file->sum, sizeof (data[cnt].sum));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static int
|
||||
fill_archive (struct locarhandle *tmpl_ah,
|
||||
const char *fname,
|
||||
size_t install_langs_count, char *install_langs_list[],
|
||||
size_t nlist, char *list[],
|
||||
const char *primary)
|
||||
{
|
||||
struct locarhandle ah;
|
||||
struct locarhead *head;
|
||||
int result = 0;
|
||||
struct nameent *names;
|
||||
struct namehashent *namehashtab;
|
||||
size_t cnt, used;
|
||||
struct dataent *files;
|
||||
struct sumhashent *sumhashtab;
|
||||
size_t sumused;
|
||||
struct locrecent *primary_locrec = NULL;
|
||||
struct nameent *primary_nameent = NULL;
|
||||
|
||||
head = tmpl_ah->addr;
|
||||
names = (struct nameent *) malloc (head->namehash_used
|
||||
* sizeof (struct nameent));
|
||||
files = (struct dataent *) malloc (head->sumhash_used
|
||||
* sizeof (struct dataent));
|
||||
if (names == NULL || files == NULL)
|
||||
error (EXIT_FAILURE, errno, "could not allocate tables");
|
||||
|
||||
namehashtab = (struct namehashent *) ((char *) tmpl_ah->addr
|
||||
+ head->namehash_offset);
|
||||
sumhashtab = (struct sumhashent *) ((char *) tmpl_ah->addr
|
||||
+ head->sumhash_offset);
|
||||
|
||||
for (cnt = used = 0; cnt < head->namehash_size; ++cnt)
|
||||
if (namehashtab[cnt].locrec_offset != 0)
|
||||
{
|
||||
char * name;
|
||||
int i;
|
||||
assert (used < head->namehash_used);
|
||||
name = tmpl_ah->addr + namehashtab[cnt].name_offset;
|
||||
if (install_langs_count == 0)
|
||||
{
|
||||
/* Always intstall the entry. */
|
||||
names[used].name = name;
|
||||
names[used++].locrec
|
||||
= (struct locrecent *) ((char *) tmpl_ah->addr +
|
||||
namehashtab[cnt].locrec_offset);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Only install the entry if the user asked for it via
|
||||
--install-langs. */
|
||||
for (i = 0; i < install_langs_count; i++)
|
||||
{
|
||||
/* Add one for "_" and one for the null terminator. */
|
||||
size_t len = strlen (install_langs_list[i]) + 2;
|
||||
char *install_lang = (char *)xmalloc (len);
|
||||
strcpy (install_lang, install_langs_list[i]);
|
||||
if (strchr (install_lang, '_') == NULL)
|
||||
strcat (install_lang, "_");
|
||||
if (strncmp (name, install_lang, strlen (install_lang)) == 0)
|
||||
{
|
||||
names[used].name = name;
|
||||
names[used++].locrec
|
||||
= (struct locrecent *) ((char *)tmpl_ah->addr
|
||||
+ namehashtab[cnt].locrec_offset);
|
||||
}
|
||||
free (install_lang);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Sort the names. */
|
||||
qsort (names, used, sizeof (struct nameent), nameentcmp);
|
||||
|
||||
for (cnt = sumused = 0; cnt < head->sumhash_size; ++cnt)
|
||||
if (sumhashtab[cnt].file_offset != 0)
|
||||
{
|
||||
assert (sumused < head->sumhash_used);
|
||||
files[sumused].sum = (const unsigned char *) sumhashtab[cnt].sum;
|
||||
files[sumused++].file_offset = sumhashtab[cnt].file_offset;
|
||||
}
|
||||
|
||||
/* Sort by file locations. */
|
||||
qsort (files, sumused, sizeof (struct dataent), dataentcmp);
|
||||
|
||||
/* Open the archive. This call never returns if we cannot
|
||||
successfully open the archive. */
|
||||
ah.fname = NULL;
|
||||
if (fname != NULL)
|
||||
ah.fname = fname;
|
||||
open_archive (&ah, false);
|
||||
|
||||
if (primary != NULL)
|
||||
{
|
||||
for (cnt = 0; cnt < used; ++cnt)
|
||||
if (strcmp (names[cnt].name, primary) == 0)
|
||||
break;
|
||||
if (cnt < used)
|
||||
{
|
||||
locale_data_t data;
|
||||
|
||||
compute_data (tmpl_ah, &names[cnt], sumused, files, data);
|
||||
result |= add_locale_to_archive (&ah, primary, data, 0);
|
||||
primary_locrec = names[cnt].locrec;
|
||||
primary_nameent = &names[cnt];
|
||||
}
|
||||
}
|
||||
|
||||
for (cnt = 0; cnt < used; ++cnt)
|
||||
if (&names[cnt] == primary_nameent)
|
||||
continue;
|
||||
else if ((cnt > 0 && names[cnt - 1].locrec == names[cnt].locrec)
|
||||
|| names[cnt].locrec == primary_locrec)
|
||||
{
|
||||
const char *oldname;
|
||||
struct namehashent *namehashent;
|
||||
uint32_t locrec_offset;
|
||||
|
||||
if (names[cnt].locrec == primary_locrec)
|
||||
oldname = primary;
|
||||
else
|
||||
oldname = names[cnt - 1].name;
|
||||
namehashent = insert_name (&ah, oldname, strlen (oldname), true);
|
||||
assert (namehashent->name_offset != 0);
|
||||
assert (namehashent->locrec_offset != 0);
|
||||
locrec_offset = namehashent->locrec_offset;
|
||||
add_alias (&ah, names[cnt].name, 0, oldname, &locrec_offset);
|
||||
}
|
||||
else
|
||||
{
|
||||
locale_data_t data;
|
||||
|
||||
compute_data (tmpl_ah, &names[cnt], sumused, files, data);
|
||||
result |= add_locale_to_archive (&ah, names[cnt].name, data, 0);
|
||||
}
|
||||
|
||||
while (nlist-- > 0)
|
||||
{
|
||||
const char *fname = *list++;
|
||||
size_t fnamelen = strlen (fname);
|
||||
struct stat64 st;
|
||||
DIR *dirp;
|
||||
struct dirent64 *d;
|
||||
int seen;
|
||||
locale_data_t data;
|
||||
int cnt;
|
||||
|
||||
/* First see whether this really is a directory and whether it
|
||||
contains all the require locale category files. */
|
||||
if (stat64 (fname, &st) < 0)
|
||||
{
|
||||
error (0, 0, "stat of \"%s\" failed: %s: ignored", fname,
|
||||
strerror (errno));
|
||||
continue;
|
||||
}
|
||||
if (!S_ISDIR (st.st_mode))
|
||||
{
|
||||
error (0, 0, "\"%s\" is no directory; ignored", fname);
|
||||
continue;
|
||||
}
|
||||
|
||||
dirp = opendir (fname);
|
||||
if (dirp == NULL)
|
||||
{
|
||||
error (0, 0, "cannot open directory \"%s\": %s: ignored",
|
||||
fname, strerror (errno));
|
||||
continue;
|
||||
}
|
||||
|
||||
seen = 0;
|
||||
while ((d = readdir64 (dirp)) != NULL)
|
||||
{
|
||||
for (cnt = 0; cnt < __LC_LAST; ++cnt)
|
||||
if (cnt != LC_ALL)
|
||||
if (strcmp (d->d_name, locnames[cnt]) == 0)
|
||||
{
|
||||
unsigned char d_type;
|
||||
|
||||
/* We have an object of the required name. If it's
|
||||
a directory we have to look at a file with the
|
||||
prefix "SYS_". Otherwise we have found what we
|
||||
are looking for. */
|
||||
#ifdef _DIRENT_HAVE_D_TYPE
|
||||
d_type = d->d_type;
|
||||
|
||||
if (d_type != DT_REG)
|
||||
#endif
|
||||
{
|
||||
char fullname[fnamelen + 2 * strlen (d->d_name) + 7];
|
||||
|
||||
#ifdef _DIRENT_HAVE_D_TYPE
|
||||
if (d_type == DT_UNKNOWN || d_type == DT_LNK)
|
||||
#endif
|
||||
{
|
||||
strcpy (stpcpy (stpcpy (fullname, fname), "/"),
|
||||
d->d_name);
|
||||
|
||||
if (stat64 (fullname, &st) == -1)
|
||||
/* We cannot stat the file, ignore it. */
|
||||
break;
|
||||
|
||||
d_type = IFTODT (st.st_mode);
|
||||
}
|
||||
|
||||
if (d_type == DT_DIR)
|
||||
{
|
||||
/* We have to do more tests. The file is a
|
||||
directory and it therefore must contain a
|
||||
regular file with the same name except a
|
||||
"SYS_" prefix. */
|
||||
char *t = stpcpy (stpcpy (fullname, fname), "/");
|
||||
strcpy (stpcpy (stpcpy (t, d->d_name), "/SYS_"),
|
||||
d->d_name);
|
||||
|
||||
if (stat64 (fullname, &st) == -1)
|
||||
/* There is no SYS_* file or we cannot
|
||||
access it. */
|
||||
break;
|
||||
|
||||
d_type = IFTODT (st.st_mode);
|
||||
}
|
||||
}
|
||||
|
||||
/* If we found a regular file (eventually after
|
||||
following a symlink) we are successful. */
|
||||
if (d_type == DT_REG)
|
||||
++seen;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
closedir (dirp);
|
||||
|
||||
if (seen != __LC_LAST - 1)
|
||||
{
|
||||
/* We don't have all locale category files. Ignore the name. */
|
||||
error (0, 0, "incomplete set of locale files in \"%s\"",
|
||||
fname);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Add the files to the archive. To do this we first compute
|
||||
sizes and the MD5 sums of all the files. */
|
||||
for (cnt = 0; cnt < __LC_LAST; ++cnt)
|
||||
if (cnt != LC_ALL)
|
||||
{
|
||||
char fullname[fnamelen + 2 * strlen (locnames[cnt]) + 7];
|
||||
int fd;
|
||||
|
||||
strcpy (stpcpy (stpcpy (fullname, fname), "/"), locnames[cnt]);
|
||||
fd = open64 (fullname, O_RDONLY);
|
||||
if (fd == -1 || fstat64 (fd, &st) == -1)
|
||||
{
|
||||
/* Cannot read the file. */
|
||||
if (fd != -1)
|
||||
close (fd);
|
||||
break;
|
||||
}
|
||||
|
||||
if (S_ISDIR (st.st_mode))
|
||||
{
|
||||
char *t;
|
||||
close (fd);
|
||||
t = stpcpy (stpcpy (fullname, fname), "/");
|
||||
strcpy (stpcpy (stpcpy (t, locnames[cnt]), "/SYS_"),
|
||||
locnames[cnt]);
|
||||
|
||||
fd = open64 (fullname, O_RDONLY);
|
||||
if (fd == -1 || fstat64 (fd, &st) == -1
|
||||
|| !S_ISREG (st.st_mode))
|
||||
{
|
||||
if (fd != -1)
|
||||
close (fd);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* Map the file. */
|
||||
data[cnt].addr = mmap64 (NULL, st.st_size, PROT_READ, MAP_SHARED,
|
||||
fd, 0);
|
||||
if (data[cnt].addr == MAP_FAILED)
|
||||
{
|
||||
/* Cannot map it. */
|
||||
close (fd);
|
||||
break;
|
||||
}
|
||||
|
||||
data[cnt].size = st.st_size;
|
||||
__md5_buffer (data[cnt].addr, st.st_size, data[cnt].sum);
|
||||
|
||||
/* We don't need the file descriptor anymore. */
|
||||
close (fd);
|
||||
}
|
||||
|
||||
if (cnt != __LC_LAST)
|
||||
{
|
||||
while (cnt-- > 0)
|
||||
if (cnt != LC_ALL)
|
||||
munmap (data[cnt].addr, data[cnt].size);
|
||||
|
||||
error (0, 0, "cannot read all files in \"%s\": ignored", fname);
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
result |= add_locale_to_archive (&ah, basename (fname), data, 0);
|
||||
|
||||
for (cnt = 0; cnt < __LC_LAST; ++cnt)
|
||||
if (cnt != LC_ALL)
|
||||
munmap (data[cnt].addr, data[cnt].size);
|
||||
}
|
||||
|
||||
/* We are done. */
|
||||
close_archive (&ah);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
void usage()
|
||||
{
|
||||
printf ("\
|
||||
Usage: build-locale-archive [OPTION]... [TEMPLATE-FILE] [ARCHIVE-FILE]\n\
|
||||
Builds a locale archive from a template file.\n\
|
||||
Options:\n\
|
||||
-h, --help Print this usage message.\n\
|
||||
-v, --verbose Verbose execution.\n\
|
||||
-l, --install-langs=LIST Only include locales given in LIST into the \n\
|
||||
locale archive. LIST is a colon separated list\n\
|
||||
of locale prefixes, for example \"de:en:ja\".\n\
|
||||
The special argument \"all\" means to install\n\
|
||||
all languages and it must be present by itself.\n\
|
||||
If \"all\" is present with any other language it\n\
|
||||
will be treated as the name of a locale.\n\
|
||||
If the --install-langs option is missing, all\n\
|
||||
locales are installed. The colon separated list\n\
|
||||
can contain any strings matching the beginning of\n\
|
||||
locale names.\n\
|
||||
If a string does not contain a \"_\", it is added.\n\
|
||||
Examples:\n\
|
||||
--install-langs=\"en\"\n\
|
||||
installs en_US, en_US.iso88591,\n\
|
||||
en_US.iso885915, en_US.utf8,\n\
|
||||
en_GB ...\n\
|
||||
--install-langs=\"en_US.utf8\"\n\
|
||||
installs only en_US.utf8.\n\
|
||||
--install-langs=\"ko\"\n\
|
||||
installs ko_KR, ko_KR.euckr,\n\
|
||||
ko_KR.utf8 but *not* kok_IN\n\
|
||||
because \"ko\" does not contain\n\
|
||||
\"_\" and it is silently added\n\
|
||||
--install-langs\"ko:kok\"\n\
|
||||
installs ko_KR, ko_KR.euckr,\n\
|
||||
ko_KR.utf8, kok_IN, and\n\
|
||||
kok_IN.utf8.\n\
|
||||
--install-langs=\"POSIX\" will\n\
|
||||
installs *no* locales at all\n\
|
||||
because POSIX matches none of\n\
|
||||
the locales. Actually, any string\n\
|
||||
matching nothing will do that.\n\
|
||||
POSIX and C will always be\n\
|
||||
available because they are\n\
|
||||
builtin.\n\
|
||||
Aliases are installed as well,\n\
|
||||
i.e. --install-langs=\"de\"\n\
|
||||
will install not only every locale starting with\n\
|
||||
\"de\" but also the aliases \"deutsch\"\n\
|
||||
and and \"german\" although the latter does not\n\
|
||||
start with \"de\".\n\
|
||||
\n\
|
||||
If the arguments TEMPLATE-FILE and ARCHIVE-FILE are not given the locations\n\
|
||||
where the glibc used expects these files are used by default.\n\
|
||||
");
|
||||
}
|
||||
|
||||
int main (int argc, char *argv[])
|
||||
{
|
||||
char path[4096];
|
||||
DIR *dirp;
|
||||
struct dirent64 *d;
|
||||
struct stat64 st;
|
||||
char *list[16384], *primary;
|
||||
char *lang;
|
||||
int install_langs_count = 0;
|
||||
int i;
|
||||
char *install_langs_arg, *ila_start;
|
||||
char **install_langs_list = NULL;
|
||||
unsigned int cnt = 0;
|
||||
struct locarhandle tmpl_ah;
|
||||
char *new_locar_fname = NULL;
|
||||
size_t loc_path_len = strlen (loc_path);
|
||||
|
||||
while (1)
|
||||
{
|
||||
int c;
|
||||
|
||||
static struct option long_options[] =
|
||||
{
|
||||
{"help", no_argument, 0, 'h'},
|
||||
{"verbose", no_argument, 0, 'v'},
|
||||
{"install-langs", required_argument, 0, 'l'},
|
||||
{0, 0, 0, 0}
|
||||
};
|
||||
/* getopt_long stores the option index here. */
|
||||
int option_index = 0;
|
||||
|
||||
c = getopt_long (argc, argv, "vhl:",
|
||||
long_options, &option_index);
|
||||
|
||||
/* Detect the end of the options. */
|
||||
if (c == -1)
|
||||
break;
|
||||
|
||||
switch (c)
|
||||
{
|
||||
case 0:
|
||||
printf ("unknown option %s", long_options[option_index].name);
|
||||
if (optarg)
|
||||
printf (" with arg %s", optarg);
|
||||
printf ("\n");
|
||||
usage ();
|
||||
exit (1);
|
||||
|
||||
case 'v':
|
||||
verbose = 1;
|
||||
be_quiet = 0;
|
||||
break;
|
||||
|
||||
case 'h':
|
||||
usage ();
|
||||
exit (0);
|
||||
|
||||
case 'l':
|
||||
install_langs_arg = ila_start = strdup (optarg);
|
||||
/* If the argument to --install-lang is "all", do
|
||||
not limit the list of languages to install and install
|
||||
them all. We do not support installing a single locale
|
||||
called "all". */
|
||||
#define MAGIC_INSTALL_ALL "all"
|
||||
if (install_langs_arg != NULL
|
||||
&& install_langs_arg[0] != '\0'
|
||||
&& !(strncmp(install_langs_arg, MAGIC_INSTALL_ALL,
|
||||
strlen(MAGIC_INSTALL_ALL)) == 0
|
||||
&& strlen (install_langs_arg) == 3))
|
||||
{
|
||||
/* Count the number of languages we will install. */
|
||||
while (true)
|
||||
{
|
||||
lang = strtok(install_langs_arg, ":;,");
|
||||
if (lang == NULL)
|
||||
break;
|
||||
install_langs_count++;
|
||||
install_langs_arg = NULL;
|
||||
}
|
||||
free (ila_start);
|
||||
|
||||
/* Reject an entire string made up of delimiters. */
|
||||
if (install_langs_count == 0)
|
||||
break;
|
||||
|
||||
/* Copy the list. */
|
||||
install_langs_list = (char **)xmalloc (sizeof(char *) * install_langs_count);
|
||||
install_langs_arg = ila_start = strdup (optarg);
|
||||
install_langs_count = 0;
|
||||
while (true)
|
||||
{
|
||||
lang = strtok(install_langs_arg, ":;,");
|
||||
if (lang == NULL)
|
||||
break;
|
||||
install_langs_list[install_langs_count] = lang;
|
||||
install_langs_count++;
|
||||
install_langs_arg = NULL;
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case '?':
|
||||
/* getopt_long already printed an error message. */
|
||||
usage ();
|
||||
exit (0);
|
||||
|
||||
default:
|
||||
abort ();
|
||||
}
|
||||
}
|
||||
tmpl_ah.fname = NULL;
|
||||
if (optind < argc)
|
||||
tmpl_ah.fname = argv[optind];
|
||||
if (optind + 1 < argc)
|
||||
new_locar_fname = argv[optind + 1];
|
||||
if (verbose)
|
||||
{
|
||||
if (tmpl_ah.fname)
|
||||
printf("input archive file specified on command line: %s\n",
|
||||
tmpl_ah.fname);
|
||||
else
|
||||
printf("using default input archive file.\n");
|
||||
if (new_locar_fname)
|
||||
printf("output archive file specified on command line: %s\n",
|
||||
new_locar_fname);
|
||||
else
|
||||
printf("using default output archive file.\n");
|
||||
}
|
||||
|
||||
dirp = opendir (loc_path);
|
||||
if (dirp == NULL)
|
||||
error (EXIT_FAILURE, errno, "cannot open directory \"%s\"", loc_path);
|
||||
|
||||
open_tmpl_archive (&tmpl_ah);
|
||||
|
||||
if (new_locar_fname)
|
||||
unlink (new_locar_fname);
|
||||
else
|
||||
unlink (locar_file);
|
||||
primary = getenv ("LC_ALL");
|
||||
if (primary == NULL)
|
||||
primary = getenv ("LANG");
|
||||
if (primary != NULL)
|
||||
{
|
||||
if (strncmp (primary, "ja", 2) != 0
|
||||
&& strncmp (primary, "ko", 2) != 0
|
||||
&& strncmp (primary, "zh", 2) != 0)
|
||||
{
|
||||
char *ptr = malloc (strlen (primary) + strlen (".utf8") + 1), *p, *q;
|
||||
/* This leads to invalid locales sometimes:
|
||||
de_DE.iso885915@euro -> de_DE.utf8@euro */
|
||||
if (ptr != NULL)
|
||||
{
|
||||
p = ptr;
|
||||
q = primary;
|
||||
while (*q && *q != '.' && *q != '@')
|
||||
*p++ = *q++;
|
||||
if (*q == '.')
|
||||
while (*q && *q != '@')
|
||||
q++;
|
||||
p = stpcpy (p, ".utf8");
|
||||
strcpy (p, q);
|
||||
primary = ptr;
|
||||
}
|
||||
else
|
||||
primary = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
memcpy (path, loc_path, loc_path_len);
|
||||
|
||||
while ((d = readdir64 (dirp)) != NULL)
|
||||
{
|
||||
if (strcmp (d->d_name, ".") == 0 || strcmp (d->d_name, "..") == 0)
|
||||
continue;
|
||||
if (strchr (d->d_name, '_') == NULL)
|
||||
continue;
|
||||
|
||||
size_t d_name_len = strlen (d->d_name);
|
||||
if (loc_path_len + d_name_len + 1 > sizeof (path))
|
||||
{
|
||||
error (0, 0, "too long filename \"%s\"", d->d_name);
|
||||
continue;
|
||||
}
|
||||
|
||||
memcpy (path + loc_path_len, d->d_name, d_name_len + 1);
|
||||
if (stat64 (path, &st) < 0)
|
||||
{
|
||||
error (0, errno, "cannot stat \"%s\"", path);
|
||||
continue;
|
||||
}
|
||||
if (! S_ISDIR (st.st_mode))
|
||||
continue;
|
||||
if (cnt == 16384)
|
||||
{
|
||||
error (0, 0, "too many directories in \"%s\"", loc_path);
|
||||
break;
|
||||
}
|
||||
list[cnt] = strdup (path);
|
||||
if (list[cnt] == NULL)
|
||||
{
|
||||
error (0, errno, "cannot add file to list \"%s\"", path);
|
||||
continue;
|
||||
}
|
||||
if (primary != NULL && cnt > 0 && strcmp (primary, d->d_name) == 0)
|
||||
{
|
||||
char *p = list[0];
|
||||
list[0] = list[cnt];
|
||||
list[cnt] = p;
|
||||
}
|
||||
cnt++;
|
||||
}
|
||||
closedir (dirp);
|
||||
/* Store the archive to the file specified as the second argument on the
|
||||
command line or the default locale archive. */
|
||||
fill_archive (&tmpl_ah, new_locar_fname,
|
||||
install_langs_count, install_langs_list,
|
||||
cnt, list, primary);
|
||||
close_archive (&tmpl_ah);
|
||||
truncate (tmpl_file, 0);
|
||||
if (install_langs_count > 0)
|
||||
{
|
||||
free (ila_start);
|
||||
free (install_langs_list);
|
||||
}
|
||||
char *tz_argv[] = { "/usr/sbin/tzdata-update", NULL };
|
||||
execve (tz_argv[0], (char *const *)tz_argv, (char *const *)&tz_argv[1]);
|
||||
exit (0);
|
||||
}
|
1
dist
1
dist
|
@ -1 +0,0 @@
|
|||
an8_10
|
1
download
1
download
|
@ -1 +0,0 @@
|
|||
c81d2388896379997bc359d4f2084239 glibc-2.28.tar.xz
|
39
elf-Add-new-LoongArch-reloc-types-101-to-108-into-el.patch
Normal file
39
elf-Add-new-LoongArch-reloc-types-101-to-108-into-el.patch
Normal file
|
@ -0,0 +1,39 @@
|
|||
From fc60db3cf29ba157d09ba4f4b92e3ab382b0339d Mon Sep 17 00:00:00 2001
|
||||
From: Xi Ruoyao <xry111@xry111.site>
|
||||
Date: Wed, 9 Aug 2023 19:12:54 +0800
|
||||
Subject: [PATCH 04/29] elf: Add new LoongArch reloc types (101 to 108) into
|
||||
elf.h
|
||||
|
||||
These reloc types are generated by GNU assembler >= 2.41 for relaxation
|
||||
support.
|
||||
|
||||
Link: https://sourceware.org/git/?p=binutils-gdb.git;a=commitdiff;h=57a930e3
|
||||
Signed-off-by: Xi Ruoyao <xry111@xry111.site>
|
||||
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
|
||||
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
|
||||
---
|
||||
elf/elf.h | 8 ++++++++
|
||||
1 file changed, 8 insertions(+)
|
||||
|
||||
diff --git a/elf/elf.h b/elf/elf.h
|
||||
index 89fc8021..d623bdeb 100644
|
||||
--- a/elf/elf.h
|
||||
+++ b/elf/elf.h
|
||||
@@ -4205,6 +4205,14 @@ enum
|
||||
#define R_LARCH_TLS_GD_HI20 98
|
||||
#define R_LARCH_32_PCREL 99
|
||||
#define R_LARCH_RELAX 100
|
||||
+#define R_LARCH_DELETE 101
|
||||
+#define R_LARCH_ALIGN 102
|
||||
+#define R_LARCH_PCREL20_S2 103
|
||||
+#define R_LARCH_CFA 104
|
||||
+#define R_LARCH_ADD6 105
|
||||
+#define R_LARCH_SUB6 106
|
||||
+#define R_LARCH_ADD_ULEB128 107
|
||||
+#define R_LARCH_SUB_ULEB128 108
|
||||
|
||||
/* ARC specific declarations. */
|
||||
|
||||
--
|
||||
2.33.0
|
||||
|
File diff suppressed because it is too large
Load diff
|
@ -1,29 +0,0 @@
|
|||
From dc2d26d52c129c47fa1f16bd0157cd20c6d9a958 Mon Sep 17 00:00:00 2001
|
||||
From: caiyinyu <caiyinyu@loongson.cn>
|
||||
Date: Wed, 21 Jun 2023 11:55:02 +0800
|
||||
Subject: [PATCH 08/14] glibc-2.28: Add new struct user_fp_state in user.h
|
||||
|
||||
Change-Id: Idc233cc11c8f76b624dc2891b432f4d02a53cebc
|
||||
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
|
||||
---
|
||||
sysdeps/unix/sysv/linux/loongarch/sys/user.h | 6 ++++++
|
||||
1 file changed, 6 insertions(+)
|
||||
|
||||
diff --git a/sysdeps/unix/sysv/linux/loongarch/sys/user.h b/sysdeps/unix/sysv/linux/loongarch/sys/user.h
|
||||
index f9108350..21e340f6 100644
|
||||
--- a/sysdeps/unix/sysv/linux/loongarch/sys/user.h
|
||||
+++ b/sysdeps/unix/sysv/linux/loongarch/sys/user.h
|
||||
@@ -28,4 +28,10 @@ struct user_regs_struct
|
||||
uint64_t reserved[11];
|
||||
};
|
||||
|
||||
+struct user_fp_struct {
|
||||
+ uint64_t fpr[32];
|
||||
+ uint64_t fcc;
|
||||
+ uint32_t fcsr;
|
||||
+};
|
||||
+
|
||||
#endif /* _SYS_USER_H */
|
||||
--
|
||||
2.33.0
|
||||
|
|
@ -1,162 +0,0 @@
|
|||
From 647a0a28e5c9aed2f1fa59bbb7595133e7a4e62f Mon Sep 17 00:00:00 2001
|
||||
From: caiyinyu <caiyinyu@loongson.cn>
|
||||
Date: Mon, 24 Apr 2023 18:09:55 +0800
|
||||
Subject: [PATCH 03/14] glibc-2.28: Fix ifunc str/mem functions xfail problems.
|
||||
|
||||
Change-Id: Ibff4229fcfef23c0b19fb94b21a4d17b49eceec6
|
||||
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
|
||||
---
|
||||
.../lp64/multiarch/ifunc-impl-list.c | 76 +++++++++----------
|
||||
1 file changed, 38 insertions(+), 38 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
|
||||
index c2b6bbf7..fdeae797 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
|
||||
@@ -36,105 +36,105 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
size_t i = 0;
|
||||
|
||||
IFUNC_IMPL (i, name, memcpy,
|
||||
- IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_lasx)
|
||||
- IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_lsx)
|
||||
+ IFUNC_IMPL_ADD (array, i, memcpy, SUPPORT_LASX, __memcpy_lasx)
|
||||
+ IFUNC_IMPL_ADD (array, i, memcpy, SUPPORT_LSX, __memcpy_lsx)
|
||||
+ IFUNC_IMPL_ADD (array, i, memcpy, SUPPORT_UAL, __memcpy_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_aligned)
|
||||
- IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_unaligned)
|
||||
)
|
||||
|
||||
IFUNC_IMPL (i, name, memmove,
|
||||
- IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_lasx)
|
||||
- IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_lsx)
|
||||
+ IFUNC_IMPL_ADD (array, i, memmove, SUPPORT_LASX, __memmove_lasx)
|
||||
+ IFUNC_IMPL_ADD (array, i, memmove, SUPPORT_LSX, __memmove_lsx)
|
||||
+ IFUNC_IMPL_ADD (array, i, memmove, SUPPORT_UAL, __memmove_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_aligned)
|
||||
- IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_unaligned)
|
||||
)
|
||||
|
||||
IFUNC_IMPL (i, name, memset,
|
||||
- IFUNC_IMPL_ADD (array, i, memset, 1, __memset_lasx)
|
||||
- IFUNC_IMPL_ADD (array, i, memset, 1, __memset_lsx)
|
||||
+ IFUNC_IMPL_ADD (array, i, memset, SUPPORT_LASX, __memset_lasx)
|
||||
+ IFUNC_IMPL_ADD (array, i, memset, SUPPORT_LSX, __memset_lsx)
|
||||
+ IFUNC_IMPL_ADD (array, i, memset, SUPPORT_UAL, __memset_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_aligned)
|
||||
- IFUNC_IMPL_ADD (array, i, memset, 1, __memset_unaligned)
|
||||
)
|
||||
|
||||
IFUNC_IMPL (i, name, memchr,
|
||||
- IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_lasx)
|
||||
- IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_lsx)
|
||||
+ IFUNC_IMPL_ADD (array, i, memchr, SUPPORT_LASX, __memchr_lasx)
|
||||
+ IFUNC_IMPL_ADD (array, i, memchr, SUPPORT_LSX, __memchr_lsx)
|
||||
IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_aligned)
|
||||
)
|
||||
|
||||
IFUNC_IMPL (i, name, memrchr,
|
||||
- IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_lasx)
|
||||
- IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_lsx)
|
||||
+ IFUNC_IMPL_ADD (array, i, memrchr, SUPPORT_LASX, __memrchr_lasx)
|
||||
+ IFUNC_IMPL_ADD (array, i, memrchr, SUPPORT_LSX, __memrchr_lsx)
|
||||
IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_generic)
|
||||
)
|
||||
|
||||
IFUNC_IMPL (i, name, memcmp,
|
||||
- IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_lasx)
|
||||
- IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_lsx)
|
||||
+ IFUNC_IMPL_ADD (array, i, memcmp, SUPPORT_LASX, __memcmp_lasx)
|
||||
+ IFUNC_IMPL_ADD (array, i, memcmp, SUPPORT_LSX, __memcmp_lsx)
|
||||
IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_aligned)
|
||||
)
|
||||
|
||||
IFUNC_IMPL (i, name, rawmemchr,
|
||||
- IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_lasx)
|
||||
- IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_lsx)
|
||||
+ IFUNC_IMPL_ADD (array, i, rawmemchr, SUPPORT_LASX, __rawmemchr_lasx)
|
||||
+ IFUNC_IMPL_ADD (array, i, rawmemchr, SUPPORT_LSX, __rawmemchr_lsx)
|
||||
IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_aligned)
|
||||
)
|
||||
|
||||
IFUNC_IMPL (i, name, strchr,
|
||||
- IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_lasx)
|
||||
- IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_lsx)
|
||||
+ IFUNC_IMPL_ADD (array, i, strchr, SUPPORT_LASX, __strchr_lasx)
|
||||
+ IFUNC_IMPL_ADD (array, i, strchr, SUPPORT_LSX, __strchr_lsx)
|
||||
+ IFUNC_IMPL_ADD (array, i, strchr, SUPPORT_UAL, __strchr_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_aligned)
|
||||
- IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_unaligned)
|
||||
)
|
||||
|
||||
IFUNC_IMPL (i, name, strrchr,
|
||||
- IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_lasx)
|
||||
- IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_lsx)
|
||||
+ IFUNC_IMPL_ADD (array, i, strrchr, SUPPORT_LASX, __strrchr_lasx)
|
||||
+ IFUNC_IMPL_ADD (array, i, strrchr, SUPPORT_LSX, __strrchr_lsx)
|
||||
IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_aligned)
|
||||
)
|
||||
|
||||
IFUNC_IMPL (i, name, strlen,
|
||||
- IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_lasx)
|
||||
- IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_lsx)
|
||||
+ IFUNC_IMPL_ADD (array, i, strlen, SUPPORT_LASX, __strlen_lasx)
|
||||
+ IFUNC_IMPL_ADD (array, i, strlen, SUPPORT_LSX, __strlen_lsx)
|
||||
+ IFUNC_IMPL_ADD (array, i, strlen, SUPPORT_UAL, __strlen_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_aligned)
|
||||
- IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_unaligned)
|
||||
)
|
||||
|
||||
IFUNC_IMPL (i, name, strnlen,
|
||||
- IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_lasx)
|
||||
- IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_lsx)
|
||||
+ IFUNC_IMPL_ADD (array, i, strnlen, SUPPORT_LASX, __strnlen_lasx)
|
||||
+ IFUNC_IMPL_ADD (array, i, strnlen, SUPPORT_LSX, __strnlen_lsx)
|
||||
+ IFUNC_IMPL_ADD (array, i, strnlen, SUPPORT_UAL, __strnlen_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_aligned)
|
||||
- IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_unaligned)
|
||||
)
|
||||
|
||||
IFUNC_IMPL (i, name, strchrnul,
|
||||
- IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_lasx)
|
||||
- IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_lsx)
|
||||
+ IFUNC_IMPL_ADD (array, i, strchrnul, SUPPORT_LASX, __strchrnul_lasx)
|
||||
+ IFUNC_IMPL_ADD (array, i, strchrnul, SUPPORT_LSX, __strchrnul_lsx)
|
||||
+ IFUNC_IMPL_ADD (array, i, strchrnul, SUPPORT_UAL, __strchrnul_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_aligned)
|
||||
- IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_unaligned)
|
||||
)
|
||||
|
||||
IFUNC_IMPL (i, name, strncmp,
|
||||
- IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_lsx)
|
||||
+ IFUNC_IMPL_ADD (array, i, strncmp, SUPPORT_LSX, __strncmp_lsx)
|
||||
+ IFUNC_IMPL_ADD (array, i, strncmp, SUPPORT_UAL, __strncmp_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_aligned)
|
||||
- IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_unaligned)
|
||||
)
|
||||
|
||||
IFUNC_IMPL (i, name, strcpy,
|
||||
- IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_lsx)
|
||||
+ IFUNC_IMPL_ADD (array, i, strcpy, SUPPORT_LSX, __strcpy_lsx)
|
||||
+ IFUNC_IMPL_ADD (array, i, strcpy, SUPPORT_UAL, __strcpy_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_aligned)
|
||||
- IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_unaligned)
|
||||
)
|
||||
|
||||
IFUNC_IMPL (i, name, stpcpy,
|
||||
- IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_lsx)
|
||||
+ IFUNC_IMPL_ADD (array, i, stpcpy, SUPPORT_LSX, __stpcpy_lsx)
|
||||
IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_aligned)
|
||||
)
|
||||
|
||||
IFUNC_IMPL (i, name, strcmp,
|
||||
- IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_lsx)
|
||||
+ IFUNC_IMPL_ADD (array, i, strcmp, SUPPORT_LSX, __strcmp_lsx)
|
||||
+ IFUNC_IMPL_ADD (array, i, strcmp, SUPPORT_UAL, __strcmp_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_aligned)
|
||||
- IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_unaligned)
|
||||
)
|
||||
|
||||
return i;
|
||||
--
|
||||
2.33.0
|
||||
|
|
@ -1,57 +0,0 @@
|
|||
From 00537d6945e71af8c9b0b1e7c2695f6a9a1ef1f5 Mon Sep 17 00:00:00 2001
|
||||
From: caiyinyu <caiyinyu@loongson.cn>
|
||||
Date: Sun, 25 Jun 2023 16:23:25 +0800
|
||||
Subject: [PATCH 09/14] glibc-2.28: Redefine macro LEAF/ENTRY.
|
||||
|
||||
The following usage of macro LEAF/ENTRY are all feasible:
|
||||
1. LEAF(fcn) -- the align value of fcn is .align 3 (default value)
|
||||
2. LEAF(fcn, 6) -- the align value of fcn is .align 6
|
||||
|
||||
Change-Id: Ie3df4df8dba5259b665bd0e4702aaab0a09a5f65
|
||||
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
|
||||
---
|
||||
sysdeps/loongarch/sys/asm.h | 15 ++++++++++-----
|
||||
1 file changed, 10 insertions(+), 5 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/loongarch/sys/asm.h b/sysdeps/loongarch/sys/asm.h
|
||||
index 357a5ba3..734e45ae 100644
|
||||
--- a/sysdeps/loongarch/sys/asm.h
|
||||
+++ b/sysdeps/loongarch/sys/asm.h
|
||||
@@ -26,16 +26,21 @@
|
||||
#endif
|
||||
|
||||
|
||||
-/* Declare leaf routine. */
|
||||
-#define LEAF(symbol, aln) \
|
||||
+/* Declare leaf routine.
|
||||
+ The usage of macro LEAF/ENTRY is as follows:
|
||||
+ 1. LEAF(fcn) -- the align value of fcn is .align 3 (default value)
|
||||
+ 2. LEAF(fcn, 6) -- the align value of fcn is .align 6
|
||||
+*/
|
||||
+#define LEAF_IMPL(symbol, aln, ...) \
|
||||
.text; \
|
||||
.globl symbol; \
|
||||
.align aln; \
|
||||
.type symbol, @function; \
|
||||
symbol: \
|
||||
- cfi_startproc; \
|
||||
+ cfi_startproc;
|
||||
|
||||
-# define ENTRY(symbol, aln) LEAF(symbol, aln)
|
||||
+#define LEAF(...) LEAF_IMPL(__VA_ARGS__, 3)
|
||||
+#define ENTRY(...) LEAF(__VA_ARGS__)
|
||||
|
||||
#define LEAF_NO_ALIGN(symbol) \
|
||||
.text; \
|
||||
@@ -44,7 +49,7 @@ symbol: \
|
||||
symbol: \
|
||||
cfi_startproc;
|
||||
|
||||
-# define ENTRY_NO_ALIGN(symbol) LEAF_NO_ALIGN(symbol)
|
||||
+#define ENTRY_NO_ALIGN(symbol) LEAF_NO_ALIGN(symbol)
|
||||
|
||||
/* Mark end of function. */
|
||||
#undef END
|
||||
--
|
||||
2.33.0
|
||||
|
|
@ -1,306 +0,0 @@
|
|||
From 27a004c9777340afd86fc0d129f6ffad508bf090 Mon Sep 17 00:00:00 2001
|
||||
From: caiyinyu <caiyinyu@loongson.cn>
|
||||
Date: Tue, 11 Jul 2023 16:09:55 +0800
|
||||
Subject: [PATCH 12/14] glibc-2.28: Refactor code and fix bug in
|
||||
_dl_runtime_resolve.
|
||||
|
||||
Change-Id: I4907e6643ef25b87d7862e957ce9bf6d201da816
|
||||
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
|
||||
---
|
||||
sysdeps/loongarch/dl-machine.h | 8 +-
|
||||
sysdeps/loongarch/dl-trampoline.S | 7 ++
|
||||
sysdeps/loongarch/dl-trampoline.h | 159 +++++++++++++-----------------
|
||||
sysdeps/loongarch/sys/asm.h | 9 ++
|
||||
4 files changed, 90 insertions(+), 93 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h
|
||||
index 6e9c6258..ff520a07 100644
|
||||
--- a/sysdeps/loongarch/dl-machine.h
|
||||
+++ b/sysdeps/loongarch/dl-machine.h
|
||||
@@ -381,9 +381,13 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
|
||||
/* If using PLTs, fill in the first two entries of .got.plt. */
|
||||
if (l->l_info[DT_JMPREL])
|
||||
{
|
||||
- extern void _dl_runtime_resolve (void) __attribute__ ((visibility ("hidden")));
|
||||
+
|
||||
+#if HAVE_LOONGARCH_VEC_ASM && !defined __loongarch_soft_float
|
||||
extern void _dl_runtime_resolve_lasx (void) __attribute__ ((visibility ("hidden")));
|
||||
extern void _dl_runtime_resolve_lsx (void) __attribute__ ((visibility ("hidden")));
|
||||
+#endif
|
||||
+ extern void _dl_runtime_resolve (void) __attribute__ ((visibility ("hidden")));
|
||||
+
|
||||
ElfW(Addr) *gotplt = (ElfW(Addr) *) D_PTR (l, l_info[DT_PLTGOT]);
|
||||
/* If a library is prelinked but we have to relocate anyway,
|
||||
we have to be able to undo the prelinking of .got.plt.
|
||||
@@ -391,11 +395,13 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
|
||||
if (gotplt[1])
|
||||
l->l_mach.plt = gotplt[1] + l->l_addr;
|
||||
|
||||
+#if HAVE_LOONGARCH_VEC_ASM && !defined __loongarch_soft_float
|
||||
if (SUPPORT_LASX)
|
||||
gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve_lasx;
|
||||
else if (SUPPORT_LSX)
|
||||
gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve_lsx;
|
||||
else
|
||||
+#endif
|
||||
gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve;
|
||||
|
||||
gotplt[1] = (ElfW(Addr)) l;
|
||||
diff --git a/sysdeps/loongarch/dl-trampoline.S b/sysdeps/loongarch/dl-trampoline.S
|
||||
index 5f627a63..78d741f3 100644
|
||||
--- a/sysdeps/loongarch/dl-trampoline.S
|
||||
+++ b/sysdeps/loongarch/dl-trampoline.S
|
||||
@@ -16,16 +16,23 @@
|
||||
License along with the GNU C Library. If not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
+#include <sysdep.h>
|
||||
+#include <sys/asm.h>
|
||||
+
|
||||
+#if HAVE_LOONGARCH_VEC_ASM && !defined __loongarch_soft_float
|
||||
#define USE_LASX
|
||||
#define _dl_runtime_resolve _dl_runtime_resolve_lasx
|
||||
#include "dl-trampoline.h"
|
||||
+#undef FRAME_SIZE
|
||||
#undef USE_LASX
|
||||
#undef _dl_runtime_resolve
|
||||
|
||||
#define USE_LSX
|
||||
#define _dl_runtime_resolve _dl_runtime_resolve_lsx
|
||||
#include "dl-trampoline.h"
|
||||
+#undef FRAME_SIZE
|
||||
#undef USE_LSX
|
||||
#undef _dl_runtime_resolve
|
||||
+#endif
|
||||
|
||||
#include "dl-trampoline.h"
|
||||
diff --git a/sysdeps/loongarch/dl-trampoline.h b/sysdeps/loongarch/dl-trampoline.h
|
||||
index 96f41f1d..9a6d9b6c 100644
|
||||
--- a/sysdeps/loongarch/dl-trampoline.h
|
||||
+++ b/sysdeps/loongarch/dl-trampoline.h
|
||||
@@ -17,31 +17,24 @@
|
||||
License along with the GNU C Library. If not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
-#include <sysdep.h>
|
||||
-#include <sys/asm.h>
|
||||
-
|
||||
/* Assembler veneer called from the PLT header code for lazy loading.
|
||||
The PLT header passes its own args in t0-t2. */
|
||||
-
|
||||
-#ifdef __loongarch_soft_float
|
||||
-# define FRAME_SIZE (-((-10 * SZREG) & ALMASK))
|
||||
+#ifdef USE_LASX
|
||||
+# define FRAME_SIZE (-((-9 * SZREG - 8 * SZFREG - 8 * SZXREG) & ALMASK))
|
||||
+#elif defined USE_LSX
|
||||
+# define FRAME_SIZE (-((-9 * SZREG - 8 * SZFREG - 8 * SZVREG) & ALMASK))
|
||||
+#elif !defined __loongarch_soft_float
|
||||
+# define FRAME_SIZE (-((-9 * SZREG - 8 * SZFREG) & ALMASK))
|
||||
#else
|
||||
-# define FRAME_SIZE (-((-10 * SZREG - 8 * 256) & ALMASK))
|
||||
+# define FRAME_SIZE (-((-9 * SZREG) & ALMASK))
|
||||
#endif
|
||||
|
||||
ENTRY (_dl_runtime_resolve, 3)
|
||||
- # Save arguments to stack.
|
||||
-
|
||||
-#ifdef __loongarch64
|
||||
- li.d t3, -FRAME_SIZE
|
||||
- add.d sp, sp, t3
|
||||
-#elif defined __loongarch32
|
||||
- li.w t3, -FRAME_SIZE
|
||||
- add.w sp, sp, t3
|
||||
-#endif
|
||||
|
||||
+ /* Save arguments to stack. */
|
||||
+ ADDI sp, sp, -FRAME_SIZE
|
||||
|
||||
- REG_S ra, sp, 9*SZREG
|
||||
+ REG_S ra, sp, 0*SZREG
|
||||
REG_S a0, sp, 1*SZREG
|
||||
REG_S a1, sp, 2*SZREG
|
||||
REG_S a2, sp, 3*SZREG
|
||||
@@ -51,55 +44,45 @@ ENTRY (_dl_runtime_resolve, 3)
|
||||
REG_S a6, sp, 7*SZREG
|
||||
REG_S a7, sp, 8*SZREG
|
||||
|
||||
-#ifndef __loongarch_soft_float
|
||||
- FREG_S fa0, sp, 10*SZREG + 0*SZFREG
|
||||
- FREG_S fa1, sp, 10*SZREG + 1*SZFREG
|
||||
- FREG_S fa2, sp, 10*SZREG + 2*SZFREG
|
||||
- FREG_S fa3, sp, 10*SZREG + 3*SZFREG
|
||||
- FREG_S fa4, sp, 10*SZREG + 4*SZFREG
|
||||
- FREG_S fa5, sp, 10*SZREG + 5*SZFREG
|
||||
- FREG_S fa6, sp, 10*SZREG + 6*SZFREG
|
||||
- FREG_S fa7, sp, 10*SZREG + 7*SZFREG
|
||||
#ifdef USE_LASX
|
||||
- xvst xr0, sp, 10*SZREG + 0*256
|
||||
- xvst xr1, sp, 10*SZREG + 1*256
|
||||
- xvst xr2, sp, 10*SZREG + 2*256
|
||||
- xvst xr3, sp, 10*SZREG + 3*256
|
||||
- xvst xr4, sp, 10*SZREG + 4*256
|
||||
- xvst xr5, sp, 10*SZREG + 5*256
|
||||
- xvst xr6, sp, 10*SZREG + 6*256
|
||||
- xvst xr7, sp, 10*SZREG + 7*256
|
||||
+ xvst xr0, sp, 9*SZREG + 8*SZFREG + 0*SZXREG
|
||||
+ xvst xr1, sp, 9*SZREG + 8*SZFREG + 1*SZXREG
|
||||
+ xvst xr2, sp, 9*SZREG + 8*SZFREG + 2*SZXREG
|
||||
+ xvst xr3, sp, 9*SZREG + 8*SZFREG + 3*SZXREG
|
||||
+ xvst xr4, sp, 9*SZREG + 8*SZFREG + 4*SZXREG
|
||||
+ xvst xr5, sp, 9*SZREG + 8*SZFREG + 5*SZXREG
|
||||
+ xvst xr6, sp, 9*SZREG + 8*SZFREG + 6*SZXREG
|
||||
+ xvst xr7, sp, 9*SZREG + 8*SZFREG + 7*SZXREG
|
||||
#elif defined USE_LSX
|
||||
- vst vr0, sp, 10*SZREG + 0*128
|
||||
- vst vr1, sp, 10*SZREG + 1*128
|
||||
- vst vr2, sp, 10*SZREG + 2*128
|
||||
- vst vr3, sp, 10*SZREG + 3*128
|
||||
- vst vr4, sp, 10*SZREG + 4*128
|
||||
- vst vr5, sp, 10*SZREG + 5*128
|
||||
- vst vr6, sp, 10*SZREG + 6*128
|
||||
- vst vr7, sp, 10*SZREG + 7*128
|
||||
-#endif
|
||||
+ vst vr0, sp, 9*SZREG + 8*SZFREG + 0*SZVREG
|
||||
+ vst vr1, sp, 9*SZREG + 8*SZFREG + 1*SZVREG
|
||||
+ vst vr2, sp, 9*SZREG + 8*SZFREG + 2*SZVREG
|
||||
+ vst vr3, sp, 9*SZREG + 8*SZFREG + 3*SZVREG
|
||||
+ vst vr4, sp, 9*SZREG + 8*SZFREG + 4*SZVREG
|
||||
+ vst vr5, sp, 9*SZREG + 8*SZFREG + 5*SZVREG
|
||||
+ vst vr6, sp, 9*SZREG + 8*SZFREG + 6*SZVREG
|
||||
+ vst vr7, sp, 9*SZREG + 8*SZFREG + 7*SZVREG
|
||||
+#elif !defined __loongarch_soft_float
|
||||
+ FREG_S fa0, sp, 9*SZREG + 0*SZFREG
|
||||
+ FREG_S fa1, sp, 9*SZREG + 1*SZFREG
|
||||
+ FREG_S fa2, sp, 9*SZREG + 2*SZFREG
|
||||
+ FREG_S fa3, sp, 9*SZREG + 3*SZFREG
|
||||
+ FREG_S fa4, sp, 9*SZREG + 4*SZFREG
|
||||
+ FREG_S fa5, sp, 9*SZREG + 5*SZFREG
|
||||
+ FREG_S fa6, sp, 9*SZREG + 6*SZFREG
|
||||
+ FREG_S fa7, sp, 9*SZREG + 7*SZFREG
|
||||
#endif
|
||||
|
||||
- # Update .got.plt and obtain runtime address of callee.
|
||||
-#ifdef __loongarch64
|
||||
- slli.d a1, t1, 1
|
||||
+ /* Update .got.plt and obtain runtime address of callee */
|
||||
+ SLLI a1, t1, 1
|
||||
or a0, t0, zero
|
||||
- add.d a1, a1, t1
|
||||
+ ADD a1, a1, t1
|
||||
la a2, _dl_fixup
|
||||
jirl ra, a2, 0
|
||||
or t1, v0, zero
|
||||
-#elif defined __loongarch32
|
||||
- slli.w a1, t1, 1
|
||||
- or a0, t0, zero
|
||||
- add.w a1, a1, t1
|
||||
- la a2, _dl_fixup
|
||||
- jirl ra, a2, 0
|
||||
- or t1, v0, zero
|
||||
-#endif
|
||||
|
||||
- # Restore arguments from stack.
|
||||
- REG_L ra, sp, 9*SZREG
|
||||
+ /* Restore arguments from stack. */
|
||||
+ REG_L ra, sp, 0*SZREG
|
||||
REG_L a0, sp, 1*SZREG
|
||||
REG_L a1, sp, 2*SZREG
|
||||
REG_L a2, sp, 3*SZREG
|
||||
@@ -109,45 +92,37 @@ ENTRY (_dl_runtime_resolve, 3)
|
||||
REG_L a6, sp, 7*SZREG
|
||||
REG_L a7, sp, 8*SZREG
|
||||
|
||||
-#ifndef __loongarch_soft_float
|
||||
- FREG_L fa0, sp, 10*SZREG + 0*SZFREG
|
||||
- FREG_L fa1, sp, 10*SZREG + 1*SZFREG
|
||||
- FREG_L fa2, sp, 10*SZREG + 2*SZFREG
|
||||
- FREG_L fa3, sp, 10*SZREG + 3*SZFREG
|
||||
- FREG_L fa4, sp, 10*SZREG + 4*SZFREG
|
||||
- FREG_L fa5, sp, 10*SZREG + 5*SZFREG
|
||||
- FREG_L fa6, sp, 10*SZREG + 6*SZFREG
|
||||
- FREG_L fa7, sp, 10*SZREG + 7*SZFREG
|
||||
#ifdef USE_LASX
|
||||
- xvld xr0, sp, 10*SZREG + 0*256
|
||||
- xvld xr1, sp, 10*SZREG + 1*256
|
||||
- xvld xr2, sp, 10*SZREG + 2*256
|
||||
- xvld xr3, sp, 10*SZREG + 3*256
|
||||
- xvld xr4, sp, 10*SZREG + 4*256
|
||||
- xvld xr5, sp, 10*SZREG + 5*256
|
||||
- xvld xr6, sp, 10*SZREG + 6*256
|
||||
- xvld xr7, sp, 10*SZREG + 7*256
|
||||
+ xvld xr0, sp, 9*SZREG + 8*SZFREG + 0*SZXREG
|
||||
+ xvld xr1, sp, 9*SZREG + 8*SZFREG + 1*SZXREG
|
||||
+ xvld xr2, sp, 9*SZREG + 8*SZFREG + 2*SZXREG
|
||||
+ xvld xr3, sp, 9*SZREG + 8*SZFREG + 3*SZXREG
|
||||
+ xvld xr4, sp, 9*SZREG + 8*SZFREG + 4*SZXREG
|
||||
+ xvld xr5, sp, 9*SZREG + 8*SZFREG + 5*SZXREG
|
||||
+ xvld xr6, sp, 9*SZREG + 8*SZFREG + 6*SZXREG
|
||||
+ xvld xr7, sp, 9*SZREG + 8*SZFREG + 7*SZXREG
|
||||
#elif defined USE_LSX
|
||||
- vld vr0, sp, 10*SZREG + 0*128
|
||||
- vld vr1, sp, 10*SZREG + 1*128
|
||||
- vld vr2, sp, 10*SZREG + 2*128
|
||||
- vld vr3, sp, 10*SZREG + 3*128
|
||||
- vld vr4, sp, 10*SZREG + 4*128
|
||||
- vld vr5, sp, 10*SZREG + 5*128
|
||||
- vld vr6, sp, 10*SZREG + 6*128
|
||||
- vld vr7, sp, 10*SZREG + 7*128
|
||||
-#endif
|
||||
-#endif
|
||||
-
|
||||
-#ifdef __loongarch64
|
||||
- li.d t3, FRAME_SIZE
|
||||
- add.d sp, sp, t3
|
||||
-#elif defined __loongarch32
|
||||
- li.w t3, FRAME_SIZE
|
||||
- addi.w sp, sp, FRAME_SIZE
|
||||
+ vld vr0, sp, 9*SZREG + 8*SZFREG + 0*SZVREG
|
||||
+ vld vr1, sp, 9*SZREG + 8*SZFREG + 1*SZVREG
|
||||
+ vld vr2, sp, 9*SZREG + 8*SZFREG + 2*SZVREG
|
||||
+ vld vr3, sp, 9*SZREG + 8*SZFREG + 3*SZVREG
|
||||
+ vld vr4, sp, 9*SZREG + 8*SZFREG + 4*SZVREG
|
||||
+ vld vr5, sp, 9*SZREG + 8*SZFREG + 5*SZVREG
|
||||
+ vld vr6, sp, 9*SZREG + 8*SZFREG + 6*SZVREG
|
||||
+ vld vr7, sp, 9*SZREG + 8*SZFREG + 7*SZVREG
|
||||
+#elif !defined __loongarch_soft_float
|
||||
+ FREG_L fa0, sp, 9*SZREG + 0*SZFREG
|
||||
+ FREG_L fa1, sp, 9*SZREG + 1*SZFREG
|
||||
+ FREG_L fa2, sp, 9*SZREG + 2*SZFREG
|
||||
+ FREG_L fa3, sp, 9*SZREG + 3*SZFREG
|
||||
+ FREG_L fa4, sp, 9*SZREG + 4*SZFREG
|
||||
+ FREG_L fa5, sp, 9*SZREG + 5*SZFREG
|
||||
+ FREG_L fa6, sp, 9*SZREG + 6*SZFREG
|
||||
+ FREG_L fa7, sp, 9*SZREG + 7*SZFREG
|
||||
#endif
|
||||
|
||||
+ ADDI sp, sp, FRAME_SIZE
|
||||
|
||||
- # Invoke the callee.
|
||||
+ /* Invoke the callee. */
|
||||
jirl zero, t1, 0
|
||||
END (_dl_runtime_resolve)
|
||||
diff --git a/sysdeps/loongarch/sys/asm.h b/sysdeps/loongarch/sys/asm.h
|
||||
index 734e45ae..e80c6245 100644
|
||||
--- a/sysdeps/loongarch/sys/asm.h
|
||||
+++ b/sysdeps/loongarch/sys/asm.h
|
||||
@@ -9,8 +9,17 @@
|
||||
# define PTRLOG 3
|
||||
# define SZREG 8
|
||||
# define SZFREG 8
|
||||
+# define SZVREG 16
|
||||
+# define SZXREG 32
|
||||
# define REG_L ld.d
|
||||
# define REG_S st.d
|
||||
+# define SRLI srli.d
|
||||
+# define SLLI slli.d
|
||||
+# define ADDI addi.d
|
||||
+# define ADD add.d
|
||||
+# define SUB sub.d
|
||||
+# define BSTRINS bstrins.d
|
||||
+# define LI li.d
|
||||
# define FREG_L fld.d
|
||||
# define FREG_S fst.d
|
||||
#elif defined __loongarch32
|
||||
--
|
||||
2.33.0
|
||||
|
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
|
@ -1,292 +0,0 @@
|
|||
From e2dd1f13592fa3b99b70eb54cc61e9f98cdcb123 Mon Sep 17 00:00:00 2001
|
||||
From: caiyinyu <caiyinyu@loongson.cn>
|
||||
Date: Mon, 17 Apr 2023 17:20:04 +0800
|
||||
Subject: [PATCH 01/14] glibc-2.28: Remove unseless ANDROID_CHANGES and related
|
||||
code.
|
||||
|
||||
Change-Id: Ib08e92d435126c7b56096ff6f24f1c6b5ea57f46
|
||||
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
|
||||
---
|
||||
sysdeps/loongarch/lp64/memchr.S | 6 ------
|
||||
sysdeps/loongarch/lp64/memcpy.S | 13 -------------
|
||||
sysdeps/loongarch/lp64/memset.S | 6 ------
|
||||
sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S | 6 ------
|
||||
.../loongarch/lp64/multiarch/memmove-unaligned.S | 6 ------
|
||||
sysdeps/loongarch/lp64/multiarch/memset-unaligned.S | 7 -------
|
||||
sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S | 2 --
|
||||
.../loongarch/lp64/multiarch/strchrnul-unaligned.S | 2 --
|
||||
sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S | 2 --
|
||||
sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S | 2 --
|
||||
.../loongarch/lp64/multiarch/strncmp-unaligned.S | 2 --
|
||||
.../loongarch/lp64/multiarch/strnlen-unaligned.S | 2 --
|
||||
12 files changed, 56 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/loongarch/lp64/memchr.S b/sysdeps/loongarch/lp64/memchr.S
|
||||
index ec34b1af..75c4e15c 100644
|
||||
--- a/sysdeps/loongarch/lp64/memchr.S
|
||||
+++ b/sysdeps/loongarch/lp64/memchr.S
|
||||
@@ -11,11 +11,7 @@
|
||||
#define MEMCHR_NAME memchr
|
||||
#endif
|
||||
|
||||
-#ifdef ANDROID_CHANGES
|
||||
-LEAF(MEMCHR_NAME, 0)
|
||||
-#else
|
||||
LEAF(MEMCHR_NAME)
|
||||
-#endif
|
||||
.align 6
|
||||
beqz a2, L(out)
|
||||
andi t1, a0, 0x7
|
||||
@@ -92,8 +88,6 @@ L(out):
|
||||
jr ra
|
||||
END(MEMCHR_NAME)
|
||||
|
||||
-#ifndef ANDROID_CHANGES
|
||||
#ifdef _LIBC
|
||||
libc_hidden_builtin_def (MEMCHR_NAME)
|
||||
#endif
|
||||
-#endif
|
||||
diff --git a/sysdeps/loongarch/lp64/memcpy.S b/sysdeps/loongarch/lp64/memcpy.S
|
||||
index 1076e678..b6ca60a1 100644
|
||||
--- a/sysdeps/loongarch/lp64/memcpy.S
|
||||
+++ b/sysdeps/loongarch/lp64/memcpy.S
|
||||
@@ -35,29 +35,18 @@
|
||||
st.d t6, reg, n+48; \
|
||||
st.d t7, reg, n+56;
|
||||
|
||||
-#ifdef ANDROID_CHANGES
|
||||
-LEAF(MEMMOVE_NAME, 0)
|
||||
-#else
|
||||
LEAF(MEMMOVE_NAME)
|
||||
-#endif
|
||||
-
|
||||
.align 6
|
||||
sub.d t0, a0, a1
|
||||
bltu t0, a2, L(copy_back)
|
||||
|
||||
END(MEMMOVE_NAME)
|
||||
|
||||
-#ifndef ANDROID_CHANGES
|
||||
#ifdef _LIBC
|
||||
libc_hidden_builtin_def (MEMMOVE_NAME)
|
||||
#endif
|
||||
-#endif
|
||||
|
||||
-#ifdef ANDROID_CHANGES
|
||||
-LEAF(MEMCPY_NAME, 0)
|
||||
-#else
|
||||
LEAF(MEMCPY_NAME)
|
||||
-#endif
|
||||
|
||||
srai.d a3, a2, 4
|
||||
beqz a3, L(short_data) # less than 16 bytes
|
||||
@@ -811,8 +800,6 @@ L(back_end):
|
||||
|
||||
END(MEMCPY_NAME)
|
||||
|
||||
-#ifndef ANDROID_CHANGES
|
||||
#ifdef _LIBC
|
||||
libc_hidden_builtin_def (MEMCPY_NAME)
|
||||
#endif
|
||||
-#endif
|
||||
diff --git a/sysdeps/loongarch/lp64/memset.S b/sysdeps/loongarch/lp64/memset.S
|
||||
index 9fe42b24..41629e7e 100644
|
||||
--- a/sysdeps/loongarch/lp64/memset.S
|
||||
+++ b/sysdeps/loongarch/lp64/memset.S
|
||||
@@ -21,11 +21,7 @@
|
||||
st.d a1, a0, n+48; \
|
||||
st.d a1, a0, n+56;
|
||||
|
||||
-#ifdef ANDROID_CHANGES
|
||||
-LEAF(MEMSET_NAME, 0)
|
||||
-#else
|
||||
LEAF(MEMSET_NAME)
|
||||
-#endif
|
||||
.align 6
|
||||
move t0, a0
|
||||
andi a3, a0, 0x7
|
||||
@@ -166,8 +162,6 @@ L(short_0):
|
||||
|
||||
END(MEMSET_NAME)
|
||||
|
||||
-#ifndef ANDROID_CHANGES
|
||||
#ifdef _LIBC
|
||||
libc_hidden_builtin_def (MEMSET_NAME)
|
||||
#endif
|
||||
-#endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S
|
||||
index 5e38df0d..64b60244 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S
|
||||
@@ -31,11 +31,7 @@
|
||||
st.d t6, reg, n+48; \
|
||||
st.d t7, reg, n+56;
|
||||
|
||||
-#ifdef ANDROID_CHANGES
|
||||
-LEAF(MEMCPY_NAME, 0)
|
||||
-#else
|
||||
LEAF(MEMCPY_NAME)
|
||||
-#endif
|
||||
|
||||
//1st var: dst ptr: void *a1 $r4 a0
|
||||
//2nd var: src ptr: void *a2 $r5 a1
|
||||
@@ -250,10 +246,8 @@ end_0_8_unalign:
|
||||
|
||||
END(MEMCPY_NAME)
|
||||
|
||||
-#ifndef ANDROID_CHANGES
|
||||
#ifdef _LIBC
|
||||
libc_hidden_builtin_def (MEMCPY_NAME)
|
||||
#endif
|
||||
-#endif
|
||||
|
||||
#endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S
|
||||
index 27ed0c9c..42920a1a 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S
|
||||
@@ -100,11 +100,7 @@
|
||||
LD_64(a4, -1024); \
|
||||
ST_64(a3, -1024);
|
||||
|
||||
-#ifdef ANDROID_CHANGES
|
||||
-LEAF(MEMMOVE_NAME, 0)
|
||||
-#else
|
||||
LEAF(MEMMOVE_NAME)
|
||||
-#endif
|
||||
|
||||
//1st var: dest ptr: void *str1 $r4 a0
|
||||
//2nd var: src ptr: void *str2 $r5 a1
|
||||
@@ -469,10 +465,8 @@ end_unalign_proc_back:
|
||||
|
||||
END(MEMMOVE_NAME)
|
||||
|
||||
-#ifndef ANDROID_CHANGES
|
||||
#ifdef _LIBC
|
||||
libc_hidden_builtin_def (MEMMOVE_NAME)
|
||||
#endif
|
||||
-#endif
|
||||
|
||||
#endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S
|
||||
index 16ff2ef7..54e51546 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S
|
||||
@@ -33,12 +33,7 @@
|
||||
//2nd var: int val $5 a1
|
||||
//3rd var: size_t num $6 a2
|
||||
|
||||
-#ifdef ANDROID_CHANGES
|
||||
-LEAF(MEMSET_NAME, 0)
|
||||
-#else
|
||||
LEAF(MEMSET_NAME)
|
||||
-#endif
|
||||
-
|
||||
.align 6
|
||||
bstrins.d a1, a1, 15, 8
|
||||
add.d t7, a0, a2
|
||||
@@ -168,10 +163,8 @@ end_0_8_unalign:
|
||||
|
||||
END(MEMSET_NAME)
|
||||
|
||||
-#ifndef ANDROID_CHANGES
|
||||
#ifdef _LIBC
|
||||
libc_hidden_builtin_def (MEMSET_NAME)
|
||||
#endif
|
||||
-#endif
|
||||
|
||||
#endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S
|
||||
index 1d5e56c5..de6c7f4f 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S
|
||||
@@ -123,10 +123,8 @@ L(_mc8_a):
|
||||
jr ra
|
||||
END(STRCHR_NAME)
|
||||
|
||||
-#ifndef ANDROID_CHANGES
|
||||
#ifdef _LIBC
|
||||
libc_hidden_builtin_def (STRCHR_NAME)
|
||||
#endif
|
||||
-#endif
|
||||
|
||||
#endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S
|
||||
index 6338d005..abc246ca 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S
|
||||
@@ -136,11 +136,9 @@ L(_mc8_a):
|
||||
jr ra
|
||||
END(STRCHRNUL_NAME)
|
||||
|
||||
-#ifndef ANDROID_CHANGES
|
||||
#ifdef _LIBC
|
||||
weak_alias(STRCHRNUL_NAME, strchrnul)
|
||||
libc_hidden_builtin_def (STRCHRNUL_NAME)
|
||||
#endif
|
||||
-#endif
|
||||
|
||||
#endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S
|
||||
index 449733cb..c77dc1a9 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S
|
||||
@@ -190,10 +190,8 @@ strcpy_page_cross:
|
||||
beqz has_nul, strcpy_page_cross_ok
|
||||
b strcpy_end
|
||||
END(STRCPY)
|
||||
-#ifndef ANDROID_CHANGES
|
||||
#ifdef _LIBC
|
||||
libc_hidden_builtin_def (STRCPY)
|
||||
#endif
|
||||
-#endif
|
||||
|
||||
#endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S
|
||||
index e9b7cf67..2fe0fb34 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S
|
||||
@@ -107,10 +107,8 @@ strlen_loop_noascii:
|
||||
jr ra
|
||||
END(STRLEN)
|
||||
|
||||
-#ifndef ANDROID_CHANGES
|
||||
#ifdef _LIBC
|
||||
libc_hidden_builtin_def (STRLEN)
|
||||
#endif
|
||||
-#endif
|
||||
|
||||
#endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S
|
||||
index 558df29b..6ec107ca 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S
|
||||
@@ -248,10 +248,8 @@ strncmp_ret0:
|
||||
then exchange(src1,src2). */
|
||||
|
||||
END(STRNCMP)
|
||||
-#ifndef ANDROID_CHANGES
|
||||
#ifdef _LIBC
|
||||
libc_hidden_builtin_def (STRNCMP)
|
||||
#endif
|
||||
-#endif
|
||||
|
||||
#endif
|
||||
diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S
|
||||
index 60eccf00..4a195b7c 100644
|
||||
--- a/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S
|
||||
+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S
|
||||
@@ -136,10 +136,8 @@ L(_hit_limit):
|
||||
move len, limit
|
||||
jr ra
|
||||
END(STRNLEN)
|
||||
-#ifndef ANDROID_CHANGES
|
||||
#ifdef _LIBC
|
||||
libc_hidden_builtin_def (STRNLEN)
|
||||
#endif
|
||||
-#endif
|
||||
|
||||
#endif
|
||||
--
|
||||
2.33.0
|
||||
|
|
@ -1,40 +0,0 @@
|
|||
From f4041e5da609a9f5da966fa000c00b150788a948 Mon Sep 17 00:00:00 2001
|
||||
From: caiyinyu <caiyinyu@loongson.cn>
|
||||
Date: Sun, 23 Jul 2023 14:32:08 +0800
|
||||
Subject: [PATCH 13/14] glibc-2.28: Remove useless IS_LA{264,364,464} and
|
||||
IS_LA{264, 364, 464}.
|
||||
|
||||
Change-Id: Id9a573510e2a493151191372d651f381ec2aefe7
|
||||
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
|
||||
---
|
||||
sysdeps/unix/sysv/linux/loongarch/cpu-features.h | 7 -------
|
||||
1 file changed, 7 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
|
||||
index b46a8489..2703d4f7 100644
|
||||
--- a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
|
||||
+++ b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
|
||||
@@ -22,10 +22,6 @@
|
||||
#include <stdint.h>
|
||||
#include <sys/auxv.h>
|
||||
|
||||
-#define LA264 0x14a000
|
||||
-#define LA364 0x14b000
|
||||
-#define LA464 0x14c011
|
||||
-
|
||||
struct cpu_features
|
||||
{
|
||||
uint64_t cpucfg_prid;
|
||||
@@ -42,9 +38,6 @@ extern const struct cpu_features *_dl_larch_get_cpu_features (void)
|
||||
:"=r"(ret) \
|
||||
:"r"(index));
|
||||
|
||||
-#define IS_LA264(prid) (prid == LA264)
|
||||
-#define IS_LA364(prid) (prid == LA364)
|
||||
-#define IS_LA464(prid) (prid == LA464)
|
||||
#define SUPPORT_UAL (GLRO (dl_hwcap) & HWCAP_LOONGARCH_UAL)
|
||||
#define SUPPORT_LSX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LSX)
|
||||
#define SUPPORT_LASX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LASX)
|
||||
--
|
||||
2.33.0
|
||||
|
|
@ -1,123 +0,0 @@
|
|||
From c94d9376e241dc52eb9f2a2107313b7836e0e9ad Mon Sep 17 00:00:00 2001
|
||||
From: caiyinyu <caiyinyu@loongson.cn>
|
||||
Date: Wed, 6 Sep 2023 16:41:09 +0800
|
||||
Subject: [PATCH 14/14] glibc-2.28: Use RTLD_SUPPORT_{LSX, LASX} to choose
|
||||
_dl_runtime_resolve.
|
||||
|
||||
Key Points:
|
||||
1. On lasx & lsx platforms, use _dl_runtime_resolve_{lsx, lasx} to save vector registers.
|
||||
2. Via "tunables", users can choose str/mem functions with
|
||||
`export GLIBC_TUNABLES=glibc.cpu.hwcaps=LASX`.
|
||||
Note: glibc.cpu.hwcaps doesn't affect _dl_runtime_resolve_{lsx, lasx} selection.
|
||||
|
||||
Usage Notes:
|
||||
1. Only valid inputs: LASX, LSX, UAL. Case-sensitive, comma-separated, no spaces.
|
||||
2. Example: `export GLIBC_TUNABLES=glibc.cpu.hwcaps=LASX,UAL` turns on LASX & UAL.
|
||||
Unmentioned features turn off. With default ifunc: lasx > lsx > unaligned >
|
||||
aligned > generic, effect is: lasx > unaligned > aligned > generic; lsx off.
|
||||
3. Incorrect GLIBC_TUNABLES settings will show error messages.
|
||||
4. Valid input examples:
|
||||
- GLIBC_TUNABLES=glibc.cpu.hwcaps=LASX: lasx > aligned > generic.
|
||||
- GLIBC_TUNABLES=glibc.cpu.hwcaps=LSX,UAL: lsx > unaligned > aligned > generic.
|
||||
- GLIBC_TUNABLES=glibc.cpu.hwcaps=LASX,UAL,LASX,UAL,LSX,LASX,UAL: Repetitions
|
||||
allowed but not recommended. Results in: lasx > lsx > unaligned > aligned >
|
||||
generic.
|
||||
|
||||
Change-Id: I555ce2039bc36bf071fc9265d7b0bb7b93b96ae7
|
||||
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
|
||||
---
|
||||
sysdeps/loongarch/cpu-tunables.c | 2 +-
|
||||
sysdeps/loongarch/dl-machine.h | 11 ++++++-----
|
||||
sysdeps/unix/sysv/linux/loongarch/cpu-features.c | 2 ++
|
||||
sysdeps/unix/sysv/linux/loongarch/cpu-features.h | 10 +++++++---
|
||||
4 files changed, 16 insertions(+), 9 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/loongarch/cpu-tunables.c b/sysdeps/loongarch/cpu-tunables.c
|
||||
index 840c1b8c..e0799ca9 100644
|
||||
--- a/sysdeps/loongarch/cpu-tunables.c
|
||||
+++ b/sysdeps/loongarch/cpu-tunables.c
|
||||
@@ -88,7 +88,7 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
|
||||
}
|
||||
while (*c != '\0');
|
||||
|
||||
- GLRO (dl_hwcap) &= hwcap;
|
||||
+ GLRO (dl_larch_cpu_features).hwcap &= hwcap;
|
||||
}
|
||||
|
||||
#endif
|
||||
diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h
|
||||
index ff520a07..b5f43c84 100644
|
||||
--- a/sysdeps/loongarch/dl-machine.h
|
||||
+++ b/sysdeps/loongarch/dl-machine.h
|
||||
@@ -75,13 +75,14 @@ dl_platform_init (void)
|
||||
GLRO(dl_platform) = NULL;
|
||||
|
||||
#ifdef SHARED
|
||||
+ /* init_cpu_features has been called early from __libc_start_main in
|
||||
+ static executable. */
|
||||
+ init_cpu_features (&GLRO(dl_larch_cpu_features));
|
||||
|
||||
#if HAVE_TUNABLES
|
||||
TUNABLE_GET (hwcaps, tunable_val_t *, TUNABLE_CALLBACK (set_hwcaps));
|
||||
#endif
|
||||
- /* init_cpu_features has been called early from __libc_start_main in
|
||||
- static executable. */
|
||||
- init_cpu_features (&GLRO(dl_larch_cpu_features));
|
||||
+
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -396,9 +397,9 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
|
||||
l->l_mach.plt = gotplt[1] + l->l_addr;
|
||||
|
||||
#if HAVE_LOONGARCH_VEC_ASM && !defined __loongarch_soft_float
|
||||
- if (SUPPORT_LASX)
|
||||
+ if (RTLD_SUPPORT_LASX)
|
||||
gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve_lasx;
|
||||
- else if (SUPPORT_LSX)
|
||||
+ else if (RTLD_SUPPORT_LSX)
|
||||
gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve_lsx;
|
||||
else
|
||||
#endif
|
||||
diff --git a/sysdeps/unix/sysv/linux/loongarch/cpu-features.c b/sysdeps/unix/sysv/linux/loongarch/cpu-features.c
|
||||
index 80870f3c..cf015011 100644
|
||||
--- a/sysdeps/unix/sysv/linux/loongarch/cpu-features.c
|
||||
+++ b/sysdeps/unix/sysv/linux/loongarch/cpu-features.c
|
||||
@@ -29,4 +29,6 @@ init_cpu_features (struct cpu_features *cpu_features)
|
||||
|
||||
__cpucfg(cpucfg_word, 2);
|
||||
cpu_features->cpucfg_word_idx2 = cpucfg_word;
|
||||
+
|
||||
+ GLRO (dl_larch_cpu_features).hwcap = GLRO (dl_hwcap);
|
||||
}
|
||||
diff --git a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
|
||||
index 2703d4f7..17c9f5a7 100644
|
||||
--- a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
|
||||
+++ b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
|
||||
@@ -26,6 +26,7 @@ struct cpu_features
|
||||
{
|
||||
uint64_t cpucfg_prid;
|
||||
uint64_t cpucfg_word_idx2;
|
||||
+ uint64_t hwcap;
|
||||
};
|
||||
|
||||
/* Get a pointer to the CPU features structure. */
|
||||
@@ -38,9 +39,12 @@ extern const struct cpu_features *_dl_larch_get_cpu_features (void)
|
||||
:"=r"(ret) \
|
||||
:"r"(index));
|
||||
|
||||
-#define SUPPORT_UAL (GLRO (dl_hwcap) & HWCAP_LOONGARCH_UAL)
|
||||
-#define SUPPORT_LSX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LSX)
|
||||
-#define SUPPORT_LASX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LASX)
|
||||
+#define SUPPORT_UAL (GLRO (dl_larch_cpu_features).hwcap & HWCAP_LOONGARCH_UAL)
|
||||
+#define SUPPORT_LSX (GLRO (dl_larch_cpu_features).hwcap & HWCAP_LOONGARCH_LSX)
|
||||
+#define SUPPORT_LASX (GLRO (dl_larch_cpu_features).hwcap & HWCAP_LOONGARCH_LASX)
|
||||
+
|
||||
+#define RTLD_SUPPORT_LSX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LSX)
|
||||
+#define RTLD_SUPPORT_LASX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LASX)
|
||||
|
||||
#endif /* _CPU_FEATURES_LOONGARCH64_H */
|
||||
|
||||
--
|
||||
2.33.0
|
||||
|
|
@ -1,91 +0,0 @@
|
|||
From 58b1f882644f839259505dde3205e226a1c649f1 Mon Sep 17 00:00:00 2001
|
||||
From: caiyinyu <caiyinyu@loongson.cn>
|
||||
Date: Tue, 11 Jul 2023 15:42:26 +0800
|
||||
Subject: [PATCH 10/14] glibc-2.28: config: Added HAVE_LOONGARCH_VEC_ASM.
|
||||
|
||||
Change-Id: Iea464ea0c975a351682a60f66251167f6c79385b
|
||||
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
|
||||
---
|
||||
config.h.in | 5 +++++
|
||||
sysdeps/loongarch/configure | 28 ++++++++++++++++++++++++++++
|
||||
sysdeps/loongarch/configure.ac | 15 +++++++++++++++
|
||||
3 files changed, 48 insertions(+)
|
||||
|
||||
diff --git a/config.h.in b/config.h.in
|
||||
index 94d5ea36..fa53cc2d 100644
|
||||
--- a/config.h.in
|
||||
+++ b/config.h.in
|
||||
@@ -123,6 +123,11 @@
|
||||
/* RISC-V floating-point ABI for ld.so. */
|
||||
#undef RISCV_ABI_FLEN
|
||||
|
||||
+/* Assembler support LoongArch LASX/LSX vector instructions.
|
||||
+ This macro becomes obsolete when glibc increased the minimum
|
||||
+ required version of GNU 'binutils' to 2.41 or later. */
|
||||
+#define HAVE_LOONGARCH_VEC_ASM 0
|
||||
+
|
||||
/* Linux specific: minimum supported kernel version. */
|
||||
#undef __LINUX_KERNEL_VERSION
|
||||
|
||||
diff --git a/sysdeps/loongarch/configure b/sysdeps/loongarch/configure
|
||||
index 1e5abf81..0f0dae3a 100755
|
||||
--- a/sysdeps/loongarch/configure
|
||||
+++ b/sysdeps/loongarch/configure
|
||||
@@ -2,3 +2,31 @@
|
||||
# Local configure fragment for sysdeps/loongarch/elf.
|
||||
|
||||
#AC_DEFINE(PI_STATIC_AND_HIDDEN)
|
||||
+
|
||||
+# Check if asm support vector instructions.
|
||||
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for vector support in assembler" >&5
|
||||
+$as_echo_n "checking for vector support in assembler... " >&6; }
|
||||
+if ${libc_cv_loongarch_vec_asm+:} false; then :
|
||||
+ $as_echo_n "(cached) " >&6
|
||||
+else
|
||||
+ cat > conftest.s <<\EOF
|
||||
+ vld $vr0, $sp, 0
|
||||
+EOF
|
||||
+if { ac_try='${CC-cc} -c $CFLAGS conftest.s -o conftest 1>&5'
|
||||
+ { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
|
||||
+ (eval $ac_try) 2>&5
|
||||
+ ac_status=$?
|
||||
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
|
||||
+ test $ac_status = 0; }; }; then
|
||||
+ libc_cv_loongarch_vec_asm=yes
|
||||
+else
|
||||
+ libc_cv_loongarch_vec_asm=no
|
||||
+fi
|
||||
+rm -f conftest*
|
||||
+fi
|
||||
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_loongarch_vec_asm" >&5
|
||||
+$as_echo "$libc_cv_loongarch_vec_asm" >&6; }
|
||||
+if test $libc_cv_loongarch_vec_asm = yes; then
|
||||
+ $as_echo "#define HAVE_LOONGARCH_VEC_ASM 1" >>confdefs.h
|
||||
+
|
||||
+fi
|
||||
diff --git a/sysdeps/loongarch/configure.ac b/sysdeps/loongarch/configure.ac
|
||||
index 67b46ce0..aac0efa9 100644
|
||||
--- a/sysdeps/loongarch/configure.ac
|
||||
+++ b/sysdeps/loongarch/configure.ac
|
||||
@@ -4,3 +4,18 @@ GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory.
|
||||
dnl It is always possible to access static and hidden symbols in an
|
||||
dnl position independent way.
|
||||
#AC_DEFINE(PI_STATIC_AND_HIDDEN)
|
||||
+
|
||||
+# Check if asm support vector instructions.
|
||||
+AC_CACHE_CHECK(for vector support in assembler, libc_cv_loongarch_vec_asm, [dnl
|
||||
+cat > conftest.s <<\EOF
|
||||
+ vld $vr0, $sp, 0
|
||||
+EOF
|
||||
+if AC_TRY_COMMAND(${CC-cc} -c $CFLAGS conftest.s -o conftest 1>&AS_MESSAGE_LOG_FD); then
|
||||
+ libc_cv_loongarch_vec_asm=yes
|
||||
+else
|
||||
+ libc_cv_loongarch_vec_asm=no
|
||||
+fi
|
||||
+rm -f conftest*])
|
||||
+if test $libc_cv_loongarch_vec_asm = yes; then
|
||||
+ AC_DEFINE(HAVE_LOONGARCH_VEC_ASM)
|
||||
+fi
|
||||
--
|
||||
2.33.0
|
||||
|
|
@ -1,75 +0,0 @@
|
|||
From 0153532f680527c4378a10673518cabda2e02584 Mon Sep 17 00:00:00 2001
|
||||
From: caiyinyu <caiyinyu@loongson.cn>
|
||||
Date: Fri, 26 May 2023 14:58:39 +0800
|
||||
Subject: [PATCH 05/14] glibc-2.28: remove ABILPX32 related code.
|
||||
|
||||
Change-Id: I73eb5bc4d4ca12e4d45ed6b533fa38d60a3a633f
|
||||
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
|
||||
---
|
||||
elf/elf.h | 3 +--
|
||||
sysdeps/loongarch/dl-machine.h | 2 --
|
||||
sysdeps/loongarch/nptl/bits/pthreadtypes-arch.h | 2 +-
|
||||
sysdeps/loongarch/sys/regdef.h | 4 +---
|
||||
4 files changed, 3 insertions(+), 8 deletions(-)
|
||||
|
||||
diff --git a/elf/elf.h b/elf/elf.h
|
||||
index 65d1fb46..4bfbad61 100644
|
||||
--- a/elf/elf.h
|
||||
+++ b/elf/elf.h
|
||||
@@ -3933,10 +3933,9 @@ enum
|
||||
#define R_NDS32_TLS_TPOFF 102
|
||||
#define R_NDS32_TLS_DESC 119
|
||||
|
||||
-/* LoongISA ELF Flags */
|
||||
+/* LoongArch ELF Flags */
|
||||
#define EF_LARCH_ABI 0x0003
|
||||
#define EF_LARCH_ABI_LP64 0x0003
|
||||
-#define EF_LARCH_ABI_LPX32 0x0002
|
||||
#define EF_LARCH_ABI_LP32 0x0001
|
||||
|
||||
/* Loongarch specific dynamic relocations. */
|
||||
diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h
|
||||
index 2d527241..6e9c6258 100644
|
||||
--- a/sysdeps/loongarch/dl-machine.h
|
||||
+++ b/sysdeps/loongarch/dl-machine.h
|
||||
@@ -96,8 +96,6 @@ elf_machine_matches_host (const ElfW(Ehdr) *ehdr)
|
||||
|
||||
#ifdef _ABILP64
|
||||
if ((ehdr->e_flags & EF_LARCH_ABI) != EF_LARCH_ABI_LP64)
|
||||
-#elif defined _ABILPX32
|
||||
- if ((ehdr->e_flags & EF_LARCH_ABI) != EF_LARCH_ABI_LPX32)
|
||||
#elif defined _ABILP32
|
||||
if ((ehdr->e_flags & EF_LARCH_ABI) != EF_LARCH_ABI_LP32)
|
||||
#else
|
||||
diff --git a/sysdeps/loongarch/nptl/bits/pthreadtypes-arch.h b/sysdeps/loongarch/nptl/bits/pthreadtypes-arch.h
|
||||
index 5a761355..aa63bce1 100644
|
||||
--- a/sysdeps/loongarch/nptl/bits/pthreadtypes-arch.h
|
||||
+++ b/sysdeps/loongarch/nptl/bits/pthreadtypes-arch.h
|
||||
@@ -32,7 +32,7 @@
|
||||
# define __SIZEOF_PTHREAD_BARRIER_T 32
|
||||
# define __SIZEOF_PTHREAD_BARRIERATTR_T 4
|
||||
#else
|
||||
-# error "rv32i-based systems are not supported"
|
||||
+# error "32-bit based systems are not supported"
|
||||
#endif
|
||||
|
||||
#define __PTHREAD_COMPAT_PADDING_MID
|
||||
diff --git a/sysdeps/loongarch/sys/regdef.h b/sysdeps/loongarch/sys/regdef.h
|
||||
index 769784b8..36f00939 100644
|
||||
--- a/sysdeps/loongarch/sys/regdef.h
|
||||
+++ b/sysdeps/loongarch/sys/regdef.h
|
||||
@@ -72,10 +72,8 @@
|
||||
# define fs6 $f30
|
||||
# define fs7 $f31
|
||||
|
||||
-#elif _LOONGARCH_SIM == _ABILPX32
|
||||
-# error ABILPX32
|
||||
#elif _LOONGARCH_SIM == _ABILP32
|
||||
-# error ABILP32
|
||||
+# error ABILP32 not support yet
|
||||
#else
|
||||
# error noABI
|
||||
#endif
|
||||
--
|
||||
2.33.0
|
||||
|
File diff suppressed because it is too large
Load diff
BIN
glibc-2.38.tar.xz
Normal file
BIN
glibc-2.38.tar.xz
Normal file
Binary file not shown.
|
@ -1,29 +0,0 @@
|
|||
From 4e32231c73cb7a9b004fb160d03c26498ec3860d Mon Sep 17 00:00:00 2001
|
||||
From: Zhao Hang <wb-zh951434@alibaba-inc.com>
|
||||
Date: Tue, 28 May 2024 10:10:11 +0800
|
||||
Subject: [PATCH] Add Hygon Support
|
||||
|
||||
Signed-off-by: Zhao Hang <wb-zh951434@alibaba-inc.com>
|
||||
---
|
||||
sysdeps/x86/cpu-features.c | 5 +++--
|
||||
1 file changed, 3 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
|
||||
index 1248a702..c72dc2f3 100644
|
||||
--- a/sysdeps/x86/cpu-features.c
|
||||
+++ b/sysdeps/x86/cpu-features.c
|
||||
@@ -546,8 +546,9 @@ init_cpu_features (struct cpu_features *cpu_features)
|
||||
|= bit_arch_Prefer_AVX2_STRCMP;
|
||||
}
|
||||
}
|
||||
- /* This spells out "AuthenticAMD". */
|
||||
- else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
|
||||
+ /* This spells out "AuthenticAMD" or "HygonGenuine". */
|
||||
+ else if ((ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)||(ebx == 0x6f677948 && ecx == 0x656e6975 && edx == 0x6e65476e))
|
||||
+
|
||||
{
|
||||
unsigned int extended_model;
|
||||
|
||||
--
|
||||
2.31.1
|
||||
|
|
@ -1,147 +0,0 @@
|
|||
From 58f93dff514cc0bdf3c72eff590dcf5fe5bf9e00 Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Wed, 19 Jul 2023 23:09:09 +0800
|
||||
Subject: [PATCH 3/6] Add a testcase to check alignment of PT_LOAD segment [BZ
|
||||
#28676]
|
||||
|
||||
Backport from master commit: fc2334a
|
||||
|
||||
Signed-off-by: Rongwei Wang <rongwei.wang@linux.alibaba.com>
|
||||
---
|
||||
elf/Makefile | 13 ++++++++++++-
|
||||
elf/tst-align3.c | 38 ++++++++++++++++++++++++++++++++++++++
|
||||
elf/tst-alignmod3.c | 32 ++++++++++++++++++++++++++++++++
|
||||
3 files changed, 82 insertions(+), 1 deletion(-)
|
||||
create mode 100644 elf/tst-align3.c
|
||||
create mode 100644 elf/tst-alignmod3.c
|
||||
|
||||
diff --git a/elf/Makefile b/elf/Makefile
|
||||
index 634c3113..442817ca 100644
|
||||
--- a/elf/Makefile
|
||||
+++ b/elf/Makefile
|
||||
@@ -331,6 +331,7 @@ tests += \
|
||||
tst-addr1 \
|
||||
tst-align \
|
||||
tst-align2 \
|
||||
+ tst-align3 \
|
||||
tst-audit-tlsdesc \
|
||||
tst-audit-tlsdesc-dlopen \
|
||||
tst-audit1 \
|
||||
@@ -466,7 +467,9 @@ endif
|
||||
test-srcs = \
|
||||
tst-pathopt
|
||||
# tests-srcs
|
||||
-
|
||||
+ifeq (yes,$(have-fpie))
|
||||
+tests-pie += tst-align3
|
||||
+endif
|
||||
selinux-enabled := $(shell cat /selinux/enforce 2> /dev/null)
|
||||
|
||||
ifneq ($(selinux-enabled),1)
|
||||
@@ -647,6 +650,7 @@ modules-names = \
|
||||
tst-absolute-zero-lib \
|
||||
tst-alignmod \
|
||||
tst-alignmod2 \
|
||||
+ tst-alignmod3 \
|
||||
tst-array2dep \
|
||||
tst-array5dep \
|
||||
tst-audit-tlsdesc-mod1 \
|
||||
@@ -1669,6 +1673,13 @@ CFLAGS-tst-alignmod2.c += $(stack-align-test-flags)
|
||||
$(objpfx)tst-align: $(libdl)
|
||||
$(objpfx)tst-align.out: $(objpfx)tst-alignmod.so
|
||||
$(objpfx)tst-align2: $(objpfx)tst-alignmod2.so
|
||||
+$(objpfx)tst-align3: $(objpfx)tst-alignmod3.so
|
||||
+ifeq (yes,$(have-fpie))
|
||||
+CFLAGS-tst-align3.c += $(PIE-ccflag)
|
||||
+endif
|
||||
+LDFLAGS-tst-align3 += -Wl,-z,max-page-size=0x200000
|
||||
+LDFLAGS-tst-alignmod3.so += -Wl,-z,max-page-size=0x200000
|
||||
+$(objpfx)tst-alignmod3.so: $(libsupport)
|
||||
|
||||
$(objpfx)unload3: $(libdl)
|
||||
$(objpfx)unload3.out: $(objpfx)unload3mod1.so $(objpfx)unload3mod2.so \
|
||||
diff --git a/elf/tst-align3.c b/elf/tst-align3.c
|
||||
new file mode 100644
|
||||
index 00000000..ac86d623
|
||||
--- /dev/null
|
||||
+++ b/elf/tst-align3.c
|
||||
@@ -0,0 +1,38 @@
|
||||
+/* Check alignment of PT_LOAD segment in a shared library.
|
||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <support/check.h>
|
||||
+#include <tst-stack-align.h>
|
||||
+
|
||||
+/* This should cover all possible page sizes we currently support. */
|
||||
+#define ALIGN 0x200000
|
||||
+
|
||||
+int bar __attribute__ ((aligned (ALIGN))) = 1;
|
||||
+
|
||||
+extern int do_load_test (void);
|
||||
+
|
||||
+static int
|
||||
+do_test (void)
|
||||
+{
|
||||
+ printf ("bar: %p\n", &bar);
|
||||
+ TEST_VERIFY (is_aligned (&bar, ALIGN) == 0);
|
||||
+
|
||||
+ return do_load_test ();
|
||||
+}
|
||||
+
|
||||
+#include <support/test-driver.c>
|
||||
diff --git a/elf/tst-alignmod3.c b/elf/tst-alignmod3.c
|
||||
new file mode 100644
|
||||
index 00000000..0d33f237
|
||||
--- /dev/null
|
||||
+++ b/elf/tst-alignmod3.c
|
||||
@@ -0,0 +1,32 @@
|
||||
+/* Check alignment of PT_LOAD segment in a shared library.
|
||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <support/check.h>
|
||||
+#include <tst-stack-align.h>
|
||||
+
|
||||
+/* This should cover all possible page sizes we currently support. */
|
||||
+#define ALIGN 0x200000
|
||||
+
|
||||
+int foo __attribute__ ((aligned (ALIGN))) = 1;
|
||||
+
|
||||
+void
|
||||
+do_load_test (void)
|
||||
+{
|
||||
+ printf ("foo: %p\n", &foo);
|
||||
+ TEST_VERIFY (is_aligned (&foo, ALIGN) == 0);
|
||||
+}
|
||||
--
|
||||
2.27.0
|
||||
|
|
@ -1,325 +0,0 @@
|
|||
From 6152628751bf13f74c9336263a9c22f29ccd8ffb Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Wed, 19 Jul 2023 23:01:53 +0800
|
||||
Subject: [PATCH 1/6] Properly check stack alignment [BZ #27901]
|
||||
|
||||
1. Replace
|
||||
|
||||
if ((((uintptr_t) &_d) & (__alignof (double) - 1)) != 0)
|
||||
|
||||
which may be optimized out by compiler, with
|
||||
|
||||
int
|
||||
__attribute__ ((weak, noclone, noinline))
|
||||
is_aligned (void *p, int align)
|
||||
{
|
||||
return (((uintptr_t) p) & (align - 1)) != 0;
|
||||
}
|
||||
|
||||
2. Add TEST_STACK_ALIGN_INIT to TEST_STACK_ALIGN.
|
||||
3. Add a common TEST_STACK_ALIGN_INIT to check 16-byte stack alignment
|
||||
for both i386 and x86-64.
|
||||
4. Update powerpc to use TEST_STACK_ALIGN_INIT.
|
||||
|
||||
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
|
||||
Signed-off-by: Rongwei Wang <rongwei.wang@linux.alibaba.com>
|
||||
---
|
||||
sysdeps/generic/tst-stack-align.h | 40 ++++++++++++++++---------
|
||||
sysdeps/i386/i686/tst-stack-align.h | 44 ---------------------------
|
||||
sysdeps/i386/tst-stack-align.h | 41 -------------------------
|
||||
sysdeps/powerpc/tst-stack-align.h | 27 +++++------------
|
||||
sysdeps/x86/tst-stack-align.h | 28 ++++++++++++++++++
|
||||
sysdeps/x86_64/tst-stack-align.h | 46 -----------------------------
|
||||
6 files changed, 61 insertions(+), 165 deletions(-)
|
||||
delete mode 100644 sysdeps/i386/i686/tst-stack-align.h
|
||||
delete mode 100644 sysdeps/i386/tst-stack-align.h
|
||||
create mode 100644 sysdeps/x86/tst-stack-align.h
|
||||
delete mode 100644 sysdeps/x86_64/tst-stack-align.h
|
||||
|
||||
diff --git a/sysdeps/generic/tst-stack-align.h b/sysdeps/generic/tst-stack-align.h
|
||||
index e5cb3310..e6050901 100644
|
||||
--- a/sysdeps/generic/tst-stack-align.h
|
||||
+++ b/sysdeps/generic/tst-stack-align.h
|
||||
@@ -1,4 +1,5 @@
|
||||
-/* Copyright (C) 2003-2018 Free Software Foundation, Inc.
|
||||
+/* Check stack alignment. Generic version.
|
||||
+ Copyright (C) 2003-2021 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
@@ -18,17 +19,28 @@
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
|
||||
+int
|
||||
+__attribute__ ((weak, noclone, noinline))
|
||||
+is_aligned (void *p, int align)
|
||||
+{
|
||||
+ return (((uintptr_t) p) & (align - 1)) != 0;
|
||||
+}
|
||||
+
|
||||
+#ifndef TEST_STACK_ALIGN_INIT
|
||||
+# define TEST_STACK_ALIGN_INIT() 0
|
||||
+#endif
|
||||
+
|
||||
#define TEST_STACK_ALIGN() \
|
||||
- ({ \
|
||||
- double _d = 12.0; \
|
||||
- long double _ld = 15.0; \
|
||||
- int _ret = 0; \
|
||||
- printf ("double: %g %p %zu\n", _d, &_d, __alignof (double)); \
|
||||
- if ((((uintptr_t) &_d) & (__alignof (double) - 1)) != 0) \
|
||||
- _ret = 1; \
|
||||
- \
|
||||
- printf ("ldouble: %Lg %p %zu\n", _ld, &_ld, __alignof (long double)); \
|
||||
- if ((((uintptr_t) &_ld) & (__alignof (long double) - 1)) != 0) \
|
||||
- _ret = 1; \
|
||||
- _ret; \
|
||||
- })
|
||||
+ ({ \
|
||||
+ double _d = 12.0; \
|
||||
+ long double _ld = 15.0; \
|
||||
+ int _ret = TEST_STACK_ALIGN_INIT (); \
|
||||
+ \
|
||||
+ printf ("double: %g %p %zu\n", _d, &_d, __alignof (double)); \
|
||||
+ _ret += is_aligned (&_d, __alignof (double)); \
|
||||
+ \
|
||||
+ printf ("ldouble: %Lg %p %zu\n", _ld, &_ld, \
|
||||
+ __alignof (long double)); \
|
||||
+ _ret += is_aligned (&_ld, __alignof (long double)); \
|
||||
+ _ret; \
|
||||
+ })
|
||||
diff --git a/sysdeps/i386/i686/tst-stack-align.h b/sysdeps/i386/i686/tst-stack-align.h
|
||||
deleted file mode 100644
|
||||
index 975f26ef..00000000
|
||||
--- a/sysdeps/i386/i686/tst-stack-align.h
|
||||
+++ /dev/null
|
||||
@@ -1,44 +0,0 @@
|
||||
-/* Copyright (C) 2003-2018 Free Software Foundation, Inc.
|
||||
- This file is part of the GNU C Library.
|
||||
-
|
||||
- The GNU C Library is free software; you can redistribute it and/or
|
||||
- modify it under the terms of the GNU Lesser General Public
|
||||
- License as published by the Free Software Foundation; either
|
||||
- version 2.1 of the License, or (at your option) any later version.
|
||||
-
|
||||
- The GNU C Library is distributed in the hope that it will be useful,
|
||||
- but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
- Lesser General Public License for more details.
|
||||
-
|
||||
- You should have received a copy of the GNU Lesser General Public
|
||||
- License along with the GNU C Library; if not, see
|
||||
- <http://www.gnu.org/licenses/>. */
|
||||
-
|
||||
-#include <stdio.h>
|
||||
-#include <stdint.h>
|
||||
-#ifndef __SSE__
|
||||
-#include_next <tst-stack-align.h>
|
||||
-#else
|
||||
-#include <xmmintrin.h>
|
||||
-
|
||||
-#define TEST_STACK_ALIGN() \
|
||||
- ({ \
|
||||
- __m128 _m; \
|
||||
- double _d = 12.0; \
|
||||
- long double _ld = 15.0; \
|
||||
- int _ret = 0; \
|
||||
- printf ("__m128: %p %zu\n", &_m, __alignof (__m128)); \
|
||||
- if ((((uintptr_t) &_m) & (__alignof (__m128) - 1)) != 0) \
|
||||
- _ret = 1; \
|
||||
- \
|
||||
- printf ("double: %g %p %zu\n", _d, &_d, __alignof (double)); \
|
||||
- if ((((uintptr_t) &_d) & (__alignof (double) - 1)) != 0) \
|
||||
- _ret = 1; \
|
||||
- \
|
||||
- printf ("ldouble: %Lg %p %zu\n", _ld, &_ld, __alignof (long double)); \
|
||||
- if ((((uintptr_t) &_ld) & (__alignof (long double) - 1)) != 0) \
|
||||
- _ret = 1; \
|
||||
- _ret; \
|
||||
- })
|
||||
-#endif
|
||||
diff --git a/sysdeps/i386/tst-stack-align.h b/sysdeps/i386/tst-stack-align.h
|
||||
deleted file mode 100644
|
||||
index 394ff773..00000000
|
||||
--- a/sysdeps/i386/tst-stack-align.h
|
||||
+++ /dev/null
|
||||
@@ -1,41 +0,0 @@
|
||||
-/* Copyright (C) 2004-2018 Free Software Foundation, Inc.
|
||||
- This file is part of the GNU C Library.
|
||||
-
|
||||
- The GNU C Library is free software; you can redistribute it and/or
|
||||
- modify it under the terms of the GNU Lesser General Public
|
||||
- License as published by the Free Software Foundation; either
|
||||
- version 2.1 of the License, or (at your option) any later version.
|
||||
-
|
||||
- The GNU C Library is distributed in the hope that it will be useful,
|
||||
- but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
- Lesser General Public License for more details.
|
||||
-
|
||||
- You should have received a copy of the GNU Lesser General Public
|
||||
- License along with the GNU C Library; if not, see
|
||||
- <http://www.gnu.org/licenses/>. */
|
||||
-
|
||||
-#include <stdio.h>
|
||||
-#include <stdint.h>
|
||||
-
|
||||
-typedef struct { int i[4]; } int_al16 __attribute__((aligned (16)));
|
||||
-
|
||||
-#define TEST_STACK_ALIGN() \
|
||||
- ({ \
|
||||
- int_al16 _m; \
|
||||
- double _d = 12.0; \
|
||||
- long double _ld = 15.0; \
|
||||
- int _ret = 0; \
|
||||
- printf ("int_al16: %p %zu\n", &_m, __alignof (int_al16)); \
|
||||
- if ((((uintptr_t) &_m) & (__alignof (int_al16) - 1)) != 0) \
|
||||
- _ret = 1; \
|
||||
- \
|
||||
- printf ("double: %g %p %zu\n", _d, &_d, __alignof (double)); \
|
||||
- if ((((uintptr_t) &_d) & (__alignof (double) - 1)) != 0) \
|
||||
- _ret = 1; \
|
||||
- \
|
||||
- printf ("ldouble: %Lg %p %zu\n", _ld, &_ld, __alignof (long double)); \
|
||||
- if ((((uintptr_t) &_ld) & (__alignof (long double) - 1)) != 0) \
|
||||
- _ret = 1; \
|
||||
- _ret; \
|
||||
- })
|
||||
diff --git a/sysdeps/powerpc/tst-stack-align.h b/sysdeps/powerpc/tst-stack-align.h
|
||||
index 7fd7013b..d7400b28 100644
|
||||
--- a/sysdeps/powerpc/tst-stack-align.h
|
||||
+++ b/sysdeps/powerpc/tst-stack-align.h
|
||||
@@ -1,4 +1,5 @@
|
||||
-/* Copyright (C) 2005-2018 Free Software Foundation, Inc.
|
||||
+/* Check stack alignment. PowerPC version.
|
||||
+ Copyright (C) 2005-2021 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
@@ -15,10 +16,7 @@
|
||||
License along with the GNU C Library; if not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
-#include <stdio.h>
|
||||
-#include <stdint.h>
|
||||
-
|
||||
-#define TEST_STACK_ALIGN() \
|
||||
+#define TEST_STACK_ALIGN_INIT() \
|
||||
({ \
|
||||
/* Altivec __vector int etc. needs 16byte aligned stack. \
|
||||
Instead of using altivec.h here, use aligned attribute instead. */ \
|
||||
@@ -27,20 +25,9 @@
|
||||
int _i __attribute__((aligned (16))); \
|
||||
int _j[3]; \
|
||||
} _s = { ._i = 18, ._j[0] = 19, ._j[1] = 20, ._j[2] = 21 }; \
|
||||
- double _d = 12.0; \
|
||||
- long double _ld = 15.0; \
|
||||
- int _ret = 0; \
|
||||
printf ("__vector int: { %d, %d, %d, %d } %p %zu\n", _s._i, _s._j[0], \
|
||||
_s._j[1], _s._j[2], &_s, __alignof (_s)); \
|
||||
- if ((((uintptr_t) &_s) & (__alignof (_s) - 1)) != 0) \
|
||||
- _ret = 1; \
|
||||
- \
|
||||
- printf ("double: %g %p %zu\n", _d, &_d, __alignof (double)); \
|
||||
- if ((((uintptr_t) &_d) & (__alignof (double) - 1)) != 0) \
|
||||
- _ret = 1; \
|
||||
- \
|
||||
- printf ("ldouble: %Lg %p %zu\n", _ld, &_ld, __alignof (long double)); \
|
||||
- if ((((uintptr_t) &_ld) & (__alignof (long double) - 1)) != 0) \
|
||||
- _ret = 1; \
|
||||
- _ret; \
|
||||
- })
|
||||
+ is_aligned (&_s, __alignof (_s)); \
|
||||
+ })
|
||||
+
|
||||
+#include_next <tst-stack-align.h>
|
||||
diff --git a/sysdeps/x86/tst-stack-align.h b/sysdeps/x86/tst-stack-align.h
|
||||
new file mode 100644
|
||||
index 00000000..02ecc72d
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86/tst-stack-align.h
|
||||
@@ -0,0 +1,28 @@
|
||||
+/* Check stack alignment. X86 version.
|
||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+typedef struct { int i[16]; } int_al16 __attribute__((aligned (16)));
|
||||
+
|
||||
+#define TEST_STACK_ALIGN_INIT() \
|
||||
+ ({ \
|
||||
+ int_al16 _m; \
|
||||
+ printf ("int_al16: %p %zu\n", &_m, __alignof (int_al16)); \
|
||||
+ is_aligned (&_m, __alignof (int_al16)); \
|
||||
+ })
|
||||
+
|
||||
+#include_next <tst-stack-align.h>
|
||||
diff --git a/sysdeps/x86_64/tst-stack-align.h b/sysdeps/x86_64/tst-stack-align.h
|
||||
deleted file mode 100644
|
||||
index b2ef77f6..00000000
|
||||
--- a/sysdeps/x86_64/tst-stack-align.h
|
||||
+++ /dev/null
|
||||
@@ -1,46 +0,0 @@
|
||||
-/* Copyright (C) 2003-2018 Free Software Foundation, Inc.
|
||||
- This file is part of the GNU C Library.
|
||||
-
|
||||
- The GNU C Library is free software; you can redistribute it and/or
|
||||
- modify it under the terms of the GNU Lesser General Public
|
||||
- License as published by the Free Software Foundation; either
|
||||
- version 2.1 of the License, or (at your option) any later version.
|
||||
-
|
||||
- The GNU C Library is distributed in the hope that it will be useful,
|
||||
- but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
- Lesser General Public License for more details.
|
||||
-
|
||||
- You should have received a copy of the GNU Lesser General Public
|
||||
- License along with the GNU C Library; if not, see
|
||||
- <http://www.gnu.org/licenses/>. */
|
||||
-
|
||||
-#include <stdio.h>
|
||||
-#include <stdint.h>
|
||||
-
|
||||
-#define TEST_STACK_ALIGN() \
|
||||
- ({ \
|
||||
- /* AMD64 ABI mandates 16byte aligned stack. \
|
||||
- Unfortunately, current GCC doesn't support __int128 or __float128 \
|
||||
- types, so use aligned attribute instead. */ \
|
||||
- struct _S \
|
||||
- { \
|
||||
- int _i __attribute__((aligned (16))); \
|
||||
- int _pad[3]; \
|
||||
- } _s = { ._i = 18 }; \
|
||||
- double _d = 12.0; \
|
||||
- long double _ld = 15.0; \
|
||||
- int _ret = 0; \
|
||||
- printf ("__int128: %d %p %zu\n", _s._i, &_s, __alignof (_s)); \
|
||||
- if ((((uintptr_t) &_s) & (__alignof (_s) - 1)) != 0) \
|
||||
- _ret = 1; \
|
||||
- \
|
||||
- printf ("double: %g %p %zu\n", _d, &_d, __alignof (double)); \
|
||||
- if ((((uintptr_t) &_d) & (__alignof (double) - 1)) != 0) \
|
||||
- _ret = 1; \
|
||||
- \
|
||||
- printf ("ldouble: %Lg %p %zu\n", _ld, &_ld, __alignof (long double)); \
|
||||
- if ((((uintptr_t) &_ld) & (__alignof (long double) - 1)) != 0) \
|
||||
- _ret = 1; \
|
||||
- _ret; \
|
||||
- })
|
||||
--
|
||||
2.27.0
|
||||
|
|
@ -1,112 +0,0 @@
|
|||
commit 849274d48fc59bfa6db3c713c8ced8026b20f3b7
|
||||
Author: Florian Weimer <fweimer@redhat.com>
|
||||
Date: Thu Nov 16 19:55:35 2023 +0100
|
||||
|
||||
elf: Fix force_first handling in dlclose (bug 30981)
|
||||
|
||||
The force_first parameter was ineffective because the dlclose'd
|
||||
object was not necessarily the first in the maps array. Also
|
||||
enable force_first handling unconditionally, regardless of namespace.
|
||||
The initial object in a namespace should be destructed first, too.
|
||||
|
||||
The _dl_sort_maps_dfs function had early returns for relocation
|
||||
dependency processing which broke force_first handling, too, and
|
||||
this is fixed in this change as well.
|
||||
|
||||
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
|
||||
|
||||
diff --git a/elf/dl-close.c b/elf/dl-close.c
|
||||
index 66524b6708c59f29..8107c2d5f6ad2bc6 100644
|
||||
--- a/elf/dl-close.c
|
||||
+++ b/elf/dl-close.c
|
||||
@@ -182,6 +182,16 @@ _dl_close_worker (struct link_map *map, bool force)
|
||||
}
|
||||
assert (idx == nloaded);
|
||||
|
||||
+ /* Put the dlclose'd map first, so that its destructor runs first.
|
||||
+ The map variable is NULL after a retry. */
|
||||
+ if (map != NULL)
|
||||
+ {
|
||||
+ maps[map->l_idx] = maps[0];
|
||||
+ maps[map->l_idx]->l_idx = map->l_idx;
|
||||
+ maps[0] = map;
|
||||
+ maps[0]->l_idx = 0;
|
||||
+ }
|
||||
+
|
||||
/* Keep track of the lowest index link map we have covered already. */
|
||||
int done_index = -1;
|
||||
while (++done_index < nloaded)
|
||||
@@ -255,9 +265,10 @@ _dl_close_worker (struct link_map *map, bool force)
|
||||
}
|
||||
}
|
||||
|
||||
- /* Sort the entries. We can skip looking for the binary itself which is
|
||||
- at the front of the search list for the main namespace. */
|
||||
- _dl_sort_maps (maps, nloaded, (nsid == LM_ID_BASE), true);
|
||||
+ /* Sort the entries. Unless retrying, the maps[0] object (the
|
||||
+ original argument to dlclose) needs to remain first, so that its
|
||||
+ destructor runs first. */
|
||||
+ _dl_sort_maps (maps, nloaded, /* force_first */ map != NULL, true);
|
||||
|
||||
/* Call all termination functions at once. */
|
||||
bool unload_any = false;
|
||||
@@ -768,7 +779,11 @@ _dl_close_worker (struct link_map *map, bool force)
|
||||
/* Recheck if we need to retry, release the lock. */
|
||||
out:
|
||||
if (dl_close_state == rerun)
|
||||
- goto retry;
|
||||
+ {
|
||||
+ /* The map may have been deallocated. */
|
||||
+ map = NULL;
|
||||
+ goto retry;
|
||||
+ }
|
||||
|
||||
dl_close_state = not_pending;
|
||||
}
|
||||
diff --git a/elf/dl-sort-maps.c b/elf/dl-sort-maps.c
|
||||
index aeb79b40b45054c0..c17ac325eca658ef 100644
|
||||
--- a/elf/dl-sort-maps.c
|
||||
+++ b/elf/dl-sort-maps.c
|
||||
@@ -260,13 +260,12 @@ _dl_sort_maps_dfs (struct link_map **maps, unsigned int nmaps,
|
||||
The below memcpy is not needed in the do_reldeps case here,
|
||||
since we wrote back to maps[] during DFS traversal. */
|
||||
if (maps_head == maps)
|
||||
- return;
|
||||
+ break;
|
||||
}
|
||||
assert (maps_head == maps);
|
||||
- return;
|
||||
}
|
||||
-
|
||||
- memcpy (maps, rpo, sizeof (struct link_map *) * nmaps);
|
||||
+ else
|
||||
+ memcpy (maps, rpo, sizeof (struct link_map *) * nmaps);
|
||||
|
||||
/* Skipping the first object at maps[0] is not valid in general,
|
||||
since traversing along object dependency-links may "find" that
|
||||
diff --git a/elf/dso-sort-tests-1.def b/elf/dso-sort-tests-1.def
|
||||
index 4bf9052db16fb352..cf6453e9eb85ac65 100644
|
||||
--- a/elf/dso-sort-tests-1.def
|
||||
+++ b/elf/dso-sort-tests-1.def
|
||||
@@ -56,14 +56,16 @@ output: b>a>{}<a<b
|
||||
# relocation(dynamic) dependencies. While this is technically unspecified, the
|
||||
# presumed reasonable practical behavior is for the destructor order to respect
|
||||
# the static DT_NEEDED links (here this means the a->b->c->d order).
|
||||
-# The older dynamic_sort=1 algorithm does not achieve this, while the DFS-based
|
||||
-# dynamic_sort=2 algorithm does, although it is still arguable whether going
|
||||
-# beyond spec to do this is the right thing to do.
|
||||
+# The older dynamic_sort=1 algorithm originally did not achieve this,
|
||||
+# but this was a bug in the way _dl_sort_maps was called from _dl_close_worker,
|
||||
+# effectively disabling proper force_first handling.
|
||||
+# The new dynamic_sort=2 algorithm shows the effect of the simpler force_first
|
||||
+# handling: the a object is simply moved to the front.
|
||||
# The below expected outputs are what the two algorithms currently produce
|
||||
# respectively, for regression testing purposes.
|
||||
tst-bz15311: {+a;+e;+f;+g;+d;%d;-d;-g;-f;-e;-a};a->b->c->d;d=>[ba];c=>a;b=>e=>a;c=>f=>b;d=>g=>c
|
||||
-output(glibc.rtld.dynamic_sort=1): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<a<c<d<g<f<b<e];}
|
||||
-output(glibc.rtld.dynamic_sort=2): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<g<f<a<b<c<d<e];}
|
||||
+output(glibc.rtld.dynamic_sort=1): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<a<b<c<d<g<f<e];}
|
||||
+output(glibc.rtld.dynamic_sort=2): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<a<g<f<b<c<d<e];}
|
||||
|
||||
# Test that even in the presence of dependency loops involving dlopen'ed
|
||||
# object, that object is initialized last (and not unloaded prematurely).
|
|
@ -1,83 +0,0 @@
|
|||
commit c00b984fcd53f679ca2dafcd1aee2c89836e6e73
|
||||
Author: Florian Weimer <fweimer@redhat.com>
|
||||
Date: Tue Aug 29 08:28:31 2023 +0200
|
||||
|
||||
nscd: Skip unusable entries in first pass in prune_cache (bug 30800)
|
||||
|
||||
Previously, if an entry was marked unusable for any reason, but had
|
||||
not timed out yet, the assert would trigger.
|
||||
|
||||
One way to get into such state is if a data change is detected during
|
||||
re-validation of an entry. This causes the entry to be marked as not
|
||||
usable. If exits nscd soon after that, then the clock jumps
|
||||
backwards, and nscd restarted, the cache re-validation run after
|
||||
startup triggers the removed assert.
|
||||
|
||||
The change is more complicated than just the removal of the assert
|
||||
because entries marked as not usable should be garbage-collected in
|
||||
the second pass. To make this happen, it is necessary to update some
|
||||
book-keeping data.
|
||||
|
||||
Reviewed-by: DJ Delorie <dj@redhat.com>
|
||||
|
||||
diff --git a/nscd/cache.c b/nscd/cache.c
|
||||
index efe4214d953edb30..2fd3f78ebb567bbe 100644
|
||||
--- a/nscd/cache.c
|
||||
+++ b/nscd/cache.c
|
||||
@@ -371,8 +371,11 @@ prune_cache (struct database_dyn *table, time_t now, int fd)
|
||||
serv2str[runp->type], str, dh->timeout);
|
||||
}
|
||||
|
||||
- /* Check whether the entry timed out. */
|
||||
- if (dh->timeout < now)
|
||||
+ /* Check whether the entry timed out. Timed out entries
|
||||
+ will be revalidated. For unusable records, it is still
|
||||
+ necessary to record that the bucket needs to be scanned
|
||||
+ again below. */
|
||||
+ if (dh->timeout < now || !dh->usable)
|
||||
{
|
||||
/* This hash bucket could contain entries which need to
|
||||
be looked at. */
|
||||
@@ -384,7 +387,7 @@ prune_cache (struct database_dyn *table, time_t now, int fd)
|
||||
/* We only have to look at the data of the first entries
|
||||
since the count information is kept in the data part
|
||||
which is shared. */
|
||||
- if (runp->first)
|
||||
+ if (runp->first && dh->usable)
|
||||
{
|
||||
|
||||
/* At this point there are two choices: we reload the
|
||||
@@ -400,9 +403,6 @@ prune_cache (struct database_dyn *table, time_t now, int fd)
|
||||
{
|
||||
/* Remove the value. */
|
||||
dh->usable = false;
|
||||
-
|
||||
- /* We definitely have some garbage entries now. */
|
||||
- any = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -414,18 +414,15 @@ prune_cache (struct database_dyn *table, time_t now, int fd)
|
||||
|
||||
time_t timeout = readdfcts[runp->type] (table, runp, dh);
|
||||
next_timeout = MIN (next_timeout, timeout);
|
||||
-
|
||||
- /* If the entry has been replaced, we might need
|
||||
- cleanup. */
|
||||
- any |= !dh->usable;
|
||||
}
|
||||
}
|
||||
+
|
||||
+ /* If the entry has been replaced, we might need cleanup. */
|
||||
+ any |= !dh->usable;
|
||||
}
|
||||
else
|
||||
- {
|
||||
- assert (dh->usable);
|
||||
- next_timeout = MIN (next_timeout, dh->timeout);
|
||||
- }
|
||||
+ /* Entry has not timed out and is usable. */
|
||||
+ next_timeout = MIN (next_timeout, dh->timeout);
|
||||
|
||||
run = runp->next;
|
||||
}
|
|
@ -1,72 +0,0 @@
|
|||
commit 2aa0974d2573441bffd596b07bff8698b1f2f18c
|
||||
Author: Florian Weimer <fweimer@redhat.com>
|
||||
Date: Fri Oct 20 14:29:50 2023 +0200
|
||||
|
||||
elf: ldconfig should skip temporary files created by package managers
|
||||
|
||||
This avoids crashes due to partially written files, after a package
|
||||
update is interrupted.
|
||||
|
||||
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
|
||||
|
||||
Conflicts:
|
||||
elf/ldconfig.c
|
||||
(missing alloca removal downstream)
|
||||
|
||||
diff --git a/elf/ldconfig.c b/elf/ldconfig.c
|
||||
index 8c66d7e5426d8cc4..51de08f91fbaf093 100644
|
||||
--- a/elf/ldconfig.c
|
||||
+++ b/elf/ldconfig.c
|
||||
@@ -771,6 +771,31 @@ struct dlib_entry
|
||||
struct dlib_entry *next;
|
||||
};
|
||||
|
||||
+/* Skip some temporary DSO files. These files may be partially written
|
||||
+ and lead to ldconfig crashes when examined. */
|
||||
+static bool
|
||||
+skip_dso_based_on_name (const char *name, size_t len)
|
||||
+{
|
||||
+ /* Skip temporary files created by the prelink program. Files with
|
||||
+ names like these are never really DSOs we want to look at. */
|
||||
+ if (len >= sizeof (".#prelink#") - 1)
|
||||
+ {
|
||||
+ if (strcmp (name + len - sizeof (".#prelink#") + 1,
|
||||
+ ".#prelink#") == 0)
|
||||
+ return true;
|
||||
+ if (len >= sizeof (".#prelink#.XXXXXX") - 1
|
||||
+ && memcmp (name + len - sizeof (".#prelink#.XXXXXX")
|
||||
+ + 1, ".#prelink#.", sizeof (".#prelink#.") - 1) == 0)
|
||||
+ return true;
|
||||
+ }
|
||||
+ /* Skip temporary files created by RPM. */
|
||||
+ if (memchr (name, len, ';') != NULL)
|
||||
+ return true;
|
||||
+ /* Skip temporary files created by dpkg. */
|
||||
+ if (len > 4 && memcmp (name + len - 4, ".tmp", 4) == 0)
|
||||
+ return true;
|
||||
+ return false;
|
||||
+}
|
||||
|
||||
static void
|
||||
search_dir (const struct dir_entry *entry)
|
||||
@@ -849,18 +874,8 @@ search_dir (const struct dir_entry *entry)
|
||||
continue;
|
||||
|
||||
size_t len = strlen (direntry->d_name);
|
||||
- /* Skip temporary files created by the prelink program. Files with
|
||||
- names like these are never really DSOs we want to look at. */
|
||||
- if (len >= sizeof (".#prelink#") - 1)
|
||||
- {
|
||||
- if (strcmp (direntry->d_name + len - sizeof (".#prelink#") + 1,
|
||||
- ".#prelink#") == 0)
|
||||
- continue;
|
||||
- if (len >= sizeof (".#prelink#.XXXXXX") - 1
|
||||
- && memcmp (direntry->d_name + len - sizeof (".#prelink#.XXXXXX")
|
||||
- + 1, ".#prelink#.", sizeof (".#prelink#.") - 1) == 0)
|
||||
- continue;
|
||||
- }
|
||||
+ if (skip_dso_based_on_name (direntry->d_name, len))
|
||||
+ continue;
|
||||
len += strlen (entry->path) + 2;
|
||||
if (len > file_name_len)
|
||||
{
|
|
@ -1,61 +0,0 @@
|
|||
commit cfb5a97a93ea656e3b2263e42142a4032986d9ba
|
||||
Author: Florian Weimer <fweimer@redhat.com>
|
||||
Date: Mon Oct 23 12:53:16 2023 +0200
|
||||
|
||||
ldconfig: Fixes for skipping temporary files.
|
||||
|
||||
Arguments to a memchr call were swapped, causing incorrect skipping
|
||||
of files.
|
||||
|
||||
Files related to dpkg have different names: they actually end in
|
||||
.dpkg-new and .dpkg-tmp, not .tmp as I mistakenly assumed.
|
||||
|
||||
Fixes commit 2aa0974d2573441bffd59 ("elf: ldconfig should skip
|
||||
temporary files created by package managers").
|
||||
|
||||
diff --git a/elf/ldconfig.c b/elf/ldconfig.c
|
||||
index 51de08f91fbaf093..fb19dd68d41c07a4 100644
|
||||
--- a/elf/ldconfig.c
|
||||
+++ b/elf/ldconfig.c
|
||||
@@ -771,6 +771,17 @@ struct dlib_entry
|
||||
struct dlib_entry *next;
|
||||
};
|
||||
|
||||
+/* Return true if the N bytes at NAME end with with the characters in
|
||||
+ the string SUFFIX. (NAME[N + 1] does not have to be a null byte.)
|
||||
+ Expected to be called with a string literal for SUFFIX. */
|
||||
+static inline bool
|
||||
+endswithn (const char *name, size_t n, const char *suffix)
|
||||
+{
|
||||
+ return (n >= strlen (suffix)
|
||||
+ && memcmp (name + n - strlen (suffix), suffix,
|
||||
+ strlen (suffix)) == 0);
|
||||
+}
|
||||
+
|
||||
/* Skip some temporary DSO files. These files may be partially written
|
||||
and lead to ldconfig crashes when examined. */
|
||||
static bool
|
||||
@@ -780,8 +791,7 @@ skip_dso_based_on_name (const char *name, size_t len)
|
||||
names like these are never really DSOs we want to look at. */
|
||||
if (len >= sizeof (".#prelink#") - 1)
|
||||
{
|
||||
- if (strcmp (name + len - sizeof (".#prelink#") + 1,
|
||||
- ".#prelink#") == 0)
|
||||
+ if (endswithn (name, len, ".#prelink#"))
|
||||
return true;
|
||||
if (len >= sizeof (".#prelink#.XXXXXX") - 1
|
||||
&& memcmp (name + len - sizeof (".#prelink#.XXXXXX")
|
||||
@@ -789,10 +799,11 @@ skip_dso_based_on_name (const char *name, size_t len)
|
||||
return true;
|
||||
}
|
||||
/* Skip temporary files created by RPM. */
|
||||
- if (memchr (name, len, ';') != NULL)
|
||||
+ if (memchr (name, ';', len) != NULL)
|
||||
return true;
|
||||
/* Skip temporary files created by dpkg. */
|
||||
- if (len > 4 && memcmp (name + len - 4, ".tmp", 4) == 0)
|
||||
+ if (endswithn (name, len, ".dpkg-new")
|
||||
+ || endswithn (name, len, ".dpkg-tmp"))
|
||||
return true;
|
||||
return false;
|
||||
}
|
|
@ -1,259 +0,0 @@
|
|||
From 97700a34f36721b11a754cf37a1cc40695ece1fd Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Mon, 21 Jan 2019 11:23:59 -0800
|
||||
Subject: [PATCH] x86-64 memchr/wmemchr: Properly handle the length parameter
|
||||
[BZ# 24097]
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
On x32, the size_t parameter may be passed in the lower 32 bits of a
|
||||
64-bit register with the non-zero upper 32 bits. The string/memory
|
||||
functions written in assembly can only use the lower 32 bits of a
|
||||
64-bit register as length or must clear the upper 32 bits before using
|
||||
the full 64-bit register for length.
|
||||
|
||||
This pach fixes memchr/wmemchr for x32. Tested on x86-64 and x32. On
|
||||
x86-64, libc.so is the same with and withou the fix.
|
||||
|
||||
[BZ# 24097]
|
||||
CVE-2019-6488
|
||||
* sysdeps/x86_64/memchr.S: Use RDX_LP for length. Clear the
|
||||
upper 32 bits of RDX register.
|
||||
* sysdeps/x86_64/multiarch/memchr-avx2.S: Likewise.
|
||||
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memchr and
|
||||
tst-size_t-wmemchr.
|
||||
* sysdeps/x86_64/x32/test-size_t.h: New file.
|
||||
* sysdeps/x86_64/x32/tst-size_t-memchr.c: Likewise.
|
||||
* sysdeps/x86_64/x32/tst-size_t-wmemchr.c: Likewise.
|
||||
---
|
||||
sysdeps/x86_64/memchr.S | 10 ++--
|
||||
sysdeps/x86_64/multiarch/memchr-avx2.S | 8 ++-
|
||||
sysdeps/x86_64/x32/Makefile | 8 +++
|
||||
sysdeps/x86_64/x32/test-size_t.h | 35 ++++++++++++
|
||||
sysdeps/x86_64/x32/tst-size_t-memchr.c | 72 +++++++++++++++++++++++++
|
||||
sysdeps/x86_64/x32/tst-size_t-wmemchr.c | 20 +++++++
|
||||
6 files changed, 148 insertions(+), 5 deletions(-)
|
||||
create mode 100644 sysdeps/x86_64/x32/test-size_t.h
|
||||
create mode 100644 sysdeps/x86_64/x32/tst-size_t-memchr.c
|
||||
create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemchr.c
|
||||
|
||||
Conflicts:
|
||||
ChangeLog
|
||||
(removed)
|
||||
NEWS
|
||||
(removed)
|
||||
|
||||
diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
|
||||
index feef5d4f..cb320257 100644
|
||||
--- a/sysdeps/x86_64/memchr.S
|
||||
+++ b/sysdeps/x86_64/memchr.S
|
||||
@@ -34,12 +34,16 @@ ENTRY(MEMCHR)
|
||||
mov %edi, %ecx
|
||||
|
||||
#ifdef USE_AS_WMEMCHR
|
||||
- test %rdx, %rdx
|
||||
+ test %RDX_LP, %RDX_LP
|
||||
jz L(return_null)
|
||||
- shl $2, %rdx
|
||||
+ shl $2, %RDX_LP
|
||||
#else
|
||||
+# ifdef __ILP32__
|
||||
+ /* Clear the upper 32 bits. */
|
||||
+ movl %edx, %edx
|
||||
+# endif
|
||||
punpcklbw %xmm1, %xmm1
|
||||
- test %rdx, %rdx
|
||||
+ test %RDX_LP, %RDX_LP
|
||||
jz L(return_null)
|
||||
punpcklbw %xmm1, %xmm1
|
||||
#endif
|
||||
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
|
||||
index 5f5e7725..c81da19b 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
|
||||
@@ -40,16 +40,20 @@
|
||||
ENTRY (MEMCHR)
|
||||
# ifndef USE_AS_RAWMEMCHR
|
||||
/* Check for zero length. */
|
||||
- testq %rdx, %rdx
|
||||
+ test %RDX_LP, %RDX_LP
|
||||
jz L(null)
|
||||
# endif
|
||||
movl %edi, %ecx
|
||||
/* Broadcast CHAR to YMM0. */
|
||||
vmovd %esi, %xmm0
|
||||
# ifdef USE_AS_WMEMCHR
|
||||
- shl $2, %rdx
|
||||
+ shl $2, %RDX_LP
|
||||
vpbroadcastd %xmm0, %ymm0
|
||||
# else
|
||||
+# ifdef __ILP32__
|
||||
+ /* Clear the upper 32 bits. */
|
||||
+ movl %edx, %edx
|
||||
+# endif
|
||||
vpbroadcastb %xmm0, %ymm0
|
||||
# endif
|
||||
/* Check if we may cross page boundary with one vector load. */
|
||||
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
|
||||
index f2ebc24f..7d528889 100644
|
||||
--- a/sysdeps/x86_64/x32/Makefile
|
||||
+++ b/sysdeps/x86_64/x32/Makefile
|
||||
@@ -4,3 +4,11 @@ ifeq ($(subdir),math)
|
||||
# 64-bit llround. Add -fno-builtin-lround to silence the compiler.
|
||||
CFLAGS-s_llround.c += -fno-builtin-lround
|
||||
endif
|
||||
+
|
||||
+ifeq ($(subdir),string)
|
||||
+tests += tst-size_t-memchr
|
||||
+endif
|
||||
+
|
||||
+ifeq ($(subdir),wcsmbs)
|
||||
+tests += tst-size_t-wmemchr
|
||||
+endif
|
||||
diff --git a/sysdeps/x86_64/x32/test-size_t.h b/sysdeps/x86_64/x32/test-size_t.h
|
||||
new file mode 100644
|
||||
index 00000000..78a94086
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/x32/test-size_t.h
|
||||
@@ -0,0 +1,35 @@
|
||||
+/* Test string/memory functions with size_t in the lower 32 bits of
|
||||
+ 64-bit register.
|
||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <http://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#define TEST_MAIN
|
||||
+#include <string/test-string.h>
|
||||
+
|
||||
+/* On x32, parameter_t may be passed in a 64-bit register with the LEN
|
||||
+ field in the lower 32 bits. When the LEN field of 64-bit register
|
||||
+ is passed to string/memory function as the size_t parameter, only
|
||||
+ the lower 32 bits can be used. */
|
||||
+typedef struct
|
||||
+{
|
||||
+ union
|
||||
+ {
|
||||
+ size_t len;
|
||||
+ void (*fn) (void);
|
||||
+ };
|
||||
+ void *p;
|
||||
+} parameter_t;
|
||||
diff --git a/sysdeps/x86_64/x32/tst-size_t-memchr.c b/sysdeps/x86_64/x32/tst-size_t-memchr.c
|
||||
new file mode 100644
|
||||
index 00000000..29a3daf1
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/x32/tst-size_t-memchr.c
|
||||
@@ -0,0 +1,72 @@
|
||||
+/* Test memchr with size_t in the lower 32 bits of 64-bit register.
|
||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <http://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#ifndef WIDE
|
||||
+# define TEST_NAME "memchr"
|
||||
+#else
|
||||
+# define TEST_NAME "wmemchr"
|
||||
+#endif /* WIDE */
|
||||
+#include "test-size_t.h"
|
||||
+
|
||||
+#ifndef WIDE
|
||||
+# define MEMCHR memchr
|
||||
+# define CHAR char
|
||||
+# define UCHAR unsigned char
|
||||
+#else
|
||||
+# include <wchar.h>
|
||||
+# define MEMCHR wmemchr
|
||||
+# define CHAR wchar_t
|
||||
+# define UCHAR wchar_t
|
||||
+#endif /* WIDE */
|
||||
+
|
||||
+IMPL (MEMCHR, 1)
|
||||
+
|
||||
+typedef CHAR * (*proto_t) (const CHAR*, int, size_t);
|
||||
+
|
||||
+static CHAR *
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+do_memchr (parameter_t a, parameter_t b)
|
||||
+{
|
||||
+ return CALL (&b, a.p, (uintptr_t) b.p, a.len);
|
||||
+}
|
||||
+
|
||||
+static int
|
||||
+test_main (void)
|
||||
+{
|
||||
+ test_init ();
|
||||
+
|
||||
+ parameter_t src = { { page_size / sizeof (CHAR) }, buf2 };
|
||||
+ parameter_t c = { { 0 }, (void *) (uintptr_t) 0x12 };
|
||||
+
|
||||
+ int ret = 0;
|
||||
+ FOR_EACH_IMPL (impl, 0)
|
||||
+ {
|
||||
+ c.fn = impl->fn;
|
||||
+ CHAR *res = do_memchr (src, c);
|
||||
+ if (res)
|
||||
+ {
|
||||
+ error (0, 0, "Wrong result in function %s: %p != NULL",
|
||||
+ impl->name, res);
|
||||
+ ret = 1;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
|
||||
+}
|
||||
+
|
||||
+#include <support/test-driver.c>
|
||||
diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemchr.c b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c
|
||||
new file mode 100644
|
||||
index 00000000..877801d6
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c
|
||||
@@ -0,0 +1,20 @@
|
||||
+/* Test wmemchr with size_t in the lower 32 bits of 64-bit register.
|
||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <http://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#define WIDE 1
|
||||
+#include "tst-size_t-memchr.c"
|
||||
--
|
||||
GitLab
|
||||
|
|
@ -1,41 +0,0 @@
|
|||
From ddf0992cf57a93200e0c782e2a94d0733a5a0b87 Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Sun, 9 Jan 2022 16:02:21 -0600
|
||||
Subject: [PATCH] x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755]
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Fixes [BZ# 28755] for wcsncmp by redirecting length >= 2^56 to
|
||||
__wcscmp_avx2. For x86_64 this covers the entire address range so any
|
||||
length larger could not possibly be used to bound `s1` or `s2`.
|
||||
|
||||
test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass.
|
||||
|
||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/strcmp-avx2.S | 10 ++++++++++
|
||||
1 file changed, 10 insertions(+)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
||||
index 156c1949..8fb8eedc 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
||||
@@ -83,6 +83,16 @@ ENTRY (STRCMP)
|
||||
je L(char0)
|
||||
jb L(zero)
|
||||
# ifdef USE_AS_WCSCMP
|
||||
+# ifndef __ILP32__
|
||||
+ movq %rdx, %rcx
|
||||
+ /* Check if length could overflow when multiplied by
|
||||
+ sizeof(wchar_t). Checking top 8 bits will cover all potential
|
||||
+ overflow cases as well as redirect cases where its impossible to
|
||||
+ length to bound a valid memory region. In these cases just use
|
||||
+ 'wcscmp'. */
|
||||
+ shrq $56, %rcx
|
||||
+ jnz __wcscmp_avx2
|
||||
+# endif
|
||||
/* Convert units: from wide to byte char. */
|
||||
shl $2, %RDX_LP
|
||||
# endif
|
||||
--
|
||||
GitLab
|
||||
|
|
@ -1,257 +0,0 @@
|
|||
From 244b415d386487521882debb845a040a4758cb18 Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Fri, 25 Mar 2022 17:13:33 -0500
|
||||
Subject: [PATCH] x86: Small improvements for wcslen
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Just a few QOL changes.
|
||||
1. Prefer `add` > `lea` as it has high execution units it can run
|
||||
on.
|
||||
2. Don't break macro-fusion between `test` and `jcc`
|
||||
3. Reduce code size by removing gratuitous padding bytes (-90
|
||||
bytes).
|
||||
|
||||
geometric_mean(N=20) of all benchmarks New / Original: 0.959
|
||||
|
||||
All string/memory tests pass.
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/wcslen.S | 86 ++++++++++++++++++++---------------------
|
||||
1 file changed, 41 insertions(+), 45 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S
|
||||
index 9f5f7232..254bb030 100644
|
||||
--- a/sysdeps/x86_64/wcslen.S
|
||||
+++ b/sysdeps/x86_64/wcslen.S
|
||||
@@ -41,82 +41,82 @@ ENTRY (__wcslen)
|
||||
pxor %xmm0, %xmm0
|
||||
|
||||
lea 32(%rdi), %rax
|
||||
- lea 16(%rdi), %rcx
|
||||
+ addq $16, %rdi
|
||||
and $-16, %rax
|
||||
|
||||
pcmpeqd (%rax), %xmm0
|
||||
pmovmskb %xmm0, %edx
|
||||
pxor %xmm1, %xmm1
|
||||
+ addq $16, %rax
|
||||
test %edx, %edx
|
||||
- lea 16(%rax), %rax
|
||||
jnz L(exit)
|
||||
|
||||
pcmpeqd (%rax), %xmm1
|
||||
pmovmskb %xmm1, %edx
|
||||
pxor %xmm2, %xmm2
|
||||
+ addq $16, %rax
|
||||
test %edx, %edx
|
||||
- lea 16(%rax), %rax
|
||||
jnz L(exit)
|
||||
|
||||
pcmpeqd (%rax), %xmm2
|
||||
pmovmskb %xmm2, %edx
|
||||
pxor %xmm3, %xmm3
|
||||
+ addq $16, %rax
|
||||
test %edx, %edx
|
||||
- lea 16(%rax), %rax
|
||||
jnz L(exit)
|
||||
|
||||
pcmpeqd (%rax), %xmm3
|
||||
pmovmskb %xmm3, %edx
|
||||
+ addq $16, %rax
|
||||
test %edx, %edx
|
||||
- lea 16(%rax), %rax
|
||||
jnz L(exit)
|
||||
|
||||
pcmpeqd (%rax), %xmm0
|
||||
pmovmskb %xmm0, %edx
|
||||
+ addq $16, %rax
|
||||
test %edx, %edx
|
||||
- lea 16(%rax), %rax
|
||||
jnz L(exit)
|
||||
|
||||
pcmpeqd (%rax), %xmm1
|
||||
pmovmskb %xmm1, %edx
|
||||
+ addq $16, %rax
|
||||
test %edx, %edx
|
||||
- lea 16(%rax), %rax
|
||||
jnz L(exit)
|
||||
|
||||
pcmpeqd (%rax), %xmm2
|
||||
pmovmskb %xmm2, %edx
|
||||
+ addq $16, %rax
|
||||
test %edx, %edx
|
||||
- lea 16(%rax), %rax
|
||||
jnz L(exit)
|
||||
|
||||
pcmpeqd (%rax), %xmm3
|
||||
pmovmskb %xmm3, %edx
|
||||
+ addq $16, %rax
|
||||
test %edx, %edx
|
||||
- lea 16(%rax), %rax
|
||||
jnz L(exit)
|
||||
|
||||
pcmpeqd (%rax), %xmm0
|
||||
pmovmskb %xmm0, %edx
|
||||
+ addq $16, %rax
|
||||
test %edx, %edx
|
||||
- lea 16(%rax), %rax
|
||||
jnz L(exit)
|
||||
|
||||
pcmpeqd (%rax), %xmm1
|
||||
pmovmskb %xmm1, %edx
|
||||
+ addq $16, %rax
|
||||
test %edx, %edx
|
||||
- lea 16(%rax), %rax
|
||||
jnz L(exit)
|
||||
|
||||
pcmpeqd (%rax), %xmm2
|
||||
pmovmskb %xmm2, %edx
|
||||
+ addq $16, %rax
|
||||
test %edx, %edx
|
||||
- lea 16(%rax), %rax
|
||||
jnz L(exit)
|
||||
|
||||
pcmpeqd (%rax), %xmm3
|
||||
pmovmskb %xmm3, %edx
|
||||
+ addq $16, %rax
|
||||
test %edx, %edx
|
||||
- lea 16(%rax), %rax
|
||||
jnz L(exit)
|
||||
|
||||
and $-0x40, %rax
|
||||
@@ -133,104 +133,100 @@ L(aligned_64_loop):
|
||||
pminub %xmm0, %xmm2
|
||||
pcmpeqd %xmm3, %xmm2
|
||||
pmovmskb %xmm2, %edx
|
||||
+ addq $64, %rax
|
||||
test %edx, %edx
|
||||
- lea 64(%rax), %rax
|
||||
jz L(aligned_64_loop)
|
||||
|
||||
pcmpeqd -64(%rax), %xmm3
|
||||
pmovmskb %xmm3, %edx
|
||||
+ addq $48, %rdi
|
||||
test %edx, %edx
|
||||
- lea 48(%rcx), %rcx
|
||||
jnz L(exit)
|
||||
|
||||
pcmpeqd %xmm1, %xmm3
|
||||
pmovmskb %xmm3, %edx
|
||||
+ addq $-16, %rdi
|
||||
test %edx, %edx
|
||||
- lea -16(%rcx), %rcx
|
||||
jnz L(exit)
|
||||
|
||||
pcmpeqd -32(%rax), %xmm3
|
||||
pmovmskb %xmm3, %edx
|
||||
+ addq $-16, %rdi
|
||||
test %edx, %edx
|
||||
- lea -16(%rcx), %rcx
|
||||
jnz L(exit)
|
||||
|
||||
pcmpeqd %xmm6, %xmm3
|
||||
pmovmskb %xmm3, %edx
|
||||
+ addq $-16, %rdi
|
||||
test %edx, %edx
|
||||
- lea -16(%rcx), %rcx
|
||||
- jnz L(exit)
|
||||
-
|
||||
- jmp L(aligned_64_loop)
|
||||
+ jz L(aligned_64_loop)
|
||||
|
||||
.p2align 4
|
||||
L(exit):
|
||||
- sub %rcx, %rax
|
||||
+ sub %rdi, %rax
|
||||
shr $2, %rax
|
||||
test %dl, %dl
|
||||
jz L(exit_high)
|
||||
|
||||
- mov %dl, %cl
|
||||
- and $15, %cl
|
||||
+ andl $15, %edx
|
||||
jz L(exit_1)
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
+ /* No align here. Naturally aligned % 16 == 1. */
|
||||
L(exit_high):
|
||||
- mov %dh, %ch
|
||||
- and $15, %ch
|
||||
+ andl $(15 << 8), %edx
|
||||
jz L(exit_3)
|
||||
add $2, %rax
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
+ .p2align 3
|
||||
L(exit_1):
|
||||
add $1, %rax
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
+ .p2align 3
|
||||
L(exit_3):
|
||||
add $3, %rax
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
+ .p2align 3
|
||||
L(exit_tail0):
|
||||
- xor %rax, %rax
|
||||
+ xorl %eax, %eax
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
+ .p2align 3
|
||||
L(exit_tail1):
|
||||
- mov $1, %rax
|
||||
+ movl $1, %eax
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
+ .p2align 3
|
||||
L(exit_tail2):
|
||||
- mov $2, %rax
|
||||
+ movl $2, %eax
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
+ .p2align 3
|
||||
L(exit_tail3):
|
||||
- mov $3, %rax
|
||||
+ movl $3, %eax
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
+ .p2align 3
|
||||
L(exit_tail4):
|
||||
- mov $4, %rax
|
||||
+ movl $4, %eax
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
+ .p2align 3
|
||||
L(exit_tail5):
|
||||
- mov $5, %rax
|
||||
+ movl $5, %eax
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
+ .p2align 3
|
||||
L(exit_tail6):
|
||||
- mov $6, %rax
|
||||
+ movl $6, %eax
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
+ .p2align 3
|
||||
L(exit_tail7):
|
||||
- mov $7, %rax
|
||||
+ movl $7, %eax
|
||||
ret
|
||||
|
||||
END (__wcslen)
|
||||
--
|
||||
GitLab
|
||||
|
|
@ -1,964 +0,0 @@
|
|||
From 7cbc03d03091d5664060924789afe46d30a5477e Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Fri, 15 Apr 2022 12:28:00 -0500
|
||||
Subject: [PATCH] x86: Remove memcmp-sse4.S
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Code didn't actually use any sse4 instructions since `ptest` was
|
||||
removed in:
|
||||
|
||||
commit 2f9062d7171850451e6044ef78d91ff8c017b9c0
|
||||
Author: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Wed Nov 10 16:18:56 2021 -0600
|
||||
|
||||
x86: Shrink memcmp-sse4.S code size
|
||||
|
||||
The new memcmp-sse2 implementation is also faster.
|
||||
|
||||
geometric_mean(N=20) of page cross cases SSE2 / SSE4: 0.905
|
||||
|
||||
Note there are two regressions preferring SSE2 for Size = 1 and Size =
|
||||
65.
|
||||
|
||||
Size = 1:
|
||||
size, align0, align1, ret, New Time/Old Time
|
||||
1, 1, 1, 0, 1.2
|
||||
1, 1, 1, 1, 1.197
|
||||
1, 1, 1, -1, 1.2
|
||||
|
||||
This is intentional. Size == 1 is significantly less hot based on
|
||||
profiles of GCC11 and Python3 than sizes [4, 8] (which is made
|
||||
hotter).
|
||||
|
||||
Python3 Size = 1 -> 13.64%
|
||||
Python3 Size = [4, 8] -> 60.92%
|
||||
|
||||
GCC11 Size = 1 -> 1.29%
|
||||
GCC11 Size = [4, 8] -> 33.86%
|
||||
|
||||
size, align0, align1, ret, New Time/Old Time
|
||||
4, 4, 4, 0, 0.622
|
||||
4, 4, 4, 1, 0.797
|
||||
4, 4, 4, -1, 0.805
|
||||
5, 5, 5, 0, 0.623
|
||||
5, 5, 5, 1, 0.777
|
||||
5, 5, 5, -1, 0.802
|
||||
6, 6, 6, 0, 0.625
|
||||
6, 6, 6, 1, 0.813
|
||||
6, 6, 6, -1, 0.788
|
||||
7, 7, 7, 0, 0.625
|
||||
7, 7, 7, 1, 0.799
|
||||
7, 7, 7, -1, 0.795
|
||||
8, 8, 8, 0, 0.625
|
||||
8, 8, 8, 1, 0.848
|
||||
8, 8, 8, -1, 0.914
|
||||
9, 9, 9, 0, 0.625
|
||||
|
||||
Size = 65:
|
||||
size, align0, align1, ret, New Time/Old Time
|
||||
65, 0, 0, 0, 1.103
|
||||
65, 0, 0, 1, 1.216
|
||||
65, 0, 0, -1, 1.227
|
||||
65, 65, 0, 0, 1.091
|
||||
65, 0, 65, 1, 1.19
|
||||
65, 65, 65, -1, 1.215
|
||||
|
||||
This is because A) the checks in range [65, 96] are now unrolled 2x
|
||||
and B) because smaller values <= 16 are now given a hotter path. By
|
||||
contrast the SSE4 version has a branch for Size = 80. The unrolled
|
||||
version has get better performance for returns which need both
|
||||
comparisons.
|
||||
|
||||
size, align0, align1, ret, New Time/Old Time
|
||||
128, 4, 8, 0, 0.858
|
||||
128, 4, 8, 1, 0.879
|
||||
128, 4, 8, -1, 0.888
|
||||
|
||||
As well, out of microbenchmark environments that are not full
|
||||
predictable the branch will have a real-cost.
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/Makefile | 2 -
|
||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 -
|
||||
sysdeps/x86_64/multiarch/ifunc-memcmp.h | 4 -
|
||||
sysdeps/x86_64/multiarch/memcmp-sse4.S | 804 ---------------------
|
||||
4 files changed, 814 deletions(-)
|
||||
delete mode 100644 sysdeps/x86_64/multiarch/memcmp-sse4.S
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
|
||||
index bca82e38..b503e4b8 100644
|
||||
--- a/sysdeps/x86_64/multiarch/Makefile
|
||||
+++ b/sysdeps/x86_64/multiarch/Makefile
|
||||
@@ -11,7 +11,6 @@ sysdep_routines += \
|
||||
memcmp-avx2-movbe-rtm \
|
||||
memcmp-evex-movbe \
|
||||
memcmp-sse2 \
|
||||
- memcmp-sse4 \
|
||||
memcmp-ssse3 \
|
||||
memcpy-ssse3 \
|
||||
memcpy-ssse3-back \
|
||||
@@ -174,7 +173,6 @@ sysdep_routines += \
|
||||
wmemcmp-avx2-movbe-rtm \
|
||||
wmemcmp-c \
|
||||
wmemcmp-evex-movbe \
|
||||
- wmemcmp-sse4 \
|
||||
wmemcmp-ssse3 \
|
||||
# sysdep_routines
|
||||
endif
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
index 14314367..450a2917 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
@@ -78,8 +78,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
&& CPU_FEATURE_USABLE (BMI2)
|
||||
&& CPU_FEATURE_USABLE (MOVBE)),
|
||||
__memcmp_evex_movbe)
|
||||
- IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
|
||||
- __memcmp_sse4_1)
|
||||
IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3),
|
||||
__memcmp_ssse3)
|
||||
IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
|
||||
@@ -824,8 +822,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
&& CPU_FEATURE_USABLE (BMI2)
|
||||
&& CPU_FEATURE_USABLE (MOVBE)),
|
||||
__wmemcmp_evex_movbe)
|
||||
- IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
|
||||
- __wmemcmp_sse4_1)
|
||||
IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3),
|
||||
__wmemcmp_ssse3)
|
||||
IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
|
||||
index 690dffe8..0bc47a7f 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
|
||||
@@ -21,7 +21,6 @@
|
||||
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
|
||||
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
|
||||
@@ -47,9 +46,6 @@ IFUNC_SELECTOR (void)
|
||||
return OPTIMIZE (avx2_movbe);
|
||||
}
|
||||
|
||||
- if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
|
||||
- return OPTIMIZE (sse4_1);
|
||||
-
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
|
||||
return OPTIMIZE (ssse3);
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
|
||||
deleted file mode 100644
|
||||
index 50060006..00000000
|
||||
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
|
||||
+++ /dev/null
|
||||
@@ -1,804 +0,0 @@
|
||||
-/* memcmp with SSE4.1, wmemcmp with SSE4.1
|
||||
- Copyright (C) 2010-2018 Free Software Foundation, Inc.
|
||||
- Contributed by Intel Corporation.
|
||||
- This file is part of the GNU C Library.
|
||||
-
|
||||
- The GNU C Library is free software; you can redistribute it and/or
|
||||
- modify it under the terms of the GNU Lesser General Public
|
||||
- License as published by the Free Software Foundation; either
|
||||
- version 2.1 of the License, or (at your option) any later version.
|
||||
-
|
||||
- The GNU C Library is distributed in the hope that it will be useful,
|
||||
- but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
- Lesser General Public License for more details.
|
||||
-
|
||||
- You should have received a copy of the GNU Lesser General Public
|
||||
- License along with the GNU C Library; if not, see
|
||||
- <http://www.gnu.org/licenses/>. */
|
||||
-
|
||||
-#if IS_IN (libc)
|
||||
-
|
||||
-# include <sysdep.h>
|
||||
-
|
||||
-# ifndef MEMCMP
|
||||
-# define MEMCMP __memcmp_sse4_1
|
||||
-# endif
|
||||
-
|
||||
-#ifdef USE_AS_WMEMCMP
|
||||
-# define CMPEQ pcmpeqd
|
||||
-# define CHAR_SIZE 4
|
||||
-#else
|
||||
-# define CMPEQ pcmpeqb
|
||||
-# define CHAR_SIZE 1
|
||||
-#endif
|
||||
-
|
||||
-
|
||||
-/* Warning!
|
||||
- wmemcmp has to use SIGNED comparison for elements.
|
||||
- memcmp has to use UNSIGNED comparison for elemnts.
|
||||
-*/
|
||||
-
|
||||
- .section .text.sse4.1,"ax",@progbits
|
||||
-ENTRY (MEMCMP)
|
||||
-# ifdef USE_AS_WMEMCMP
|
||||
- shl $2, %RDX_LP
|
||||
-# elif defined __ILP32__
|
||||
- /* Clear the upper 32 bits. */
|
||||
- mov %edx, %edx
|
||||
-# endif
|
||||
- cmp $79, %RDX_LP
|
||||
- ja L(79bytesormore)
|
||||
-
|
||||
- cmp $CHAR_SIZE, %RDX_LP
|
||||
- jbe L(firstbyte)
|
||||
-
|
||||
- /* N in (CHAR_SIZE, 79) bytes. */
|
||||
- cmpl $32, %edx
|
||||
- ja L(more_32_bytes)
|
||||
-
|
||||
- cmpl $16, %edx
|
||||
- jae L(16_to_32_bytes)
|
||||
-
|
||||
-# ifndef USE_AS_WMEMCMP
|
||||
- cmpl $8, %edx
|
||||
- jae L(8_to_16_bytes)
|
||||
-
|
||||
- cmpl $4, %edx
|
||||
- jb L(2_to_3_bytes)
|
||||
-
|
||||
- movl (%rdi), %eax
|
||||
- movl (%rsi), %ecx
|
||||
-
|
||||
- bswap %eax
|
||||
- bswap %ecx
|
||||
-
|
||||
- shlq $32, %rax
|
||||
- shlq $32, %rcx
|
||||
-
|
||||
- movl -4(%rdi, %rdx), %edi
|
||||
- movl -4(%rsi, %rdx), %esi
|
||||
-
|
||||
- bswap %edi
|
||||
- bswap %esi
|
||||
-
|
||||
- orq %rdi, %rax
|
||||
- orq %rsi, %rcx
|
||||
- subq %rcx, %rax
|
||||
- cmovne %edx, %eax
|
||||
- sbbl %ecx, %ecx
|
||||
- orl %ecx, %eax
|
||||
- ret
|
||||
-
|
||||
- .p2align 4,, 8
|
||||
-L(2_to_3_bytes):
|
||||
- movzwl (%rdi), %eax
|
||||
- movzwl (%rsi), %ecx
|
||||
- shll $8, %eax
|
||||
- shll $8, %ecx
|
||||
- bswap %eax
|
||||
- bswap %ecx
|
||||
- movzbl -1(%rdi, %rdx), %edi
|
||||
- movzbl -1(%rsi, %rdx), %esi
|
||||
- orl %edi, %eax
|
||||
- orl %esi, %ecx
|
||||
- subl %ecx, %eax
|
||||
- ret
|
||||
-
|
||||
- .p2align 4,, 8
|
||||
-L(8_to_16_bytes):
|
||||
- movq (%rdi), %rax
|
||||
- movq (%rsi), %rcx
|
||||
-
|
||||
- bswap %rax
|
||||
- bswap %rcx
|
||||
-
|
||||
- subq %rcx, %rax
|
||||
- jne L(8_to_16_bytes_done)
|
||||
-
|
||||
- movq -8(%rdi, %rdx), %rax
|
||||
- movq -8(%rsi, %rdx), %rcx
|
||||
-
|
||||
- bswap %rax
|
||||
- bswap %rcx
|
||||
-
|
||||
- subq %rcx, %rax
|
||||
-
|
||||
-L(8_to_16_bytes_done):
|
||||
- cmovne %edx, %eax
|
||||
- sbbl %ecx, %ecx
|
||||
- orl %ecx, %eax
|
||||
- ret
|
||||
-# else
|
||||
- xorl %eax, %eax
|
||||
- movl (%rdi), %ecx
|
||||
- cmpl (%rsi), %ecx
|
||||
- jne L(8_to_16_bytes_done)
|
||||
- movl 4(%rdi), %ecx
|
||||
- cmpl 4(%rsi), %ecx
|
||||
- jne L(8_to_16_bytes_done)
|
||||
- movl -4(%rdi, %rdx), %ecx
|
||||
- cmpl -4(%rsi, %rdx), %ecx
|
||||
- jne L(8_to_16_bytes_done)
|
||||
- ret
|
||||
-# endif
|
||||
-
|
||||
- .p2align 4,, 3
|
||||
-L(ret_zero):
|
||||
- xorl %eax, %eax
|
||||
-L(zero):
|
||||
- ret
|
||||
-
|
||||
- .p2align 4,, 8
|
||||
-L(firstbyte):
|
||||
- jb L(ret_zero)
|
||||
-# ifdef USE_AS_WMEMCMP
|
||||
- xorl %eax, %eax
|
||||
- movl (%rdi), %ecx
|
||||
- cmpl (%rsi), %ecx
|
||||
- je L(zero)
|
||||
-L(8_to_16_bytes_done):
|
||||
- setg %al
|
||||
- leal -1(%rax, %rax), %eax
|
||||
-# else
|
||||
- movzbl (%rdi), %eax
|
||||
- movzbl (%rsi), %ecx
|
||||
- sub %ecx, %eax
|
||||
-# endif
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(vec_return_begin_48):
|
||||
- addq $16, %rdi
|
||||
- addq $16, %rsi
|
||||
-L(vec_return_begin_32):
|
||||
- bsfl %eax, %eax
|
||||
-# ifdef USE_AS_WMEMCMP
|
||||
- movl 32(%rdi, %rax), %ecx
|
||||
- xorl %edx, %edx
|
||||
- cmpl 32(%rsi, %rax), %ecx
|
||||
- setg %dl
|
||||
- leal -1(%rdx, %rdx), %eax
|
||||
-# else
|
||||
- movzbl 32(%rsi, %rax), %ecx
|
||||
- movzbl 32(%rdi, %rax), %eax
|
||||
- subl %ecx, %eax
|
||||
-# endif
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(vec_return_begin_16):
|
||||
- addq $16, %rdi
|
||||
- addq $16, %rsi
|
||||
-L(vec_return_begin):
|
||||
- bsfl %eax, %eax
|
||||
-# ifdef USE_AS_WMEMCMP
|
||||
- movl (%rdi, %rax), %ecx
|
||||
- xorl %edx, %edx
|
||||
- cmpl (%rsi, %rax), %ecx
|
||||
- setg %dl
|
||||
- leal -1(%rdx, %rdx), %eax
|
||||
-# else
|
||||
- movzbl (%rsi, %rax), %ecx
|
||||
- movzbl (%rdi, %rax), %eax
|
||||
- subl %ecx, %eax
|
||||
-# endif
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(vec_return_end_16):
|
||||
- subl $16, %edx
|
||||
-L(vec_return_end):
|
||||
- bsfl %eax, %eax
|
||||
- addl %edx, %eax
|
||||
-# ifdef USE_AS_WMEMCMP
|
||||
- movl -16(%rdi, %rax), %ecx
|
||||
- xorl %edx, %edx
|
||||
- cmpl -16(%rsi, %rax), %ecx
|
||||
- setg %dl
|
||||
- leal -1(%rdx, %rdx), %eax
|
||||
-# else
|
||||
- movzbl -16(%rsi, %rax), %ecx
|
||||
- movzbl -16(%rdi, %rax), %eax
|
||||
- subl %ecx, %eax
|
||||
-# endif
|
||||
- ret
|
||||
-
|
||||
- .p2align 4,, 8
|
||||
-L(more_32_bytes):
|
||||
- movdqu (%rdi), %xmm0
|
||||
- movdqu (%rsi), %xmm1
|
||||
- CMPEQ %xmm0, %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin)
|
||||
-
|
||||
- movdqu 16(%rdi), %xmm0
|
||||
- movdqu 16(%rsi), %xmm1
|
||||
- CMPEQ %xmm0, %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_16)
|
||||
-
|
||||
- cmpl $64, %edx
|
||||
- jbe L(32_to_64_bytes)
|
||||
- movdqu 32(%rdi), %xmm0
|
||||
- movdqu 32(%rsi), %xmm1
|
||||
- CMPEQ %xmm0, %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_32)
|
||||
-
|
||||
- .p2align 4,, 6
|
||||
-L(32_to_64_bytes):
|
||||
- movdqu -32(%rdi, %rdx), %xmm0
|
||||
- movdqu -32(%rsi, %rdx), %xmm1
|
||||
- CMPEQ %xmm0, %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_end_16)
|
||||
-
|
||||
- movdqu -16(%rdi, %rdx), %xmm0
|
||||
- movdqu -16(%rsi, %rdx), %xmm1
|
||||
- CMPEQ %xmm0, %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_end)
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(16_to_32_bytes):
|
||||
- movdqu (%rdi), %xmm0
|
||||
- movdqu (%rsi), %xmm1
|
||||
- CMPEQ %xmm0, %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin)
|
||||
-
|
||||
- movdqu -16(%rdi, %rdx), %xmm0
|
||||
- movdqu -16(%rsi, %rdx), %xmm1
|
||||
- CMPEQ %xmm0, %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_end)
|
||||
- ret
|
||||
-
|
||||
-
|
||||
- .p2align 4
|
||||
-L(79bytesormore):
|
||||
- movdqu (%rdi), %xmm0
|
||||
- movdqu (%rsi), %xmm1
|
||||
- CMPEQ %xmm0, %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin)
|
||||
-
|
||||
-
|
||||
- mov %rsi, %rcx
|
||||
- and $-16, %rsi
|
||||
- add $16, %rsi
|
||||
- sub %rsi, %rcx
|
||||
-
|
||||
- sub %rcx, %rdi
|
||||
- add %rcx, %rdx
|
||||
- test $0xf, %rdi
|
||||
- jz L(2aligned)
|
||||
-
|
||||
- cmp $128, %rdx
|
||||
- ja L(128bytesormore)
|
||||
-
|
||||
- .p2align 4,, 6
|
||||
-L(less128bytes):
|
||||
- movdqu (%rdi), %xmm1
|
||||
- CMPEQ (%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin)
|
||||
-
|
||||
- movdqu 16(%rdi), %xmm1
|
||||
- CMPEQ 16(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_16)
|
||||
-
|
||||
- movdqu 32(%rdi), %xmm1
|
||||
- CMPEQ 32(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_32)
|
||||
-
|
||||
- movdqu 48(%rdi), %xmm1
|
||||
- CMPEQ 48(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_48)
|
||||
-
|
||||
- cmp $96, %rdx
|
||||
- jb L(32_to_64_bytes)
|
||||
-
|
||||
- addq $64, %rdi
|
||||
- addq $64, %rsi
|
||||
- subq $64, %rdx
|
||||
-
|
||||
- .p2align 4,, 6
|
||||
-L(last_64_bytes):
|
||||
- movdqu (%rdi), %xmm1
|
||||
- CMPEQ (%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin)
|
||||
-
|
||||
- movdqu 16(%rdi), %xmm1
|
||||
- CMPEQ 16(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_16)
|
||||
-
|
||||
- movdqu -32(%rdi, %rdx), %xmm0
|
||||
- movdqu -32(%rsi, %rdx), %xmm1
|
||||
- CMPEQ %xmm0, %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_end_16)
|
||||
-
|
||||
- movdqu -16(%rdi, %rdx), %xmm0
|
||||
- movdqu -16(%rsi, %rdx), %xmm1
|
||||
- CMPEQ %xmm0, %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_end)
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(128bytesormore):
|
||||
- cmp $256, %rdx
|
||||
- ja L(unaligned_loop)
|
||||
-L(less256bytes):
|
||||
- movdqu (%rdi), %xmm1
|
||||
- CMPEQ (%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin)
|
||||
-
|
||||
- movdqu 16(%rdi), %xmm1
|
||||
- CMPEQ 16(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_16)
|
||||
-
|
||||
- movdqu 32(%rdi), %xmm1
|
||||
- CMPEQ 32(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_32)
|
||||
-
|
||||
- movdqu 48(%rdi), %xmm1
|
||||
- CMPEQ 48(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_48)
|
||||
-
|
||||
- addq $64, %rdi
|
||||
- addq $64, %rsi
|
||||
-
|
||||
- movdqu (%rdi), %xmm1
|
||||
- CMPEQ (%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin)
|
||||
-
|
||||
- movdqu 16(%rdi), %xmm1
|
||||
- CMPEQ 16(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_16)
|
||||
-
|
||||
- movdqu 32(%rdi), %xmm1
|
||||
- CMPEQ 32(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_32)
|
||||
-
|
||||
- movdqu 48(%rdi), %xmm1
|
||||
- CMPEQ 48(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_48)
|
||||
-
|
||||
- addq $-128, %rdx
|
||||
- subq $-64, %rsi
|
||||
- subq $-64, %rdi
|
||||
-
|
||||
- cmp $64, %rdx
|
||||
- ja L(less128bytes)
|
||||
-
|
||||
- cmp $32, %rdx
|
||||
- ja L(last_64_bytes)
|
||||
-
|
||||
- movdqu -32(%rdi, %rdx), %xmm0
|
||||
- movdqu -32(%rsi, %rdx), %xmm1
|
||||
- CMPEQ %xmm0, %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_end_16)
|
||||
-
|
||||
- movdqu -16(%rdi, %rdx), %xmm0
|
||||
- movdqu -16(%rsi, %rdx), %xmm1
|
||||
- CMPEQ %xmm0, %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_end)
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(unaligned_loop):
|
||||
-# ifdef DATA_CACHE_SIZE_HALF
|
||||
- mov $DATA_CACHE_SIZE_HALF, %R8_LP
|
||||
-# else
|
||||
- mov __x86_data_cache_size_half(%rip), %R8_LP
|
||||
-# endif
|
||||
- movq %r8, %r9
|
||||
- addq %r8, %r8
|
||||
- addq %r9, %r8
|
||||
- cmpq %r8, %rdx
|
||||
- ja L(L2_L3_cache_unaligned)
|
||||
- sub $64, %rdx
|
||||
- .p2align 4
|
||||
-L(64bytesormore_loop):
|
||||
- movdqu (%rdi), %xmm0
|
||||
- movdqu 16(%rdi), %xmm1
|
||||
- movdqu 32(%rdi), %xmm2
|
||||
- movdqu 48(%rdi), %xmm3
|
||||
-
|
||||
- CMPEQ (%rsi), %xmm0
|
||||
- CMPEQ 16(%rsi), %xmm1
|
||||
- CMPEQ 32(%rsi), %xmm2
|
||||
- CMPEQ 48(%rsi), %xmm3
|
||||
-
|
||||
- pand %xmm0, %xmm1
|
||||
- pand %xmm2, %xmm3
|
||||
- pand %xmm1, %xmm3
|
||||
-
|
||||
- pmovmskb %xmm3, %eax
|
||||
- incw %ax
|
||||
- jnz L(64bytesormore_loop_end)
|
||||
-
|
||||
- add $64, %rsi
|
||||
- add $64, %rdi
|
||||
- sub $64, %rdx
|
||||
- ja L(64bytesormore_loop)
|
||||
-
|
||||
- .p2align 4,, 6
|
||||
-L(loop_tail):
|
||||
- addq %rdx, %rdi
|
||||
- movdqu (%rdi), %xmm0
|
||||
- movdqu 16(%rdi), %xmm1
|
||||
- movdqu 32(%rdi), %xmm2
|
||||
- movdqu 48(%rdi), %xmm3
|
||||
-
|
||||
- addq %rdx, %rsi
|
||||
- movdqu (%rsi), %xmm4
|
||||
- movdqu 16(%rsi), %xmm5
|
||||
- movdqu 32(%rsi), %xmm6
|
||||
- movdqu 48(%rsi), %xmm7
|
||||
-
|
||||
- CMPEQ %xmm4, %xmm0
|
||||
- CMPEQ %xmm5, %xmm1
|
||||
- CMPEQ %xmm6, %xmm2
|
||||
- CMPEQ %xmm7, %xmm3
|
||||
-
|
||||
- pand %xmm0, %xmm1
|
||||
- pand %xmm2, %xmm3
|
||||
- pand %xmm1, %xmm3
|
||||
-
|
||||
- pmovmskb %xmm3, %eax
|
||||
- incw %ax
|
||||
- jnz L(64bytesormore_loop_end)
|
||||
- ret
|
||||
-
|
||||
-L(L2_L3_cache_unaligned):
|
||||
- subq $64, %rdx
|
||||
- .p2align 4
|
||||
-L(L2_L3_unaligned_128bytes_loop):
|
||||
- prefetchnta 0x1c0(%rdi)
|
||||
- prefetchnta 0x1c0(%rsi)
|
||||
-
|
||||
- movdqu (%rdi), %xmm0
|
||||
- movdqu 16(%rdi), %xmm1
|
||||
- movdqu 32(%rdi), %xmm2
|
||||
- movdqu 48(%rdi), %xmm3
|
||||
-
|
||||
- CMPEQ (%rsi), %xmm0
|
||||
- CMPEQ 16(%rsi), %xmm1
|
||||
- CMPEQ 32(%rsi), %xmm2
|
||||
- CMPEQ 48(%rsi), %xmm3
|
||||
-
|
||||
- pand %xmm0, %xmm1
|
||||
- pand %xmm2, %xmm3
|
||||
- pand %xmm1, %xmm3
|
||||
-
|
||||
- pmovmskb %xmm3, %eax
|
||||
- incw %ax
|
||||
- jnz L(64bytesormore_loop_end)
|
||||
-
|
||||
- add $64, %rsi
|
||||
- add $64, %rdi
|
||||
- sub $64, %rdx
|
||||
- ja L(L2_L3_unaligned_128bytes_loop)
|
||||
- jmp L(loop_tail)
|
||||
-
|
||||
-
|
||||
- /* This case is for machines which are sensitive for unaligned
|
||||
- * instructions. */
|
||||
- .p2align 4
|
||||
-L(2aligned):
|
||||
- cmp $128, %rdx
|
||||
- ja L(128bytesormorein2aligned)
|
||||
-L(less128bytesin2aligned):
|
||||
- movdqa (%rdi), %xmm1
|
||||
- CMPEQ (%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin)
|
||||
-
|
||||
- movdqa 16(%rdi), %xmm1
|
||||
- CMPEQ 16(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_16)
|
||||
-
|
||||
- movdqa 32(%rdi), %xmm1
|
||||
- CMPEQ 32(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_32)
|
||||
-
|
||||
- movdqa 48(%rdi), %xmm1
|
||||
- CMPEQ 48(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_48)
|
||||
-
|
||||
- cmp $96, %rdx
|
||||
- jb L(32_to_64_bytes)
|
||||
-
|
||||
- addq $64, %rdi
|
||||
- addq $64, %rsi
|
||||
- subq $64, %rdx
|
||||
-
|
||||
- .p2align 4,, 6
|
||||
-L(aligned_last_64_bytes):
|
||||
- movdqa (%rdi), %xmm1
|
||||
- CMPEQ (%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin)
|
||||
-
|
||||
- movdqa 16(%rdi), %xmm1
|
||||
- CMPEQ 16(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_16)
|
||||
-
|
||||
- movdqu -32(%rdi, %rdx), %xmm0
|
||||
- movdqu -32(%rsi, %rdx), %xmm1
|
||||
- CMPEQ %xmm0, %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_end_16)
|
||||
-
|
||||
- movdqu -16(%rdi, %rdx), %xmm0
|
||||
- movdqu -16(%rsi, %rdx), %xmm1
|
||||
- CMPEQ %xmm0, %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_end)
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(128bytesormorein2aligned):
|
||||
- cmp $256, %rdx
|
||||
- ja L(aligned_loop)
|
||||
-L(less256bytesin2alinged):
|
||||
- movdqa (%rdi), %xmm1
|
||||
- CMPEQ (%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin)
|
||||
-
|
||||
- movdqa 16(%rdi), %xmm1
|
||||
- CMPEQ 16(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_16)
|
||||
-
|
||||
- movdqa 32(%rdi), %xmm1
|
||||
- CMPEQ 32(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_32)
|
||||
-
|
||||
- movdqa 48(%rdi), %xmm1
|
||||
- CMPEQ 48(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_48)
|
||||
-
|
||||
- addq $64, %rdi
|
||||
- addq $64, %rsi
|
||||
-
|
||||
- movdqa (%rdi), %xmm1
|
||||
- CMPEQ (%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin)
|
||||
-
|
||||
- movdqa 16(%rdi), %xmm1
|
||||
- CMPEQ 16(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_16)
|
||||
-
|
||||
- movdqa 32(%rdi), %xmm1
|
||||
- CMPEQ 32(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_32)
|
||||
-
|
||||
- movdqa 48(%rdi), %xmm1
|
||||
- CMPEQ 48(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_48)
|
||||
-
|
||||
- addq $-128, %rdx
|
||||
- subq $-64, %rsi
|
||||
- subq $-64, %rdi
|
||||
-
|
||||
- cmp $64, %rdx
|
||||
- ja L(less128bytesin2aligned)
|
||||
-
|
||||
- cmp $32, %rdx
|
||||
- ja L(aligned_last_64_bytes)
|
||||
-
|
||||
- movdqu -32(%rdi, %rdx), %xmm0
|
||||
- movdqu -32(%rsi, %rdx), %xmm1
|
||||
- CMPEQ %xmm0, %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_end_16)
|
||||
-
|
||||
- movdqu -16(%rdi, %rdx), %xmm0
|
||||
- movdqu -16(%rsi, %rdx), %xmm1
|
||||
- CMPEQ %xmm0, %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_end)
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(aligned_loop):
|
||||
-# ifdef DATA_CACHE_SIZE_HALF
|
||||
- mov $DATA_CACHE_SIZE_HALF, %R8_LP
|
||||
-# else
|
||||
- mov __x86_data_cache_size_half(%rip), %R8_LP
|
||||
-# endif
|
||||
- movq %r8, %r9
|
||||
- addq %r8, %r8
|
||||
- addq %r9, %r8
|
||||
- cmpq %r8, %rdx
|
||||
- ja L(L2_L3_cache_aligned)
|
||||
-
|
||||
- sub $64, %rdx
|
||||
- .p2align 4
|
||||
-L(64bytesormore_loopin2aligned):
|
||||
- movdqa (%rdi), %xmm0
|
||||
- movdqa 16(%rdi), %xmm1
|
||||
- movdqa 32(%rdi), %xmm2
|
||||
- movdqa 48(%rdi), %xmm3
|
||||
-
|
||||
- CMPEQ (%rsi), %xmm0
|
||||
- CMPEQ 16(%rsi), %xmm1
|
||||
- CMPEQ 32(%rsi), %xmm2
|
||||
- CMPEQ 48(%rsi), %xmm3
|
||||
-
|
||||
- pand %xmm0, %xmm1
|
||||
- pand %xmm2, %xmm3
|
||||
- pand %xmm1, %xmm3
|
||||
-
|
||||
- pmovmskb %xmm3, %eax
|
||||
- incw %ax
|
||||
- jnz L(64bytesormore_loop_end)
|
||||
- add $64, %rsi
|
||||
- add $64, %rdi
|
||||
- sub $64, %rdx
|
||||
- ja L(64bytesormore_loopin2aligned)
|
||||
- jmp L(loop_tail)
|
||||
-
|
||||
-L(L2_L3_cache_aligned):
|
||||
- subq $64, %rdx
|
||||
- .p2align 4
|
||||
-L(L2_L3_aligned_128bytes_loop):
|
||||
- prefetchnta 0x1c0(%rdi)
|
||||
- prefetchnta 0x1c0(%rsi)
|
||||
- movdqa (%rdi), %xmm0
|
||||
- movdqa 16(%rdi), %xmm1
|
||||
- movdqa 32(%rdi), %xmm2
|
||||
- movdqa 48(%rdi), %xmm3
|
||||
-
|
||||
- CMPEQ (%rsi), %xmm0
|
||||
- CMPEQ 16(%rsi), %xmm1
|
||||
- CMPEQ 32(%rsi), %xmm2
|
||||
- CMPEQ 48(%rsi), %xmm3
|
||||
-
|
||||
- pand %xmm0, %xmm1
|
||||
- pand %xmm2, %xmm3
|
||||
- pand %xmm1, %xmm3
|
||||
-
|
||||
- pmovmskb %xmm3, %eax
|
||||
- incw %ax
|
||||
- jnz L(64bytesormore_loop_end)
|
||||
-
|
||||
- addq $64, %rsi
|
||||
- addq $64, %rdi
|
||||
- subq $64, %rdx
|
||||
- ja L(L2_L3_aligned_128bytes_loop)
|
||||
- jmp L(loop_tail)
|
||||
-
|
||||
- .p2align 4
|
||||
-L(64bytesormore_loop_end):
|
||||
- pmovmskb %xmm0, %ecx
|
||||
- incw %cx
|
||||
- jnz L(loop_end_ret)
|
||||
-
|
||||
- pmovmskb %xmm1, %ecx
|
||||
- notw %cx
|
||||
- sall $16, %ecx
|
||||
- jnz L(loop_end_ret)
|
||||
-
|
||||
- pmovmskb %xmm2, %ecx
|
||||
- notw %cx
|
||||
- shlq $32, %rcx
|
||||
- jnz L(loop_end_ret)
|
||||
-
|
||||
- addq $48, %rdi
|
||||
- addq $48, %rsi
|
||||
- movq %rax, %rcx
|
||||
-
|
||||
- .p2align 4,, 6
|
||||
-L(loop_end_ret):
|
||||
- bsfq %rcx, %rcx
|
||||
-# ifdef USE_AS_WMEMCMP
|
||||
- movl (%rdi, %rcx), %eax
|
||||
- xorl %edx, %edx
|
||||
- cmpl (%rsi, %rcx), %eax
|
||||
- setg %dl
|
||||
- leal -1(%rdx, %rdx), %eax
|
||||
-# else
|
||||
- movzbl (%rdi, %rcx), %eax
|
||||
- movzbl (%rsi, %rcx), %ecx
|
||||
- subl %ecx, %eax
|
||||
-# endif
|
||||
- ret
|
||||
-END (MEMCMP)
|
||||
-#endif
|
||||
--
|
||||
GitLab
|
||||
|
|
@ -1,263 +0,0 @@
|
|||
From 23102686ec67b856a2d4fd25ddaa1c0b8d175c4f Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Fri, 15 Apr 2022 12:28:01 -0500
|
||||
Subject: [PATCH] x86: Cleanup page cross code in memcmp-avx2-movbe.S
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Old code was both inefficient and wasted code size. New code (-62
|
||||
bytes) and comparable or better performance in the page cross case.
|
||||
|
||||
geometric_mean(N=20) of page cross cases New / Original: 0.960
|
||||
|
||||
size, align0, align1, ret, New Time/Old Time
|
||||
1, 4095, 0, 0, 1.001
|
||||
1, 4095, 0, 1, 0.999
|
||||
1, 4095, 0, -1, 1.0
|
||||
2, 4094, 0, 0, 1.0
|
||||
2, 4094, 0, 1, 1.0
|
||||
2, 4094, 0, -1, 1.0
|
||||
3, 4093, 0, 0, 1.0
|
||||
3, 4093, 0, 1, 1.0
|
||||
3, 4093, 0, -1, 1.0
|
||||
4, 4092, 0, 0, 0.987
|
||||
4, 4092, 0, 1, 1.0
|
||||
4, 4092, 0, -1, 1.0
|
||||
5, 4091, 0, 0, 0.984
|
||||
5, 4091, 0, 1, 1.002
|
||||
5, 4091, 0, -1, 1.005
|
||||
6, 4090, 0, 0, 0.993
|
||||
6, 4090, 0, 1, 1.001
|
||||
6, 4090, 0, -1, 1.003
|
||||
7, 4089, 0, 0, 0.991
|
||||
7, 4089, 0, 1, 1.0
|
||||
7, 4089, 0, -1, 1.001
|
||||
8, 4088, 0, 0, 0.875
|
||||
8, 4088, 0, 1, 0.881
|
||||
8, 4088, 0, -1, 0.888
|
||||
9, 4087, 0, 0, 0.872
|
||||
9, 4087, 0, 1, 0.879
|
||||
9, 4087, 0, -1, 0.883
|
||||
10, 4086, 0, 0, 0.878
|
||||
10, 4086, 0, 1, 0.886
|
||||
10, 4086, 0, -1, 0.873
|
||||
11, 4085, 0, 0, 0.878
|
||||
11, 4085, 0, 1, 0.881
|
||||
11, 4085, 0, -1, 0.879
|
||||
12, 4084, 0, 0, 0.873
|
||||
12, 4084, 0, 1, 0.889
|
||||
12, 4084, 0, -1, 0.875
|
||||
13, 4083, 0, 0, 0.873
|
||||
13, 4083, 0, 1, 0.863
|
||||
13, 4083, 0, -1, 0.863
|
||||
14, 4082, 0, 0, 0.838
|
||||
14, 4082, 0, 1, 0.869
|
||||
14, 4082, 0, -1, 0.877
|
||||
15, 4081, 0, 0, 0.841
|
||||
15, 4081, 0, 1, 0.869
|
||||
15, 4081, 0, -1, 0.876
|
||||
16, 4080, 0, 0, 0.988
|
||||
16, 4080, 0, 1, 0.99
|
||||
16, 4080, 0, -1, 0.989
|
||||
17, 4079, 0, 0, 0.978
|
||||
17, 4079, 0, 1, 0.981
|
||||
17, 4079, 0, -1, 0.98
|
||||
18, 4078, 0, 0, 0.981
|
||||
18, 4078, 0, 1, 0.98
|
||||
18, 4078, 0, -1, 0.985
|
||||
19, 4077, 0, 0, 0.977
|
||||
19, 4077, 0, 1, 0.979
|
||||
19, 4077, 0, -1, 0.986
|
||||
20, 4076, 0, 0, 0.977
|
||||
20, 4076, 0, 1, 0.986
|
||||
20, 4076, 0, -1, 0.984
|
||||
21, 4075, 0, 0, 0.977
|
||||
21, 4075, 0, 1, 0.983
|
||||
21, 4075, 0, -1, 0.988
|
||||
22, 4074, 0, 0, 0.983
|
||||
22, 4074, 0, 1, 0.994
|
||||
22, 4074, 0, -1, 0.993
|
||||
23, 4073, 0, 0, 0.98
|
||||
23, 4073, 0, 1, 0.992
|
||||
23, 4073, 0, -1, 0.995
|
||||
24, 4072, 0, 0, 0.989
|
||||
24, 4072, 0, 1, 0.989
|
||||
24, 4072, 0, -1, 0.991
|
||||
25, 4071, 0, 0, 0.99
|
||||
25, 4071, 0, 1, 0.999
|
||||
25, 4071, 0, -1, 0.996
|
||||
26, 4070, 0, 0, 0.993
|
||||
26, 4070, 0, 1, 0.995
|
||||
26, 4070, 0, -1, 0.998
|
||||
27, 4069, 0, 0, 0.993
|
||||
27, 4069, 0, 1, 0.999
|
||||
27, 4069, 0, -1, 1.0
|
||||
28, 4068, 0, 0, 0.997
|
||||
28, 4068, 0, 1, 1.0
|
||||
28, 4068, 0, -1, 0.999
|
||||
29, 4067, 0, 0, 0.996
|
||||
29, 4067, 0, 1, 0.999
|
||||
29, 4067, 0, -1, 0.999
|
||||
30, 4066, 0, 0, 0.991
|
||||
30, 4066, 0, 1, 1.001
|
||||
30, 4066, 0, -1, 0.999
|
||||
31, 4065, 0, 0, 0.988
|
||||
31, 4065, 0, 1, 0.998
|
||||
31, 4065, 0, -1, 0.998
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 98 ++++++++++++--------
|
||||
1 file changed, 61 insertions(+), 37 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
|
||||
index 16fc673e..99258cf5 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
|
||||
@@ -429,22 +429,21 @@ L(page_cross_less_vec):
|
||||
# ifndef USE_AS_WMEMCMP
|
||||
cmpl $8, %edx
|
||||
jae L(between_8_15)
|
||||
+ /* Fall through for [4, 7]. */
|
||||
cmpl $4, %edx
|
||||
- jae L(between_4_7)
|
||||
+ jb L(between_2_3)
|
||||
|
||||
- /* Load as big endian to avoid branches. */
|
||||
- movzwl (%rdi), %eax
|
||||
- movzwl (%rsi), %ecx
|
||||
- shll $8, %eax
|
||||
- shll $8, %ecx
|
||||
- bswap %eax
|
||||
- bswap %ecx
|
||||
- movzbl -1(%rdi, %rdx), %edi
|
||||
- movzbl -1(%rsi, %rdx), %esi
|
||||
- orl %edi, %eax
|
||||
- orl %esi, %ecx
|
||||
- /* Subtraction is okay because the upper 8 bits are zero. */
|
||||
- subl %ecx, %eax
|
||||
+ movbe (%rdi), %eax
|
||||
+ movbe (%rsi), %ecx
|
||||
+ shlq $32, %rax
|
||||
+ shlq $32, %rcx
|
||||
+ movbe -4(%rdi, %rdx), %edi
|
||||
+ movbe -4(%rsi, %rdx), %esi
|
||||
+ orq %rdi, %rax
|
||||
+ orq %rsi, %rcx
|
||||
+ subq %rcx, %rax
|
||||
+ /* Fast path for return zero. */
|
||||
+ jnz L(ret_nonzero)
|
||||
/* No ymm register was touched. */
|
||||
ret
|
||||
|
||||
@@ -457,9 +456,33 @@ L(one_or_less):
|
||||
/* No ymm register was touched. */
|
||||
ret
|
||||
|
||||
+ .p2align 4,, 5
|
||||
+L(ret_nonzero):
|
||||
+ sbbl %eax, %eax
|
||||
+ orl $1, %eax
|
||||
+ /* No ymm register was touched. */
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4,, 2
|
||||
+L(zero):
|
||||
+ xorl %eax, %eax
|
||||
+ /* No ymm register was touched. */
|
||||
+ ret
|
||||
+
|
||||
.p2align 4
|
||||
L(between_8_15):
|
||||
-# endif
|
||||
+ movbe (%rdi), %rax
|
||||
+ movbe (%rsi), %rcx
|
||||
+ subq %rcx, %rax
|
||||
+ jnz L(ret_nonzero)
|
||||
+ movbe -8(%rdi, %rdx), %rax
|
||||
+ movbe -8(%rsi, %rdx), %rcx
|
||||
+ subq %rcx, %rax
|
||||
+ /* Fast path for return zero. */
|
||||
+ jnz L(ret_nonzero)
|
||||
+ /* No ymm register was touched. */
|
||||
+ ret
|
||||
+# else
|
||||
/* If USE_AS_WMEMCMP fall through into 8-15 byte case. */
|
||||
vmovq (%rdi), %xmm1
|
||||
vmovq (%rsi), %xmm2
|
||||
@@ -475,16 +498,13 @@ L(between_8_15):
|
||||
VPCMPEQ %xmm1, %xmm2, %xmm2
|
||||
vpmovmskb %xmm2, %eax
|
||||
subl $0xffff, %eax
|
||||
+ /* Fast path for return zero. */
|
||||
jnz L(return_vec_0)
|
||||
/* No ymm register was touched. */
|
||||
ret
|
||||
+# endif
|
||||
|
||||
- .p2align 4
|
||||
-L(zero):
|
||||
- xorl %eax, %eax
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
+ .p2align 4,, 10
|
||||
L(between_16_31):
|
||||
/* From 16 to 31 bytes. No branch when size == 16. */
|
||||
vmovdqu (%rsi), %xmm2
|
||||
@@ -501,11 +521,17 @@ L(between_16_31):
|
||||
VPCMPEQ (%rdi), %xmm2, %xmm2
|
||||
vpmovmskb %xmm2, %eax
|
||||
subl $0xffff, %eax
|
||||
+ /* Fast path for return zero. */
|
||||
jnz L(return_vec_0)
|
||||
/* No ymm register was touched. */
|
||||
ret
|
||||
|
||||
# ifdef USE_AS_WMEMCMP
|
||||
+ .p2align 4,, 2
|
||||
+L(zero):
|
||||
+ xorl %eax, %eax
|
||||
+ ret
|
||||
+
|
||||
.p2align 4
|
||||
L(one_or_less):
|
||||
jb L(zero)
|
||||
@@ -520,22 +546,20 @@ L(one_or_less):
|
||||
# else
|
||||
|
||||
.p2align 4
|
||||
-L(between_4_7):
|
||||
- /* Load as big endian with overlapping movbe to avoid branches.
|
||||
- */
|
||||
- movbe (%rdi), %eax
|
||||
- movbe (%rsi), %ecx
|
||||
- shlq $32, %rax
|
||||
- shlq $32, %rcx
|
||||
- movbe -4(%rdi, %rdx), %edi
|
||||
- movbe -4(%rsi, %rdx), %esi
|
||||
- orq %rdi, %rax
|
||||
- orq %rsi, %rcx
|
||||
- subq %rcx, %rax
|
||||
- jz L(zero_4_7)
|
||||
- sbbl %eax, %eax
|
||||
- orl $1, %eax
|
||||
-L(zero_4_7):
|
||||
+L(between_2_3):
|
||||
+ /* Load as big endian to avoid branches. */
|
||||
+ movzwl (%rdi), %eax
|
||||
+ movzwl (%rsi), %ecx
|
||||
+ bswap %eax
|
||||
+ bswap %ecx
|
||||
+ shrl %eax
|
||||
+ shrl %ecx
|
||||
+ movzbl -1(%rdi, %rdx), %edi
|
||||
+ movzbl -1(%rsi, %rdx), %esi
|
||||
+ orl %edi, %eax
|
||||
+ orl %esi, %ecx
|
||||
+ /* Subtraction is okay because the upper bit is zero. */
|
||||
+ subl %ecx, %eax
|
||||
/* No ymm register was touched. */
|
||||
ret
|
||||
# endif
|
||||
--
|
||||
GitLab
|
||||
|
|
@ -1,876 +0,0 @@
|
|||
From 5307aa9c1800f36a64c183c091c9af392c1fa75c Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Thu, 21 Apr 2022 20:52:28 -0500
|
||||
Subject: [PATCH] x86: Optimize {str|wcs}rchr-sse2
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
The new code unrolls the main loop slightly without adding too much
|
||||
overhead and minimizes the comparisons for the search CHAR.
|
||||
|
||||
Geometric Mean of all benchmarks New / Old: 0.741
|
||||
See email for all results.
|
||||
|
||||
Full xcheck passes on x86_64 with and without multiarch enabled.
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/strrchr-sse2.S | 2 +-
|
||||
sysdeps/x86_64/multiarch/wcsrchr-sse2.S | 3 +-
|
||||
sysdeps/x86_64/strrchr.S | 510 +++++++++++++++---------
|
||||
sysdeps/x86_64/wcsrchr.S | 266 +-----------
|
||||
4 files changed, 338 insertions(+), 443 deletions(-)
|
||||
|
||||
Conflicts:
|
||||
sysdeps/x86_64/wcsrchr.S
|
||||
(copyright header)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
|
||||
index 0ec76fe9..6bb1284b 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
|
||||
@@ -17,7 +17,7 @@
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
#if IS_IN (libc)
|
||||
-# define strrchr __strrchr_sse2
|
||||
+# define STRRCHR __strrchr_sse2
|
||||
|
||||
# undef weak_alias
|
||||
# define weak_alias(strrchr, rindex)
|
||||
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
|
||||
index d015e953..f26d53b5 100644
|
||||
--- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
|
||||
+++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
|
||||
@@ -17,7 +17,6 @@
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
#if IS_IN (libc)
|
||||
-# define wcsrchr __wcsrchr_sse2
|
||||
+# define STRRCHR __wcsrchr_sse2
|
||||
#endif
|
||||
-
|
||||
#include "../wcsrchr.S"
|
||||
diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
|
||||
index aca98e7e..a58cc220 100644
|
||||
--- a/sysdeps/x86_64/strrchr.S
|
||||
+++ b/sysdeps/x86_64/strrchr.S
|
||||
@@ -19,210 +19,360 @@
|
||||
|
||||
#include <sysdep.h>
|
||||
|
||||
+#ifndef STRRCHR
|
||||
+# define STRRCHR strrchr
|
||||
+#endif
|
||||
+
|
||||
+#ifdef USE_AS_WCSRCHR
|
||||
+# define PCMPEQ pcmpeqd
|
||||
+# define CHAR_SIZE 4
|
||||
+# define PMINU pminud
|
||||
+#else
|
||||
+# define PCMPEQ pcmpeqb
|
||||
+# define CHAR_SIZE 1
|
||||
+# define PMINU pminub
|
||||
+#endif
|
||||
+
|
||||
+#define PAGE_SIZE 4096
|
||||
+#define VEC_SIZE 16
|
||||
+
|
||||
.text
|
||||
-ENTRY (strrchr)
|
||||
- movd %esi, %xmm1
|
||||
+ENTRY(STRRCHR)
|
||||
+ movd %esi, %xmm0
|
||||
movq %rdi, %rax
|
||||
- andl $4095, %eax
|
||||
- punpcklbw %xmm1, %xmm1
|
||||
- cmpq $4032, %rax
|
||||
- punpcklwd %xmm1, %xmm1
|
||||
- pshufd $0, %xmm1, %xmm1
|
||||
+ andl $(PAGE_SIZE - 1), %eax
|
||||
+#ifndef USE_AS_WCSRCHR
|
||||
+ punpcklbw %xmm0, %xmm0
|
||||
+ punpcklwd %xmm0, %xmm0
|
||||
+#endif
|
||||
+ pshufd $0, %xmm0, %xmm0
|
||||
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
||||
ja L(cross_page)
|
||||
- movdqu (%rdi), %xmm0
|
||||
+
|
||||
+L(cross_page_continue):
|
||||
+ movups (%rdi), %xmm1
|
||||
pxor %xmm2, %xmm2
|
||||
- movdqa %xmm0, %xmm3
|
||||
- pcmpeqb %xmm1, %xmm0
|
||||
- pcmpeqb %xmm2, %xmm3
|
||||
- pmovmskb %xmm0, %ecx
|
||||
- pmovmskb %xmm3, %edx
|
||||
- testq %rdx, %rdx
|
||||
- je L(next_48_bytes)
|
||||
- leaq -1(%rdx), %rax
|
||||
- xorq %rdx, %rax
|
||||
- andq %rcx, %rax
|
||||
- je L(exit)
|
||||
- bsrq %rax, %rax
|
||||
+ PCMPEQ %xmm1, %xmm2
|
||||
+ pmovmskb %xmm2, %ecx
|
||||
+ testl %ecx, %ecx
|
||||
+ jz L(aligned_more)
|
||||
+
|
||||
+ PCMPEQ %xmm0, %xmm1
|
||||
+ pmovmskb %xmm1, %eax
|
||||
+ leal -1(%rcx), %edx
|
||||
+ xorl %edx, %ecx
|
||||
+ andl %ecx, %eax
|
||||
+ jz L(ret0)
|
||||
+ bsrl %eax, %eax
|
||||
addq %rdi, %rax
|
||||
+ /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
|
||||
+ search CHAR is zero we are correct. Either way `andq
|
||||
+ -CHAR_SIZE, %rax` gets the correct result. */
|
||||
+#ifdef USE_AS_WCSRCHR
|
||||
+ andq $-CHAR_SIZE, %rax
|
||||
+#endif
|
||||
+L(ret0):
|
||||
ret
|
||||
|
||||
+ /* Returns for first vec x1/x2 have hard coded backward search
|
||||
+ path for earlier matches. */
|
||||
.p2align 4
|
||||
-L(next_48_bytes):
|
||||
- movdqu 16(%rdi), %xmm4
|
||||
- movdqa %xmm4, %xmm5
|
||||
- movdqu 32(%rdi), %xmm3
|
||||
- pcmpeqb %xmm1, %xmm4
|
||||
- pcmpeqb %xmm2, %xmm5
|
||||
- movdqu 48(%rdi), %xmm0
|
||||
- pmovmskb %xmm5, %edx
|
||||
- movdqa %xmm3, %xmm5
|
||||
- pcmpeqb %xmm1, %xmm3
|
||||
- pcmpeqb %xmm2, %xmm5
|
||||
- pcmpeqb %xmm0, %xmm2
|
||||
- salq $16, %rdx
|
||||
- pmovmskb %xmm3, %r8d
|
||||
- pmovmskb %xmm5, %eax
|
||||
- pmovmskb %xmm2, %esi
|
||||
- salq $32, %r8
|
||||
- salq $32, %rax
|
||||
- pcmpeqb %xmm1, %xmm0
|
||||
- orq %rdx, %rax
|
||||
- movq %rsi, %rdx
|
||||
- pmovmskb %xmm4, %esi
|
||||
- salq $48, %rdx
|
||||
- salq $16, %rsi
|
||||
- orq %r8, %rsi
|
||||
- orq %rcx, %rsi
|
||||
- pmovmskb %xmm0, %ecx
|
||||
- salq $48, %rcx
|
||||
- orq %rcx, %rsi
|
||||
- orq %rdx, %rax
|
||||
- je L(loop_header2)
|
||||
- leaq -1(%rax), %rcx
|
||||
- xorq %rax, %rcx
|
||||
- andq %rcx, %rsi
|
||||
- je L(exit)
|
||||
- bsrq %rsi, %rsi
|
||||
- leaq (%rdi,%rsi), %rax
|
||||
+L(first_vec_x0_test):
|
||||
+ PCMPEQ %xmm0, %xmm1
|
||||
+ pmovmskb %xmm1, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jz L(ret0)
|
||||
+ bsrl %eax, %eax
|
||||
+ addq %r8, %rax
|
||||
+#ifdef USE_AS_WCSRCHR
|
||||
+ andq $-CHAR_SIZE, %rax
|
||||
+#endif
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
-L(loop_header2):
|
||||
- testq %rsi, %rsi
|
||||
- movq %rdi, %rcx
|
||||
- je L(no_c_found)
|
||||
-L(loop_header):
|
||||
- addq $64, %rdi
|
||||
- pxor %xmm7, %xmm7
|
||||
- andq $-64, %rdi
|
||||
- jmp L(loop_entry)
|
||||
+L(first_vec_x1):
|
||||
+ PCMPEQ %xmm0, %xmm2
|
||||
+ pmovmskb %xmm2, %eax
|
||||
+ leal -1(%rcx), %edx
|
||||
+ xorl %edx, %ecx
|
||||
+ andl %ecx, %eax
|
||||
+ jz L(first_vec_x0_test)
|
||||
+ bsrl %eax, %eax
|
||||
+ leaq (VEC_SIZE)(%rdi, %rax), %rax
|
||||
+#ifdef USE_AS_WCSRCHR
|
||||
+ andq $-CHAR_SIZE, %rax
|
||||
+#endif
|
||||
+ ret
|
||||
|
||||
.p2align 4
|
||||
-L(loop64):
|
||||
- testq %rdx, %rdx
|
||||
- cmovne %rdx, %rsi
|
||||
- cmovne %rdi, %rcx
|
||||
- addq $64, %rdi
|
||||
-L(loop_entry):
|
||||
- movdqa 32(%rdi), %xmm3
|
||||
- pxor %xmm6, %xmm6
|
||||
- movdqa 48(%rdi), %xmm2
|
||||
- movdqa %xmm3, %xmm0
|
||||
- movdqa 16(%rdi), %xmm4
|
||||
- pminub %xmm2, %xmm0
|
||||
- movdqa (%rdi), %xmm5
|
||||
- pminub %xmm4, %xmm0
|
||||
- pminub %xmm5, %xmm0
|
||||
- pcmpeqb %xmm7, %xmm0
|
||||
- pmovmskb %xmm0, %eax
|
||||
- movdqa %xmm5, %xmm0
|
||||
- pcmpeqb %xmm1, %xmm0
|
||||
- pmovmskb %xmm0, %r9d
|
||||
- movdqa %xmm4, %xmm0
|
||||
- pcmpeqb %xmm1, %xmm0
|
||||
- pmovmskb %xmm0, %edx
|
||||
- movdqa %xmm3, %xmm0
|
||||
- pcmpeqb %xmm1, %xmm0
|
||||
- salq $16, %rdx
|
||||
- pmovmskb %xmm0, %r10d
|
||||
- movdqa %xmm2, %xmm0
|
||||
- pcmpeqb %xmm1, %xmm0
|
||||
- salq $32, %r10
|
||||
- orq %r10, %rdx
|
||||
- pmovmskb %xmm0, %r8d
|
||||
- orq %r9, %rdx
|
||||
- salq $48, %r8
|
||||
- orq %r8, %rdx
|
||||
+L(first_vec_x1_test):
|
||||
+ PCMPEQ %xmm0, %xmm2
|
||||
+ pmovmskb %xmm2, %eax
|
||||
testl %eax, %eax
|
||||
- je L(loop64)
|
||||
- pcmpeqb %xmm6, %xmm4
|
||||
- pcmpeqb %xmm6, %xmm3
|
||||
- pcmpeqb %xmm6, %xmm5
|
||||
- pmovmskb %xmm4, %eax
|
||||
- pmovmskb %xmm3, %r10d
|
||||
- pcmpeqb %xmm6, %xmm2
|
||||
- pmovmskb %xmm5, %r9d
|
||||
- salq $32, %r10
|
||||
- salq $16, %rax
|
||||
- pmovmskb %xmm2, %r8d
|
||||
- orq %r10, %rax
|
||||
- orq %r9, %rax
|
||||
- salq $48, %r8
|
||||
- orq %r8, %rax
|
||||
- leaq -1(%rax), %r8
|
||||
- xorq %rax, %r8
|
||||
- andq %r8, %rdx
|
||||
- cmovne %rdi, %rcx
|
||||
- cmovne %rdx, %rsi
|
||||
- bsrq %rsi, %rsi
|
||||
- leaq (%rcx,%rsi), %rax
|
||||
+ jz L(first_vec_x0_test)
|
||||
+ bsrl %eax, %eax
|
||||
+ leaq (VEC_SIZE)(%rdi, %rax), %rax
|
||||
+#ifdef USE_AS_WCSRCHR
|
||||
+ andq $-CHAR_SIZE, %rax
|
||||
+#endif
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(first_vec_x2):
|
||||
+ PCMPEQ %xmm0, %xmm3
|
||||
+ pmovmskb %xmm3, %eax
|
||||
+ leal -1(%rcx), %edx
|
||||
+ xorl %edx, %ecx
|
||||
+ andl %ecx, %eax
|
||||
+ jz L(first_vec_x1_test)
|
||||
+ bsrl %eax, %eax
|
||||
+ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
|
||||
+#ifdef USE_AS_WCSRCHR
|
||||
+ andq $-CHAR_SIZE, %rax
|
||||
+#endif
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(aligned_more):
|
||||
+ /* Save original pointer if match was in VEC 0. */
|
||||
+ movq %rdi, %r8
|
||||
+ andq $-VEC_SIZE, %rdi
|
||||
+
|
||||
+ movaps VEC_SIZE(%rdi), %xmm2
|
||||
+ pxor %xmm3, %xmm3
|
||||
+ PCMPEQ %xmm2, %xmm3
|
||||
+ pmovmskb %xmm3, %ecx
|
||||
+ testl %ecx, %ecx
|
||||
+ jnz L(first_vec_x1)
|
||||
+
|
||||
+ movaps (VEC_SIZE * 2)(%rdi), %xmm3
|
||||
+ pxor %xmm4, %xmm4
|
||||
+ PCMPEQ %xmm3, %xmm4
|
||||
+ pmovmskb %xmm4, %ecx
|
||||
+ testl %ecx, %ecx
|
||||
+ jnz L(first_vec_x2)
|
||||
+
|
||||
+ addq $VEC_SIZE, %rdi
|
||||
+ /* Save pointer again before realigning. */
|
||||
+ movq %rdi, %rsi
|
||||
+ andq $-(VEC_SIZE * 2), %rdi
|
||||
+ .p2align 4
|
||||
+L(first_loop):
|
||||
+ /* Do 2x VEC at a time. */
|
||||
+ movaps (VEC_SIZE * 2)(%rdi), %xmm4
|
||||
+ movaps (VEC_SIZE * 3)(%rdi), %xmm5
|
||||
+ /* Since SSE2 no pminud so wcsrchr needs seperate logic for
|
||||
+ detecting zero. Note if this is found to be a bottleneck it
|
||||
+ may be worth adding an SSE4.1 wcsrchr implementation. */
|
||||
+#ifdef USE_AS_WCSRCHR
|
||||
+ movaps %xmm5, %xmm6
|
||||
+ pxor %xmm8, %xmm8
|
||||
+
|
||||
+ PCMPEQ %xmm8, %xmm5
|
||||
+ PCMPEQ %xmm4, %xmm8
|
||||
+ por %xmm5, %xmm8
|
||||
+#else
|
||||
+ movaps %xmm5, %xmm6
|
||||
+ PMINU %xmm4, %xmm5
|
||||
+#endif
|
||||
+
|
||||
+ movaps %xmm4, %xmm9
|
||||
+ PCMPEQ %xmm0, %xmm4
|
||||
+ PCMPEQ %xmm0, %xmm6
|
||||
+ movaps %xmm6, %xmm7
|
||||
+ por %xmm4, %xmm6
|
||||
+#ifndef USE_AS_WCSRCHR
|
||||
+ pxor %xmm8, %xmm8
|
||||
+ PCMPEQ %xmm5, %xmm8
|
||||
+#endif
|
||||
+ pmovmskb %xmm8, %ecx
|
||||
+ pmovmskb %xmm6, %eax
|
||||
+
|
||||
+ addq $(VEC_SIZE * 2), %rdi
|
||||
+ /* Use `addl` 1) so we can undo it with `subl` and 2) it can
|
||||
+ macro-fuse with `jz`. */
|
||||
+ addl %ecx, %eax
|
||||
+ jz L(first_loop)
|
||||
+
|
||||
+ /* Check if there is zero match. */
|
||||
+ testl %ecx, %ecx
|
||||
+ jz L(second_loop_match)
|
||||
+
|
||||
+ /* Check if there was a match in last iteration. */
|
||||
+ subl %ecx, %eax
|
||||
+ jnz L(new_match)
|
||||
+
|
||||
+L(first_loop_old_match):
|
||||
+ PCMPEQ %xmm0, %xmm2
|
||||
+ PCMPEQ %xmm0, %xmm3
|
||||
+ pmovmskb %xmm2, %ecx
|
||||
+ pmovmskb %xmm3, %eax
|
||||
+ addl %eax, %ecx
|
||||
+ jz L(first_vec_x0_test)
|
||||
+ /* NB: We could move this shift to before the branch and save a
|
||||
+ bit of code size / performance on the fall through. The
|
||||
+ branch leads to the null case which generally seems hotter
|
||||
+ than char in first 3x VEC. */
|
||||
+ sall $16, %eax
|
||||
+ orl %ecx, %eax
|
||||
+
|
||||
+ bsrl %eax, %eax
|
||||
+ addq %rsi, %rax
|
||||
+#ifdef USE_AS_WCSRCHR
|
||||
+ andq $-CHAR_SIZE, %rax
|
||||
+#endif
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(new_match):
|
||||
+ pxor %xmm6, %xmm6
|
||||
+ PCMPEQ %xmm9, %xmm6
|
||||
+ pmovmskb %xmm6, %eax
|
||||
+ sall $16, %ecx
|
||||
+ orl %eax, %ecx
|
||||
+
|
||||
+ /* We can't reuse either of the old comparisons as since we mask
|
||||
+ of zeros after first zero (instead of using the full
|
||||
+ comparison) we can't gurantee no interference between match
|
||||
+ after end of string and valid match. */
|
||||
+ pmovmskb %xmm4, %eax
|
||||
+ pmovmskb %xmm7, %edx
|
||||
+ sall $16, %edx
|
||||
+ orl %edx, %eax
|
||||
+
|
||||
+ leal -1(%ecx), %edx
|
||||
+ xorl %edx, %ecx
|
||||
+ andl %ecx, %eax
|
||||
+ jz L(first_loop_old_match)
|
||||
+ bsrl %eax, %eax
|
||||
+ addq %rdi, %rax
|
||||
+#ifdef USE_AS_WCSRCHR
|
||||
+ andq $-CHAR_SIZE, %rax
|
||||
+#endif
|
||||
ret
|
||||
|
||||
+ /* Save minimum state for getting most recent match. We can
|
||||
+ throw out all previous work. */
|
||||
.p2align 4
|
||||
-L(no_c_found):
|
||||
- movl $1, %esi
|
||||
- xorl %ecx, %ecx
|
||||
- jmp L(loop_header)
|
||||
+L(second_loop_match):
|
||||
+ movq %rdi, %rsi
|
||||
+ movaps %xmm4, %xmm2
|
||||
+ movaps %xmm7, %xmm3
|
||||
|
||||
.p2align 4
|
||||
-L(exit):
|
||||
- xorl %eax, %eax
|
||||
+L(second_loop):
|
||||
+ movaps (VEC_SIZE * 2)(%rdi), %xmm4
|
||||
+ movaps (VEC_SIZE * 3)(%rdi), %xmm5
|
||||
+ /* Since SSE2 no pminud so wcsrchr needs seperate logic for
|
||||
+ detecting zero. Note if this is found to be a bottleneck it
|
||||
+ may be worth adding an SSE4.1 wcsrchr implementation. */
|
||||
+#ifdef USE_AS_WCSRCHR
|
||||
+ movaps %xmm5, %xmm6
|
||||
+ pxor %xmm8, %xmm8
|
||||
+
|
||||
+ PCMPEQ %xmm8, %xmm5
|
||||
+ PCMPEQ %xmm4, %xmm8
|
||||
+ por %xmm5, %xmm8
|
||||
+#else
|
||||
+ movaps %xmm5, %xmm6
|
||||
+ PMINU %xmm4, %xmm5
|
||||
+#endif
|
||||
+
|
||||
+ movaps %xmm4, %xmm9
|
||||
+ PCMPEQ %xmm0, %xmm4
|
||||
+ PCMPEQ %xmm0, %xmm6
|
||||
+ movaps %xmm6, %xmm7
|
||||
+ por %xmm4, %xmm6
|
||||
+#ifndef USE_AS_WCSRCHR
|
||||
+ pxor %xmm8, %xmm8
|
||||
+ PCMPEQ %xmm5, %xmm8
|
||||
+#endif
|
||||
+
|
||||
+ pmovmskb %xmm8, %ecx
|
||||
+ pmovmskb %xmm6, %eax
|
||||
+
|
||||
+ addq $(VEC_SIZE * 2), %rdi
|
||||
+ /* Either null term or new occurence of CHAR. */
|
||||
+ addl %ecx, %eax
|
||||
+ jz L(second_loop)
|
||||
+
|
||||
+ /* No null term so much be new occurence of CHAR. */
|
||||
+ testl %ecx, %ecx
|
||||
+ jz L(second_loop_match)
|
||||
+
|
||||
+
|
||||
+ subl %ecx, %eax
|
||||
+ jnz L(second_loop_new_match)
|
||||
+
|
||||
+L(second_loop_old_match):
|
||||
+ pmovmskb %xmm2, %ecx
|
||||
+ pmovmskb %xmm3, %eax
|
||||
+ sall $16, %eax
|
||||
+ orl %ecx, %eax
|
||||
+ bsrl %eax, %eax
|
||||
+ addq %rsi, %rax
|
||||
+#ifdef USE_AS_WCSRCHR
|
||||
+ andq $-CHAR_SIZE, %rax
|
||||
+#endif
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
+L(second_loop_new_match):
|
||||
+ pxor %xmm6, %xmm6
|
||||
+ PCMPEQ %xmm9, %xmm6
|
||||
+ pmovmskb %xmm6, %eax
|
||||
+ sall $16, %ecx
|
||||
+ orl %eax, %ecx
|
||||
+
|
||||
+ /* We can't reuse either of the old comparisons as since we mask
|
||||
+ of zeros after first zero (instead of using the full
|
||||
+ comparison) we can't gurantee no interference between match
|
||||
+ after end of string and valid match. */
|
||||
+ pmovmskb %xmm4, %eax
|
||||
+ pmovmskb %xmm7, %edx
|
||||
+ sall $16, %edx
|
||||
+ orl %edx, %eax
|
||||
+
|
||||
+ leal -1(%ecx), %edx
|
||||
+ xorl %edx, %ecx
|
||||
+ andl %ecx, %eax
|
||||
+ jz L(second_loop_old_match)
|
||||
+ bsrl %eax, %eax
|
||||
+ addq %rdi, %rax
|
||||
+#ifdef USE_AS_WCSRCHR
|
||||
+ andq $-CHAR_SIZE, %rax
|
||||
+#endif
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4,, 4
|
||||
L(cross_page):
|
||||
- movq %rdi, %rax
|
||||
- pxor %xmm0, %xmm0
|
||||
- andq $-64, %rax
|
||||
- movdqu (%rax), %xmm5
|
||||
- movdqa %xmm5, %xmm6
|
||||
- movdqu 16(%rax), %xmm4
|
||||
- pcmpeqb %xmm1, %xmm5
|
||||
- pcmpeqb %xmm0, %xmm6
|
||||
- movdqu 32(%rax), %xmm3
|
||||
- pmovmskb %xmm6, %esi
|
||||
- movdqa %xmm4, %xmm6
|
||||
- movdqu 48(%rax), %xmm2
|
||||
- pcmpeqb %xmm1, %xmm4
|
||||
- pcmpeqb %xmm0, %xmm6
|
||||
- pmovmskb %xmm6, %edx
|
||||
- movdqa %xmm3, %xmm6
|
||||
- pcmpeqb %xmm1, %xmm3
|
||||
- pcmpeqb %xmm0, %xmm6
|
||||
- pcmpeqb %xmm2, %xmm0
|
||||
- salq $16, %rdx
|
||||
- pmovmskb %xmm3, %r9d
|
||||
- pmovmskb %xmm6, %r8d
|
||||
- pmovmskb %xmm0, %ecx
|
||||
- salq $32, %r9
|
||||
- salq $32, %r8
|
||||
- pcmpeqb %xmm1, %xmm2
|
||||
- orq %r8, %rdx
|
||||
- salq $48, %rcx
|
||||
- pmovmskb %xmm5, %r8d
|
||||
- orq %rsi, %rdx
|
||||
- pmovmskb %xmm4, %esi
|
||||
- orq %rcx, %rdx
|
||||
- pmovmskb %xmm2, %ecx
|
||||
- salq $16, %rsi
|
||||
- salq $48, %rcx
|
||||
- orq %r9, %rsi
|
||||
- orq %r8, %rsi
|
||||
- orq %rcx, %rsi
|
||||
+ movq %rdi, %rsi
|
||||
+ andq $-VEC_SIZE, %rsi
|
||||
+ movaps (%rsi), %xmm1
|
||||
+ pxor %xmm2, %xmm2
|
||||
+ PCMPEQ %xmm1, %xmm2
|
||||
+ pmovmskb %xmm2, %edx
|
||||
movl %edi, %ecx
|
||||
- subl %eax, %ecx
|
||||
- shrq %cl, %rdx
|
||||
- shrq %cl, %rsi
|
||||
- testq %rdx, %rdx
|
||||
- je L(loop_header2)
|
||||
- leaq -1(%rdx), %rax
|
||||
- xorq %rdx, %rax
|
||||
- andq %rax, %rsi
|
||||
- je L(exit)
|
||||
- bsrq %rsi, %rax
|
||||
+ andl $(VEC_SIZE - 1), %ecx
|
||||
+ sarl %cl, %edx
|
||||
+ jz L(cross_page_continue)
|
||||
+ PCMPEQ %xmm0, %xmm1
|
||||
+ pmovmskb %xmm1, %eax
|
||||
+ sarl %cl, %eax
|
||||
+ leal -1(%rdx), %ecx
|
||||
+ xorl %edx, %ecx
|
||||
+ andl %ecx, %eax
|
||||
+ jz L(ret1)
|
||||
+ bsrl %eax, %eax
|
||||
addq %rdi, %rax
|
||||
+#ifdef USE_AS_WCSRCHR
|
||||
+ andq $-CHAR_SIZE, %rax
|
||||
+#endif
|
||||
+L(ret1):
|
||||
ret
|
||||
-END (strrchr)
|
||||
+END(STRRCHR)
|
||||
|
||||
-weak_alias (strrchr, rindex)
|
||||
-libc_hidden_builtin_def (strrchr)
|
||||
+#ifndef USE_AS_WCSRCHR
|
||||
+ weak_alias (STRRCHR, rindex)
|
||||
+ libc_hidden_builtin_def (STRRCHR)
|
||||
+#endif
|
||||
diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
|
||||
index 2f388537..ae3cfa7d 100644
|
||||
--- a/sysdeps/x86_64/wcsrchr.S
|
||||
+++ b/sysdeps/x86_64/wcsrchr.S
|
||||
@@ -17,266 +17,12 @@
|
||||
License along with the GNU C Library; if not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
-#include <sysdep.h>
|
||||
|
||||
- .text
|
||||
-ENTRY (wcsrchr)
|
||||
+#define USE_AS_WCSRCHR 1
|
||||
+#define NO_PMINU 1
|
||||
|
||||
- movd %rsi, %xmm1
|
||||
- mov %rdi, %rcx
|
||||
- punpckldq %xmm1, %xmm1
|
||||
- pxor %xmm2, %xmm2
|
||||
- punpckldq %xmm1, %xmm1
|
||||
- and $63, %rcx
|
||||
- cmp $48, %rcx
|
||||
- ja L(crosscache)
|
||||
+#ifndef STRRCHR
|
||||
+# define STRRCHR wcsrchr
|
||||
+#endif
|
||||
|
||||
- movdqu (%rdi), %xmm0
|
||||
- pcmpeqd %xmm0, %xmm2
|
||||
- pcmpeqd %xmm1, %xmm0
|
||||
- pmovmskb %xmm2, %rcx
|
||||
- pmovmskb %xmm0, %rax
|
||||
- add $16, %rdi
|
||||
-
|
||||
- test %rax, %rax
|
||||
- jnz L(unaligned_match1)
|
||||
-
|
||||
- test %rcx, %rcx
|
||||
- jnz L(return_null)
|
||||
-
|
||||
- and $-16, %rdi
|
||||
- xor %r8, %r8
|
||||
- jmp L(loop)
|
||||
-
|
||||
- .p2align 4
|
||||
-L(unaligned_match1):
|
||||
- test %rcx, %rcx
|
||||
- jnz L(prolog_find_zero_1)
|
||||
-
|
||||
- mov %rax, %r8
|
||||
- mov %rdi, %rsi
|
||||
- and $-16, %rdi
|
||||
- jmp L(loop)
|
||||
-
|
||||
- .p2align 4
|
||||
-L(crosscache):
|
||||
- and $15, %rcx
|
||||
- and $-16, %rdi
|
||||
- pxor %xmm3, %xmm3
|
||||
- movdqa (%rdi), %xmm0
|
||||
- pcmpeqd %xmm0, %xmm3
|
||||
- pcmpeqd %xmm1, %xmm0
|
||||
- pmovmskb %xmm3, %rdx
|
||||
- pmovmskb %xmm0, %rax
|
||||
- shr %cl, %rdx
|
||||
- shr %cl, %rax
|
||||
- add $16, %rdi
|
||||
-
|
||||
- test %rax, %rax
|
||||
- jnz L(unaligned_match)
|
||||
-
|
||||
- test %rdx, %rdx
|
||||
- jnz L(return_null)
|
||||
-
|
||||
- xor %r8, %r8
|
||||
- jmp L(loop)
|
||||
-
|
||||
- .p2align 4
|
||||
-L(unaligned_match):
|
||||
- test %rdx, %rdx
|
||||
- jnz L(prolog_find_zero)
|
||||
-
|
||||
- mov %rax, %r8
|
||||
- lea (%rdi, %rcx), %rsi
|
||||
-
|
||||
-/* Loop start on aligned string. */
|
||||
- .p2align 4
|
||||
-L(loop):
|
||||
- movdqa (%rdi), %xmm0
|
||||
- pcmpeqd %xmm0, %xmm2
|
||||
- add $16, %rdi
|
||||
- pcmpeqd %xmm1, %xmm0
|
||||
- pmovmskb %xmm2, %rcx
|
||||
- pmovmskb %xmm0, %rax
|
||||
- or %rax, %rcx
|
||||
- jnz L(matches)
|
||||
-
|
||||
- movdqa (%rdi), %xmm3
|
||||
- pcmpeqd %xmm3, %xmm2
|
||||
- add $16, %rdi
|
||||
- pcmpeqd %xmm1, %xmm3
|
||||
- pmovmskb %xmm2, %rcx
|
||||
- pmovmskb %xmm3, %rax
|
||||
- or %rax, %rcx
|
||||
- jnz L(matches)
|
||||
-
|
||||
- movdqa (%rdi), %xmm4
|
||||
- pcmpeqd %xmm4, %xmm2
|
||||
- add $16, %rdi
|
||||
- pcmpeqd %xmm1, %xmm4
|
||||
- pmovmskb %xmm2, %rcx
|
||||
- pmovmskb %xmm4, %rax
|
||||
- or %rax, %rcx
|
||||
- jnz L(matches)
|
||||
-
|
||||
- movdqa (%rdi), %xmm5
|
||||
- pcmpeqd %xmm5, %xmm2
|
||||
- add $16, %rdi
|
||||
- pcmpeqd %xmm1, %xmm5
|
||||
- pmovmskb %xmm2, %rcx
|
||||
- pmovmskb %xmm5, %rax
|
||||
- or %rax, %rcx
|
||||
- jz L(loop)
|
||||
-
|
||||
- .p2align 4
|
||||
-L(matches):
|
||||
- test %rax, %rax
|
||||
- jnz L(match)
|
||||
-L(return_value):
|
||||
- test %r8, %r8
|
||||
- jz L(return_null)
|
||||
- mov %r8, %rax
|
||||
- mov %rsi, %rdi
|
||||
-
|
||||
- test $15 << 4, %ah
|
||||
- jnz L(match_fourth_wchar)
|
||||
- test %ah, %ah
|
||||
- jnz L(match_third_wchar)
|
||||
- test $15 << 4, %al
|
||||
- jnz L(match_second_wchar)
|
||||
- lea -16(%rdi), %rax
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(match):
|
||||
- pmovmskb %xmm2, %rcx
|
||||
- test %rcx, %rcx
|
||||
- jnz L(find_zero)
|
||||
- mov %rax, %r8
|
||||
- mov %rdi, %rsi
|
||||
- jmp L(loop)
|
||||
-
|
||||
- .p2align 4
|
||||
-L(find_zero):
|
||||
- test $15, %cl
|
||||
- jnz L(find_zero_in_first_wchar)
|
||||
- test %cl, %cl
|
||||
- jnz L(find_zero_in_second_wchar)
|
||||
- test $15, %ch
|
||||
- jnz L(find_zero_in_third_wchar)
|
||||
-
|
||||
- and $1 << 13 - 1, %rax
|
||||
- jz L(return_value)
|
||||
-
|
||||
- test $15 << 4, %ah
|
||||
- jnz L(match_fourth_wchar)
|
||||
- test %ah, %ah
|
||||
- jnz L(match_third_wchar)
|
||||
- test $15 << 4, %al
|
||||
- jnz L(match_second_wchar)
|
||||
- lea -16(%rdi), %rax
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(find_zero_in_first_wchar):
|
||||
- test $1, %rax
|
||||
- jz L(return_value)
|
||||
- lea -16(%rdi), %rax
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(find_zero_in_second_wchar):
|
||||
- and $1 << 5 - 1, %rax
|
||||
- jz L(return_value)
|
||||
-
|
||||
- test $15 << 4, %al
|
||||
- jnz L(match_second_wchar)
|
||||
- lea -16(%rdi), %rax
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(find_zero_in_third_wchar):
|
||||
- and $1 << 9 - 1, %rax
|
||||
- jz L(return_value)
|
||||
-
|
||||
- test %ah, %ah
|
||||
- jnz L(match_third_wchar)
|
||||
- test $15 << 4, %al
|
||||
- jnz L(match_second_wchar)
|
||||
- lea -16(%rdi), %rax
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(prolog_find_zero):
|
||||
- add %rcx, %rdi
|
||||
- mov %rdx, %rcx
|
||||
-L(prolog_find_zero_1):
|
||||
- test $15, %cl
|
||||
- jnz L(prolog_find_zero_in_first_wchar)
|
||||
- test %cl, %cl
|
||||
- jnz L(prolog_find_zero_in_second_wchar)
|
||||
- test $15, %ch
|
||||
- jnz L(prolog_find_zero_in_third_wchar)
|
||||
-
|
||||
- and $1 << 13 - 1, %rax
|
||||
- jz L(return_null)
|
||||
-
|
||||
- test $15 << 4, %ah
|
||||
- jnz L(match_fourth_wchar)
|
||||
- test %ah, %ah
|
||||
- jnz L(match_third_wchar)
|
||||
- test $15 << 4, %al
|
||||
- jnz L(match_second_wchar)
|
||||
- lea -16(%rdi), %rax
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(prolog_find_zero_in_first_wchar):
|
||||
- test $1, %rax
|
||||
- jz L(return_null)
|
||||
- lea -16(%rdi), %rax
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(prolog_find_zero_in_second_wchar):
|
||||
- and $1 << 5 - 1, %rax
|
||||
- jz L(return_null)
|
||||
-
|
||||
- test $15 << 4, %al
|
||||
- jnz L(match_second_wchar)
|
||||
- lea -16(%rdi), %rax
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(prolog_find_zero_in_third_wchar):
|
||||
- and $1 << 9 - 1, %rax
|
||||
- jz L(return_null)
|
||||
-
|
||||
- test %ah, %ah
|
||||
- jnz L(match_third_wchar)
|
||||
- test $15 << 4, %al
|
||||
- jnz L(match_second_wchar)
|
||||
- lea -16(%rdi), %rax
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(match_second_wchar):
|
||||
- lea -12(%rdi), %rax
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(match_third_wchar):
|
||||
- lea -8(%rdi), %rax
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(match_fourth_wchar):
|
||||
- lea -4(%rdi), %rax
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(return_null):
|
||||
- xor %rax, %rax
|
||||
- ret
|
||||
-
|
||||
-END (wcsrchr)
|
||||
+#include "../strrchr.S"
|
||||
--
|
||||
GitLab
|
||||
|
|
@ -1,501 +0,0 @@
|
|||
From df7e295d18ffa34f629578c0017a9881af7620f6 Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Thu, 21 Apr 2022 20:52:29 -0500
|
||||
Subject: [PATCH] x86: Optimize {str|wcs}rchr-avx2
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
The new code unrolls the main loop slightly without adding too much
|
||||
overhead and minimizes the comparisons for the search CHAR.
|
||||
|
||||
Geometric Mean of all benchmarks New / Old: 0.832
|
||||
See email for all results.
|
||||
|
||||
Full xcheck passes on x86_64 with and without multiarch enabled.
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/strrchr-avx2.S | 426 +++++++++++++++---------
|
||||
1 file changed, 269 insertions(+), 157 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
|
||||
index c949410b..3d26fad4 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
|
||||
@@ -27,9 +27,13 @@
|
||||
# ifdef USE_AS_WCSRCHR
|
||||
# define VPBROADCAST vpbroadcastd
|
||||
# define VPCMPEQ vpcmpeqd
|
||||
+# define VPMIN vpminud
|
||||
+# define CHAR_SIZE 4
|
||||
# else
|
||||
# define VPBROADCAST vpbroadcastb
|
||||
# define VPCMPEQ vpcmpeqb
|
||||
+# define VPMIN vpminub
|
||||
+# define CHAR_SIZE 1
|
||||
# endif
|
||||
|
||||
# ifndef VZEROUPPER
|
||||
@@ -41,196 +45,304 @@
|
||||
# endif
|
||||
|
||||
# define VEC_SIZE 32
|
||||
+# define PAGE_SIZE 4096
|
||||
|
||||
- .section SECTION(.text),"ax",@progbits
|
||||
-ENTRY (STRRCHR)
|
||||
- movd %esi, %xmm4
|
||||
- movl %edi, %ecx
|
||||
+ .section SECTION(.text), "ax", @progbits
|
||||
+ENTRY(STRRCHR)
|
||||
+ movd %esi, %xmm7
|
||||
+ movl %edi, %eax
|
||||
/* Broadcast CHAR to YMM4. */
|
||||
- VPBROADCAST %xmm4, %ymm4
|
||||
+ VPBROADCAST %xmm7, %ymm7
|
||||
vpxor %xmm0, %xmm0, %xmm0
|
||||
|
||||
- /* Check if we may cross page boundary with one vector load. */
|
||||
- andl $(2 * VEC_SIZE - 1), %ecx
|
||||
- cmpl $VEC_SIZE, %ecx
|
||||
- ja L(cros_page_boundary)
|
||||
+ /* Shift here instead of `andl` to save code size (saves a fetch
|
||||
+ block). */
|
||||
+ sall $20, %eax
|
||||
+ cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax
|
||||
+ ja L(cross_page)
|
||||
|
||||
+L(page_cross_continue):
|
||||
vmovdqu (%rdi), %ymm1
|
||||
- VPCMPEQ %ymm1, %ymm0, %ymm2
|
||||
- VPCMPEQ %ymm1, %ymm4, %ymm3
|
||||
- vpmovmskb %ymm2, %ecx
|
||||
- vpmovmskb %ymm3, %eax
|
||||
- addq $VEC_SIZE, %rdi
|
||||
+ /* Check end of string match. */
|
||||
+ VPCMPEQ %ymm1, %ymm0, %ymm6
|
||||
+ vpmovmskb %ymm6, %ecx
|
||||
+ testl %ecx, %ecx
|
||||
+ jz L(aligned_more)
|
||||
+
|
||||
+ /* Only check match with search CHAR if needed. */
|
||||
+ VPCMPEQ %ymm1, %ymm7, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
+ /* Check if match before first zero. */
|
||||
+ blsmskl %ecx, %ecx
|
||||
+ andl %ecx, %eax
|
||||
+ jz L(ret0)
|
||||
+ bsrl %eax, %eax
|
||||
+ addq %rdi, %rax
|
||||
+ /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
|
||||
+ search CHAR is zero we are correct. Either way `andq
|
||||
+ -CHAR_SIZE, %rax` gets the correct result. */
|
||||
+# ifdef USE_AS_WCSRCHR
|
||||
+ andq $-CHAR_SIZE, %rax
|
||||
+# endif
|
||||
+L(ret0):
|
||||
+L(return_vzeroupper):
|
||||
+ ZERO_UPPER_VEC_REGISTERS_RETURN
|
||||
+
|
||||
+ /* Returns for first vec x1/x2 have hard coded backward search
|
||||
+ path for earlier matches. */
|
||||
+ .p2align 4,, 10
|
||||
+L(first_vec_x1):
|
||||
+ VPCMPEQ %ymm2, %ymm7, %ymm6
|
||||
+ vpmovmskb %ymm6, %eax
|
||||
+ blsmskl %ecx, %ecx
|
||||
+ andl %ecx, %eax
|
||||
+ jnz L(first_vec_x1_return)
|
||||
+
|
||||
+ .p2align 4,, 4
|
||||
+L(first_vec_x0_test):
|
||||
+ VPCMPEQ %ymm1, %ymm7, %ymm6
|
||||
+ vpmovmskb %ymm6, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jz L(ret1)
|
||||
+ bsrl %eax, %eax
|
||||
+ addq %r8, %rax
|
||||
+# ifdef USE_AS_WCSRCHR
|
||||
+ andq $-CHAR_SIZE, %rax
|
||||
+# endif
|
||||
+L(ret1):
|
||||
+ VZEROUPPER_RETURN
|
||||
|
||||
+ .p2align 4,, 10
|
||||
+L(first_vec_x0_x1_test):
|
||||
+ VPCMPEQ %ymm2, %ymm7, %ymm6
|
||||
+ vpmovmskb %ymm6, %eax
|
||||
+ /* Check ymm2 for search CHAR match. If no match then check ymm1
|
||||
+ before returning. */
|
||||
testl %eax, %eax
|
||||
- jnz L(first_vec)
|
||||
+ jz L(first_vec_x0_test)
|
||||
+ .p2align 4,, 4
|
||||
+L(first_vec_x1_return):
|
||||
+ bsrl %eax, %eax
|
||||
+ leaq 1(%rdi, %rax), %rax
|
||||
+# ifdef USE_AS_WCSRCHR
|
||||
+ andq $-CHAR_SIZE, %rax
|
||||
+# endif
|
||||
+ VZEROUPPER_RETURN
|
||||
|
||||
- testl %ecx, %ecx
|
||||
- jnz L(return_null)
|
||||
|
||||
- andq $-VEC_SIZE, %rdi
|
||||
- xorl %edx, %edx
|
||||
- jmp L(aligned_loop)
|
||||
+ .p2align 4,, 10
|
||||
+L(first_vec_x2):
|
||||
+ VPCMPEQ %ymm3, %ymm7, %ymm6
|
||||
+ vpmovmskb %ymm6, %eax
|
||||
+ blsmskl %ecx, %ecx
|
||||
+ /* If no in-range search CHAR match in ymm3 then need to check
|
||||
+ ymm1/ymm2 for an earlier match (we delay checking search
|
||||
+ CHAR matches until needed). */
|
||||
+ andl %ecx, %eax
|
||||
+ jz L(first_vec_x0_x1_test)
|
||||
+ bsrl %eax, %eax
|
||||
+ leaq (VEC_SIZE + 1)(%rdi, %rax), %rax
|
||||
+# ifdef USE_AS_WCSRCHR
|
||||
+ andq $-CHAR_SIZE, %rax
|
||||
+# endif
|
||||
+ VZEROUPPER_RETURN
|
||||
+
|
||||
|
||||
.p2align 4
|
||||
-L(first_vec):
|
||||
- /* Check if there is a nul CHAR. */
|
||||
+L(aligned_more):
|
||||
+ /* Save original pointer if match was in VEC 0. */
|
||||
+ movq %rdi, %r8
|
||||
+
|
||||
+ /* Align src. */
|
||||
+ orq $(VEC_SIZE - 1), %rdi
|
||||
+ vmovdqu 1(%rdi), %ymm2
|
||||
+ VPCMPEQ %ymm2, %ymm0, %ymm6
|
||||
+ vpmovmskb %ymm6, %ecx
|
||||
testl %ecx, %ecx
|
||||
- jnz L(char_and_nul_in_first_vec)
|
||||
+ jnz L(first_vec_x1)
|
||||
|
||||
- /* Remember the match and keep searching. */
|
||||
- movl %eax, %edx
|
||||
- movq %rdi, %rsi
|
||||
- andq $-VEC_SIZE, %rdi
|
||||
- jmp L(aligned_loop)
|
||||
+ vmovdqu (VEC_SIZE + 1)(%rdi), %ymm3
|
||||
+ VPCMPEQ %ymm3, %ymm0, %ymm6
|
||||
+ vpmovmskb %ymm6, %ecx
|
||||
+ testl %ecx, %ecx
|
||||
+ jnz L(first_vec_x2)
|
||||
|
||||
+ /* Save pointer again before realigning. */
|
||||
+ movq %rdi, %rsi
|
||||
+ addq $(VEC_SIZE + 1), %rdi
|
||||
+ andq $-(VEC_SIZE * 2), %rdi
|
||||
.p2align 4
|
||||
-L(cros_page_boundary):
|
||||
- andl $(VEC_SIZE - 1), %ecx
|
||||
- andq $-VEC_SIZE, %rdi
|
||||
- vmovdqa (%rdi), %ymm1
|
||||
- VPCMPEQ %ymm1, %ymm0, %ymm2
|
||||
- VPCMPEQ %ymm1, %ymm4, %ymm3
|
||||
- vpmovmskb %ymm2, %edx
|
||||
- vpmovmskb %ymm3, %eax
|
||||
- shrl %cl, %edx
|
||||
- shrl %cl, %eax
|
||||
- addq $VEC_SIZE, %rdi
|
||||
-
|
||||
- /* Check if there is a CHAR. */
|
||||
+L(first_aligned_loop):
|
||||
+ /* Do 2x VEC at a time. Any more and the cost of finding the
|
||||
+ match outweights loop benefit. */
|
||||
+ vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
|
||||
+ vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5
|
||||
+
|
||||
+ VPCMPEQ %ymm4, %ymm7, %ymm6
|
||||
+ VPMIN %ymm4, %ymm5, %ymm8
|
||||
+ VPCMPEQ %ymm5, %ymm7, %ymm10
|
||||
+ vpor %ymm6, %ymm10, %ymm5
|
||||
+ VPCMPEQ %ymm8, %ymm0, %ymm8
|
||||
+ vpor %ymm5, %ymm8, %ymm9
|
||||
+
|
||||
+ vpmovmskb %ymm9, %eax
|
||||
+ addq $(VEC_SIZE * 2), %rdi
|
||||
+ /* No zero or search CHAR. */
|
||||
testl %eax, %eax
|
||||
- jnz L(found_char)
|
||||
-
|
||||
- testl %edx, %edx
|
||||
- jnz L(return_null)
|
||||
+ jz L(first_aligned_loop)
|
||||
|
||||
- jmp L(aligned_loop)
|
||||
-
|
||||
- .p2align 4
|
||||
-L(found_char):
|
||||
- testl %edx, %edx
|
||||
- jnz L(char_and_nul)
|
||||
+ /* If no zero CHAR then go to second loop (this allows us to
|
||||
+ throw away all prior work). */
|
||||
+ vpmovmskb %ymm8, %ecx
|
||||
+ testl %ecx, %ecx
|
||||
+ jz L(second_aligned_loop_prep)
|
||||
|
||||
- /* Remember the match and keep searching. */
|
||||
- movl %eax, %edx
|
||||
- leaq (%rdi, %rcx), %rsi
|
||||
+ /* Search char could be zero so we need to get the true match.
|
||||
+ */
|
||||
+ vpmovmskb %ymm5, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(first_aligned_loop_return)
|
||||
|
||||
- .p2align 4
|
||||
-L(aligned_loop):
|
||||
- vmovdqa (%rdi), %ymm1
|
||||
- VPCMPEQ %ymm1, %ymm0, %ymm2
|
||||
- addq $VEC_SIZE, %rdi
|
||||
- VPCMPEQ %ymm1, %ymm4, %ymm3
|
||||
- vpmovmskb %ymm2, %ecx
|
||||
- vpmovmskb %ymm3, %eax
|
||||
- orl %eax, %ecx
|
||||
- jnz L(char_nor_null)
|
||||
-
|
||||
- vmovdqa (%rdi), %ymm1
|
||||
- VPCMPEQ %ymm1, %ymm0, %ymm2
|
||||
- add $VEC_SIZE, %rdi
|
||||
- VPCMPEQ %ymm1, %ymm4, %ymm3
|
||||
- vpmovmskb %ymm2, %ecx
|
||||
+ .p2align 4,, 4
|
||||
+L(first_vec_x1_or_x2):
|
||||
+ VPCMPEQ %ymm3, %ymm7, %ymm3
|
||||
+ VPCMPEQ %ymm2, %ymm7, %ymm2
|
||||
vpmovmskb %ymm3, %eax
|
||||
- orl %eax, %ecx
|
||||
- jnz L(char_nor_null)
|
||||
-
|
||||
- vmovdqa (%rdi), %ymm1
|
||||
- VPCMPEQ %ymm1, %ymm0, %ymm2
|
||||
- addq $VEC_SIZE, %rdi
|
||||
- VPCMPEQ %ymm1, %ymm4, %ymm3
|
||||
- vpmovmskb %ymm2, %ecx
|
||||
- vpmovmskb %ymm3, %eax
|
||||
- orl %eax, %ecx
|
||||
- jnz L(char_nor_null)
|
||||
-
|
||||
- vmovdqa (%rdi), %ymm1
|
||||
- VPCMPEQ %ymm1, %ymm0, %ymm2
|
||||
- addq $VEC_SIZE, %rdi
|
||||
- VPCMPEQ %ymm1, %ymm4, %ymm3
|
||||
- vpmovmskb %ymm2, %ecx
|
||||
- vpmovmskb %ymm3, %eax
|
||||
- orl %eax, %ecx
|
||||
- jz L(aligned_loop)
|
||||
-
|
||||
- .p2align 4
|
||||
-L(char_nor_null):
|
||||
- /* Find a CHAR or a nul CHAR in a loop. */
|
||||
- testl %eax, %eax
|
||||
- jnz L(match)
|
||||
-L(return_value):
|
||||
- testl %edx, %edx
|
||||
- jz L(return_null)
|
||||
- movl %edx, %eax
|
||||
- movq %rsi, %rdi
|
||||
+ vpmovmskb %ymm2, %edx
|
||||
+ /* Use add for macro-fusion. */
|
||||
+ addq %rax, %rdx
|
||||
+ jz L(first_vec_x0_test)
|
||||
+ /* NB: We could move this shift to before the branch and save a
|
||||
+ bit of code size / performance on the fall through. The
|
||||
+ branch leads to the null case which generally seems hotter
|
||||
+ than char in first 3x VEC. */
|
||||
+ salq $32, %rax
|
||||
+ addq %rdx, %rax
|
||||
+ bsrq %rax, %rax
|
||||
+ leaq 1(%rsi, %rax), %rax
|
||||
+# ifdef USE_AS_WCSRCHR
|
||||
+ andq $-CHAR_SIZE, %rax
|
||||
+# endif
|
||||
+ VZEROUPPER_RETURN
|
||||
|
||||
+ .p2align 4,, 8
|
||||
+L(first_aligned_loop_return):
|
||||
+ VPCMPEQ %ymm4, %ymm0, %ymm4
|
||||
+ vpmovmskb %ymm4, %edx
|
||||
+ salq $32, %rcx
|
||||
+ orq %rdx, %rcx
|
||||
+
|
||||
+ vpmovmskb %ymm10, %eax
|
||||
+ vpmovmskb %ymm6, %edx
|
||||
+ salq $32, %rax
|
||||
+ orq %rdx, %rax
|
||||
+ blsmskq %rcx, %rcx
|
||||
+ andq %rcx, %rax
|
||||
+ jz L(first_vec_x1_or_x2)
|
||||
+
|
||||
+ bsrq %rax, %rax
|
||||
+ leaq -(VEC_SIZE * 2)(%rdi, %rax), %rax
|
||||
# ifdef USE_AS_WCSRCHR
|
||||
- /* Keep the first bit for each matching CHAR for bsr. */
|
||||
- andl $0x11111111, %eax
|
||||
+ andq $-CHAR_SIZE, %rax
|
||||
# endif
|
||||
- bsrl %eax, %eax
|
||||
- leaq -VEC_SIZE(%rdi, %rax), %rax
|
||||
-L(return_vzeroupper):
|
||||
- ZERO_UPPER_VEC_REGISTERS_RETURN
|
||||
+ VZEROUPPER_RETURN
|
||||
|
||||
+ /* Search char cannot be zero. */
|
||||
.p2align 4
|
||||
-L(match):
|
||||
- /* Find a CHAR. Check if there is a nul CHAR. */
|
||||
- vpmovmskb %ymm2, %ecx
|
||||
- testl %ecx, %ecx
|
||||
- jnz L(find_nul)
|
||||
-
|
||||
- /* Remember the match and keep searching. */
|
||||
- movl %eax, %edx
|
||||
+L(second_aligned_loop_set_furthest_match):
|
||||
+ /* Save VEC and pointer from most recent match. */
|
||||
+L(second_aligned_loop_prep):
|
||||
movq %rdi, %rsi
|
||||
- jmp L(aligned_loop)
|
||||
+ vmovdqu %ymm6, %ymm2
|
||||
+ vmovdqu %ymm10, %ymm3
|
||||
|
||||
.p2align 4
|
||||
-L(find_nul):
|
||||
-# ifdef USE_AS_WCSRCHR
|
||||
- /* Keep the first bit for each matching CHAR for bsr. */
|
||||
- andl $0x11111111, %ecx
|
||||
- andl $0x11111111, %eax
|
||||
-# endif
|
||||
- /* Mask out any matching bits after the nul CHAR. */
|
||||
- movl %ecx, %r8d
|
||||
- subl $1, %r8d
|
||||
- xorl %ecx, %r8d
|
||||
- andl %r8d, %eax
|
||||
+L(second_aligned_loop):
|
||||
+ /* Search 2x at at time. */
|
||||
+ vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
|
||||
+ vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5
|
||||
+
|
||||
+ VPCMPEQ %ymm4, %ymm7, %ymm6
|
||||
+ VPMIN %ymm4, %ymm5, %ymm1
|
||||
+ VPCMPEQ %ymm5, %ymm7, %ymm10
|
||||
+ vpor %ymm6, %ymm10, %ymm5
|
||||
+ VPCMPEQ %ymm1, %ymm0, %ymm1
|
||||
+ vpor %ymm5, %ymm1, %ymm9
|
||||
+
|
||||
+ vpmovmskb %ymm9, %eax
|
||||
+ addq $(VEC_SIZE * 2), %rdi
|
||||
testl %eax, %eax
|
||||
- /* If there is no CHAR here, return the remembered one. */
|
||||
- jz L(return_value)
|
||||
- bsrl %eax, %eax
|
||||
- leaq -VEC_SIZE(%rdi, %rax), %rax
|
||||
- VZEROUPPER_RETURN
|
||||
-
|
||||
- .p2align 4
|
||||
-L(char_and_nul):
|
||||
- /* Find both a CHAR and a nul CHAR. */
|
||||
- addq %rcx, %rdi
|
||||
- movl %edx, %ecx
|
||||
-L(char_and_nul_in_first_vec):
|
||||
-# ifdef USE_AS_WCSRCHR
|
||||
- /* Keep the first bit for each matching CHAR for bsr. */
|
||||
- andl $0x11111111, %ecx
|
||||
- andl $0x11111111, %eax
|
||||
-# endif
|
||||
- /* Mask out any matching bits after the nul CHAR. */
|
||||
- movl %ecx, %r8d
|
||||
- subl $1, %r8d
|
||||
- xorl %ecx, %r8d
|
||||
- andl %r8d, %eax
|
||||
+ jz L(second_aligned_loop)
|
||||
+ vpmovmskb %ymm1, %ecx
|
||||
+ testl %ecx, %ecx
|
||||
+ jz L(second_aligned_loop_set_furthest_match)
|
||||
+ vpmovmskb %ymm5, %eax
|
||||
testl %eax, %eax
|
||||
- /* Return null pointer if the nul CHAR comes first. */
|
||||
- jz L(return_null)
|
||||
- bsrl %eax, %eax
|
||||
- leaq -VEC_SIZE(%rdi, %rax), %rax
|
||||
+ jnz L(return_new_match)
|
||||
+
|
||||
+ /* This is the hot patch. We know CHAR is inbounds and that
|
||||
+ ymm3/ymm2 have latest match. */
|
||||
+ .p2align 4,, 4
|
||||
+L(return_old_match):
|
||||
+ vpmovmskb %ymm3, %eax
|
||||
+ vpmovmskb %ymm2, %edx
|
||||
+ salq $32, %rax
|
||||
+ orq %rdx, %rax
|
||||
+ bsrq %rax, %rax
|
||||
+ /* Search char cannot be zero so safe to just use lea for
|
||||
+ wcsrchr. */
|
||||
+ leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
- .p2align 4
|
||||
-L(return_null):
|
||||
- xorl %eax, %eax
|
||||
+ /* Last iteration also potentially has a match. */
|
||||
+ .p2align 4,, 8
|
||||
+L(return_new_match):
|
||||
+ VPCMPEQ %ymm4, %ymm0, %ymm4
|
||||
+ vpmovmskb %ymm4, %edx
|
||||
+ salq $32, %rcx
|
||||
+ orq %rdx, %rcx
|
||||
+
|
||||
+ vpmovmskb %ymm10, %eax
|
||||
+ vpmovmskb %ymm6, %edx
|
||||
+ salq $32, %rax
|
||||
+ orq %rdx, %rax
|
||||
+ blsmskq %rcx, %rcx
|
||||
+ andq %rcx, %rax
|
||||
+ jz L(return_old_match)
|
||||
+ bsrq %rax, %rax
|
||||
+ /* Search char cannot be zero so safe to just use lea for
|
||||
+ wcsrchr. */
|
||||
+ leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
-END (STRRCHR)
|
||||
+ .p2align 4,, 4
|
||||
+L(cross_page):
|
||||
+ movq %rdi, %rsi
|
||||
+ andq $-VEC_SIZE, %rsi
|
||||
+ vmovdqu (%rsi), %ymm1
|
||||
+ VPCMPEQ %ymm1, %ymm0, %ymm6
|
||||
+ vpmovmskb %ymm6, %ecx
|
||||
+ /* Shift out zero CHAR matches that are before the begining of
|
||||
+ src (rdi). */
|
||||
+ shrxl %edi, %ecx, %ecx
|
||||
+ testl %ecx, %ecx
|
||||
+ jz L(page_cross_continue)
|
||||
+ VPCMPEQ %ymm1, %ymm7, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
+
|
||||
+ /* Shift out search CHAR matches that are before the begining of
|
||||
+ src (rdi). */
|
||||
+ shrxl %edi, %eax, %eax
|
||||
+ blsmskl %ecx, %ecx
|
||||
+ /* Check if any search CHAR match in range. */
|
||||
+ andl %ecx, %eax
|
||||
+ jz L(ret2)
|
||||
+ bsrl %eax, %eax
|
||||
+ addq %rdi, %rax
|
||||
+# ifdef USE_AS_WCSRCHR
|
||||
+ andq $-CHAR_SIZE, %rax
|
||||
+# endif
|
||||
+L(ret2):
|
||||
+ VZEROUPPER_RETURN
|
||||
+END(STRRCHR)
|
||||
#endif
|
||||
--
|
||||
GitLab
|
||||
|
|
@ -1,558 +0,0 @@
|
|||
From c966099cdc3e0fdf92f63eac09b22fa7e5f5f02d Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Thu, 21 Apr 2022 20:52:30 -0500
|
||||
Subject: [PATCH] x86: Optimize {str|wcs}rchr-evex
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
The new code unrolls the main loop slightly without adding too much
|
||||
overhead and minimizes the comparisons for the search CHAR.
|
||||
|
||||
Geometric Mean of all benchmarks New / Old: 0.755
|
||||
See email for all results.
|
||||
|
||||
Full xcheck passes on x86_64 with and without multiarch enabled.
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/strrchr-evex.S | 471 +++++++++++++++---------
|
||||
1 file changed, 290 insertions(+), 181 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
|
||||
index f920b5a5..f5b6d755 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strrchr-evex.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
|
||||
@@ -24,242 +24,351 @@
|
||||
# define STRRCHR __strrchr_evex
|
||||
# endif
|
||||
|
||||
-# define VMOVU vmovdqu64
|
||||
-# define VMOVA vmovdqa64
|
||||
+# define VMOVU vmovdqu64
|
||||
+# define VMOVA vmovdqa64
|
||||
|
||||
# ifdef USE_AS_WCSRCHR
|
||||
+# define SHIFT_REG esi
|
||||
+
|
||||
+# define kunpck kunpckbw
|
||||
+# define kmov_2x kmovd
|
||||
+# define maskz_2x ecx
|
||||
+# define maskm_2x eax
|
||||
+# define CHAR_SIZE 4
|
||||
+# define VPMIN vpminud
|
||||
+# define VPTESTN vptestnmd
|
||||
# define VPBROADCAST vpbroadcastd
|
||||
-# define VPCMP vpcmpd
|
||||
-# define SHIFT_REG r8d
|
||||
+# define VPCMP vpcmpd
|
||||
# else
|
||||
+# define SHIFT_REG edi
|
||||
+
|
||||
+# define kunpck kunpckdq
|
||||
+# define kmov_2x kmovq
|
||||
+# define maskz_2x rcx
|
||||
+# define maskm_2x rax
|
||||
+
|
||||
+# define CHAR_SIZE 1
|
||||
+# define VPMIN vpminub
|
||||
+# define VPTESTN vptestnmb
|
||||
# define VPBROADCAST vpbroadcastb
|
||||
-# define VPCMP vpcmpb
|
||||
-# define SHIFT_REG ecx
|
||||
+# define VPCMP vpcmpb
|
||||
# endif
|
||||
|
||||
# define XMMZERO xmm16
|
||||
# define YMMZERO ymm16
|
||||
# define YMMMATCH ymm17
|
||||
-# define YMM1 ymm18
|
||||
+# define YMMSAVE ymm18
|
||||
+
|
||||
+# define YMM1 ymm19
|
||||
+# define YMM2 ymm20
|
||||
+# define YMM3 ymm21
|
||||
+# define YMM4 ymm22
|
||||
+# define YMM5 ymm23
|
||||
+# define YMM6 ymm24
|
||||
+# define YMM7 ymm25
|
||||
+# define YMM8 ymm26
|
||||
|
||||
-# define VEC_SIZE 32
|
||||
|
||||
- .section .text.evex,"ax",@progbits
|
||||
-ENTRY (STRRCHR)
|
||||
- movl %edi, %ecx
|
||||
+# define VEC_SIZE 32
|
||||
+# define PAGE_SIZE 4096
|
||||
+ .section .text.evex, "ax", @progbits
|
||||
+ENTRY(STRRCHR)
|
||||
+ movl %edi, %eax
|
||||
/* Broadcast CHAR to YMMMATCH. */
|
||||
VPBROADCAST %esi, %YMMMATCH
|
||||
|
||||
- vpxorq %XMMZERO, %XMMZERO, %XMMZERO
|
||||
-
|
||||
- /* Check if we may cross page boundary with one vector load. */
|
||||
- andl $(2 * VEC_SIZE - 1), %ecx
|
||||
- cmpl $VEC_SIZE, %ecx
|
||||
- ja L(cros_page_boundary)
|
||||
+ andl $(PAGE_SIZE - 1), %eax
|
||||
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
||||
+ jg L(cross_page_boundary)
|
||||
|
||||
+L(page_cross_continue):
|
||||
VMOVU (%rdi), %YMM1
|
||||
-
|
||||
- /* Each bit in K0 represents a null byte in YMM1. */
|
||||
- VPCMP $0, %YMMZERO, %YMM1, %k0
|
||||
- /* Each bit in K1 represents a CHAR in YMM1. */
|
||||
- VPCMP $0, %YMMMATCH, %YMM1, %k1
|
||||
+ /* k0 has a 1 for each zero CHAR in YMM1. */
|
||||
+ VPTESTN %YMM1, %YMM1, %k0
|
||||
kmovd %k0, %ecx
|
||||
- kmovd %k1, %eax
|
||||
-
|
||||
- addq $VEC_SIZE, %rdi
|
||||
-
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec)
|
||||
-
|
||||
testl %ecx, %ecx
|
||||
- jnz L(return_null)
|
||||
-
|
||||
- andq $-VEC_SIZE, %rdi
|
||||
- xorl %edx, %edx
|
||||
- jmp L(aligned_loop)
|
||||
-
|
||||
- .p2align 4
|
||||
-L(first_vec):
|
||||
- /* Check if there is a null byte. */
|
||||
- testl %ecx, %ecx
|
||||
- jnz L(char_and_nul_in_first_vec)
|
||||
-
|
||||
- /* Remember the match and keep searching. */
|
||||
- movl %eax, %edx
|
||||
- movq %rdi, %rsi
|
||||
- andq $-VEC_SIZE, %rdi
|
||||
- jmp L(aligned_loop)
|
||||
-
|
||||
- .p2align 4
|
||||
-L(cros_page_boundary):
|
||||
- andl $(VEC_SIZE - 1), %ecx
|
||||
- andq $-VEC_SIZE, %rdi
|
||||
+ jz L(aligned_more)
|
||||
+ /* fallthrough: zero CHAR in first VEC. */
|
||||
|
||||
+ /* K1 has a 1 for each search CHAR match in YMM1. */
|
||||
+ VPCMP $0, %YMMMATCH, %YMM1, %k1
|
||||
+ kmovd %k1, %eax
|
||||
+ /* Build mask up until first zero CHAR (used to mask of
|
||||
+ potential search CHAR matches past the end of the string).
|
||||
+ */
|
||||
+ blsmskl %ecx, %ecx
|
||||
+ andl %ecx, %eax
|
||||
+ jz L(ret0)
|
||||
+ /* Get last match (the `andl` removed any out of bounds
|
||||
+ matches). */
|
||||
+ bsrl %eax, %eax
|
||||
# ifdef USE_AS_WCSRCHR
|
||||
- /* NB: Divide shift count by 4 since each bit in K1 represent 4
|
||||
- bytes. */
|
||||
- movl %ecx, %SHIFT_REG
|
||||
- sarl $2, %SHIFT_REG
|
||||
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
|
||||
+# else
|
||||
+ addq %rdi, %rax
|
||||
# endif
|
||||
+L(ret0):
|
||||
+ ret
|
||||
|
||||
- VMOVA (%rdi), %YMM1
|
||||
-
|
||||
- /* Each bit in K0 represents a null byte in YMM1. */
|
||||
- VPCMP $0, %YMMZERO, %YMM1, %k0
|
||||
- /* Each bit in K1 represents a CHAR in YMM1. */
|
||||
+ /* Returns for first vec x1/x2/x3 have hard coded backward
|
||||
+ search path for earlier matches. */
|
||||
+ .p2align 4,, 6
|
||||
+L(first_vec_x1):
|
||||
+ VPCMP $0, %YMMMATCH, %YMM2, %k1
|
||||
+ kmovd %k1, %eax
|
||||
+ blsmskl %ecx, %ecx
|
||||
+ /* eax non-zero if search CHAR in range. */
|
||||
+ andl %ecx, %eax
|
||||
+ jnz L(first_vec_x1_return)
|
||||
+
|
||||
+ /* fallthrough: no match in YMM2 then need to check for earlier
|
||||
+ matches (in YMM1). */
|
||||
+ .p2align 4,, 4
|
||||
+L(first_vec_x0_test):
|
||||
VPCMP $0, %YMMMATCH, %YMM1, %k1
|
||||
- kmovd %k0, %edx
|
||||
kmovd %k1, %eax
|
||||
-
|
||||
- shrxl %SHIFT_REG, %edx, %edx
|
||||
- shrxl %SHIFT_REG, %eax, %eax
|
||||
- addq $VEC_SIZE, %rdi
|
||||
-
|
||||
- /* Check if there is a CHAR. */
|
||||
testl %eax, %eax
|
||||
- jnz L(found_char)
|
||||
-
|
||||
- testl %edx, %edx
|
||||
- jnz L(return_null)
|
||||
-
|
||||
- jmp L(aligned_loop)
|
||||
-
|
||||
- .p2align 4
|
||||
-L(found_char):
|
||||
- testl %edx, %edx
|
||||
- jnz L(char_and_nul)
|
||||
-
|
||||
- /* Remember the match and keep searching. */
|
||||
- movl %eax, %edx
|
||||
- leaq (%rdi, %rcx), %rsi
|
||||
+ jz L(ret1)
|
||||
+ bsrl %eax, %eax
|
||||
+# ifdef USE_AS_WCSRCHR
|
||||
+ leaq (%rsi, %rax, CHAR_SIZE), %rax
|
||||
+# else
|
||||
+ addq %rsi, %rax
|
||||
+# endif
|
||||
+L(ret1):
|
||||
+ ret
|
||||
|
||||
- .p2align 4
|
||||
-L(aligned_loop):
|
||||
- VMOVA (%rdi), %YMM1
|
||||
- addq $VEC_SIZE, %rdi
|
||||
+ .p2align 4,, 10
|
||||
+L(first_vec_x1_or_x2):
|
||||
+ VPCMP $0, %YMM3, %YMMMATCH, %k3
|
||||
+ VPCMP $0, %YMM2, %YMMMATCH, %k2
|
||||
+ /* K2 and K3 have 1 for any search CHAR match. Test if any
|
||||
+ matches between either of them. Otherwise check YMM1. */
|
||||
+ kortestd %k2, %k3
|
||||
+ jz L(first_vec_x0_test)
|
||||
+
|
||||
+ /* Guranteed that YMM2 and YMM3 are within range so merge the
|
||||
+ two bitmasks then get last result. */
|
||||
+ kunpck %k2, %k3, %k3
|
||||
+ kmovq %k3, %rax
|
||||
+ bsrq %rax, %rax
|
||||
+ leaq (VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
|
||||
+ ret
|
||||
|
||||
- /* Each bit in K0 represents a null byte in YMM1. */
|
||||
- VPCMP $0, %YMMZERO, %YMM1, %k0
|
||||
- /* Each bit in K1 represents a CHAR in YMM1. */
|
||||
- VPCMP $0, %YMMMATCH, %YMM1, %k1
|
||||
- kmovd %k0, %ecx
|
||||
+ .p2align 4,, 6
|
||||
+L(first_vec_x3):
|
||||
+ VPCMP $0, %YMMMATCH, %YMM4, %k1
|
||||
kmovd %k1, %eax
|
||||
- orl %eax, %ecx
|
||||
- jnz L(char_nor_null)
|
||||
+ blsmskl %ecx, %ecx
|
||||
+ /* If no search CHAR match in range check YMM1/YMM2/YMM3. */
|
||||
+ andl %ecx, %eax
|
||||
+ jz L(first_vec_x1_or_x2)
|
||||
+ bsrl %eax, %eax
|
||||
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
+ ret
|
||||
|
||||
- VMOVA (%rdi), %YMM1
|
||||
- add $VEC_SIZE, %rdi
|
||||
+ .p2align 4,, 6
|
||||
+L(first_vec_x0_x1_test):
|
||||
+ VPCMP $0, %YMMMATCH, %YMM2, %k1
|
||||
+ kmovd %k1, %eax
|
||||
+ /* Check YMM2 for last match first. If no match try YMM1. */
|
||||
+ testl %eax, %eax
|
||||
+ jz L(first_vec_x0_test)
|
||||
+ .p2align 4,, 4
|
||||
+L(first_vec_x1_return):
|
||||
+ bsrl %eax, %eax
|
||||
+ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
+ ret
|
||||
|
||||
- /* Each bit in K0 represents a null byte in YMM1. */
|
||||
- VPCMP $0, %YMMZERO, %YMM1, %k0
|
||||
- /* Each bit in K1 represents a CHAR in YMM1. */
|
||||
- VPCMP $0, %YMMMATCH, %YMM1, %k1
|
||||
- kmovd %k0, %ecx
|
||||
+ .p2align 4,, 10
|
||||
+L(first_vec_x2):
|
||||
+ VPCMP $0, %YMMMATCH, %YMM3, %k1
|
||||
kmovd %k1, %eax
|
||||
- orl %eax, %ecx
|
||||
- jnz L(char_nor_null)
|
||||
+ blsmskl %ecx, %ecx
|
||||
+ /* Check YMM3 for last match first. If no match try YMM2/YMM1.
|
||||
+ */
|
||||
+ andl %ecx, %eax
|
||||
+ jz L(first_vec_x0_x1_test)
|
||||
+ bsrl %eax, %eax
|
||||
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
+ ret
|
||||
|
||||
- VMOVA (%rdi), %YMM1
|
||||
- addq $VEC_SIZE, %rdi
|
||||
|
||||
- /* Each bit in K0 represents a null byte in YMM1. */
|
||||
- VPCMP $0, %YMMZERO, %YMM1, %k0
|
||||
- /* Each bit in K1 represents a CHAR in YMM1. */
|
||||
- VPCMP $0, %YMMMATCH, %YMM1, %k1
|
||||
+ .p2align 4
|
||||
+L(aligned_more):
|
||||
+ /* Need to keep original pointer incase YMM1 has last match. */
|
||||
+ movq %rdi, %rsi
|
||||
+ andq $-VEC_SIZE, %rdi
|
||||
+ VMOVU VEC_SIZE(%rdi), %YMM2
|
||||
+ VPTESTN %YMM2, %YMM2, %k0
|
||||
kmovd %k0, %ecx
|
||||
- kmovd %k1, %eax
|
||||
- orl %eax, %ecx
|
||||
- jnz L(char_nor_null)
|
||||
+ testl %ecx, %ecx
|
||||
+ jnz L(first_vec_x1)
|
||||
|
||||
- VMOVA (%rdi), %YMM1
|
||||
- addq $VEC_SIZE, %rdi
|
||||
+ VMOVU (VEC_SIZE * 2)(%rdi), %YMM3
|
||||
+ VPTESTN %YMM3, %YMM3, %k0
|
||||
+ kmovd %k0, %ecx
|
||||
+ testl %ecx, %ecx
|
||||
+ jnz L(first_vec_x2)
|
||||
|
||||
- /* Each bit in K0 represents a null byte in YMM1. */
|
||||
- VPCMP $0, %YMMZERO, %YMM1, %k0
|
||||
- /* Each bit in K1 represents a CHAR in YMM1. */
|
||||
- VPCMP $0, %YMMMATCH, %YMM1, %k1
|
||||
+ VMOVU (VEC_SIZE * 3)(%rdi), %YMM4
|
||||
+ VPTESTN %YMM4, %YMM4, %k0
|
||||
kmovd %k0, %ecx
|
||||
- kmovd %k1, %eax
|
||||
- orl %eax, %ecx
|
||||
- jz L(aligned_loop)
|
||||
+ movq %rdi, %r8
|
||||
+ testl %ecx, %ecx
|
||||
+ jnz L(first_vec_x3)
|
||||
|
||||
+ andq $-(VEC_SIZE * 2), %rdi
|
||||
.p2align 4
|
||||
-L(char_nor_null):
|
||||
- /* Find a CHAR or a null byte in a loop. */
|
||||
+L(first_aligned_loop):
|
||||
+ /* Preserve YMM1, YMM2, YMM3, and YMM4 until we can gurantee
|
||||
+ they don't store a match. */
|
||||
+ VMOVA (VEC_SIZE * 4)(%rdi), %YMM5
|
||||
+ VMOVA (VEC_SIZE * 5)(%rdi), %YMM6
|
||||
+
|
||||
+ VPCMP $0, %YMM5, %YMMMATCH, %k2
|
||||
+ vpxord %YMM6, %YMMMATCH, %YMM7
|
||||
+
|
||||
+ VPMIN %YMM5, %YMM6, %YMM8
|
||||
+ VPMIN %YMM8, %YMM7, %YMM7
|
||||
+
|
||||
+ VPTESTN %YMM7, %YMM7, %k1
|
||||
+ subq $(VEC_SIZE * -2), %rdi
|
||||
+ kortestd %k1, %k2
|
||||
+ jz L(first_aligned_loop)
|
||||
+
|
||||
+ VPCMP $0, %YMM6, %YMMMATCH, %k3
|
||||
+ VPTESTN %YMM8, %YMM8, %k1
|
||||
+ ktestd %k1, %k1
|
||||
+ jz L(second_aligned_loop_prep)
|
||||
+
|
||||
+ kortestd %k2, %k3
|
||||
+ jnz L(return_first_aligned_loop)
|
||||
+
|
||||
+ .p2align 4,, 6
|
||||
+L(first_vec_x1_or_x2_or_x3):
|
||||
+ VPCMP $0, %YMM4, %YMMMATCH, %k4
|
||||
+ kmovd %k4, %eax
|
||||
testl %eax, %eax
|
||||
- jnz L(match)
|
||||
-L(return_value):
|
||||
- testl %edx, %edx
|
||||
- jz L(return_null)
|
||||
- movl %edx, %eax
|
||||
- movq %rsi, %rdi
|
||||
+ jz L(first_vec_x1_or_x2)
|
||||
bsrl %eax, %eax
|
||||
-# ifdef USE_AS_WCSRCHR
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- leaq -VEC_SIZE(%rdi, %rax, 4), %rax
|
||||
-# else
|
||||
- leaq -VEC_SIZE(%rdi, %rax), %rax
|
||||
-# endif
|
||||
+ leaq (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
-L(match):
|
||||
- /* Find a CHAR. Check if there is a null byte. */
|
||||
- kmovd %k0, %ecx
|
||||
- testl %ecx, %ecx
|
||||
- jnz L(find_nul)
|
||||
+ .p2align 4,, 8
|
||||
+L(return_first_aligned_loop):
|
||||
+ VPTESTN %YMM5, %YMM5, %k0
|
||||
+ kunpck %k0, %k1, %k0
|
||||
+ kmov_2x %k0, %maskz_2x
|
||||
+
|
||||
+ blsmsk %maskz_2x, %maskz_2x
|
||||
+ kunpck %k2, %k3, %k3
|
||||
+ kmov_2x %k3, %maskm_2x
|
||||
+ and %maskz_2x, %maskm_2x
|
||||
+ jz L(first_vec_x1_or_x2_or_x3)
|
||||
|
||||
- /* Remember the match and keep searching. */
|
||||
- movl %eax, %edx
|
||||
+ bsr %maskm_2x, %maskm_2x
|
||||
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+ /* We can throw away the work done for the first 4x checks here
|
||||
+ as we have a later match. This is the 'fast' path persay.
|
||||
+ */
|
||||
+L(second_aligned_loop_prep):
|
||||
+L(second_aligned_loop_set_furthest_match):
|
||||
movq %rdi, %rsi
|
||||
- jmp L(aligned_loop)
|
||||
+ kunpck %k2, %k3, %k4
|
||||
|
||||
.p2align 4
|
||||
-L(find_nul):
|
||||
- /* Mask out any matching bits after the null byte. */
|
||||
- movl %ecx, %r8d
|
||||
- subl $1, %r8d
|
||||
- xorl %ecx, %r8d
|
||||
- andl %r8d, %eax
|
||||
- testl %eax, %eax
|
||||
- /* If there is no CHAR here, return the remembered one. */
|
||||
- jz L(return_value)
|
||||
- bsrl %eax, %eax
|
||||
+L(second_aligned_loop):
|
||||
+ VMOVU (VEC_SIZE * 4)(%rdi), %YMM1
|
||||
+ VMOVU (VEC_SIZE * 5)(%rdi), %YMM2
|
||||
+
|
||||
+ VPCMP $0, %YMM1, %YMMMATCH, %k2
|
||||
+ vpxord %YMM2, %YMMMATCH, %YMM3
|
||||
+
|
||||
+ VPMIN %YMM1, %YMM2, %YMM4
|
||||
+ VPMIN %YMM3, %YMM4, %YMM3
|
||||
+
|
||||
+ VPTESTN %YMM3, %YMM3, %k1
|
||||
+ subq $(VEC_SIZE * -2), %rdi
|
||||
+ kortestd %k1, %k2
|
||||
+ jz L(second_aligned_loop)
|
||||
+
|
||||
+ VPCMP $0, %YMM2, %YMMMATCH, %k3
|
||||
+ VPTESTN %YMM4, %YMM4, %k1
|
||||
+ ktestd %k1, %k1
|
||||
+ jz L(second_aligned_loop_set_furthest_match)
|
||||
+
|
||||
+ kortestd %k2, %k3
|
||||
+ /* branch here because there is a significant advantage interms
|
||||
+ of output dependency chance in using edx. */
|
||||
+ jnz L(return_new_match)
|
||||
+L(return_old_match):
|
||||
+ kmovq %k4, %rax
|
||||
+ bsrq %rax, %rax
|
||||
+ leaq (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
|
||||
+ ret
|
||||
+
|
||||
+L(return_new_match):
|
||||
+ VPTESTN %YMM1, %YMM1, %k0
|
||||
+ kunpck %k0, %k1, %k0
|
||||
+ kmov_2x %k0, %maskz_2x
|
||||
+
|
||||
+ blsmsk %maskz_2x, %maskz_2x
|
||||
+ kunpck %k2, %k3, %k3
|
||||
+ kmov_2x %k3, %maskm_2x
|
||||
+ and %maskz_2x, %maskm_2x
|
||||
+ jz L(return_old_match)
|
||||
+
|
||||
+ bsr %maskm_2x, %maskm_2x
|
||||
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
+ ret
|
||||
+
|
||||
+L(cross_page_boundary):
|
||||
+ /* eax contains all the page offset bits of src (rdi). `xor rdi,
|
||||
+ rax` sets pointer will all page offset bits cleared so
|
||||
+ offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
|
||||
+ before page cross (guranteed to be safe to read). Doing this
|
||||
+ as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
|
||||
+ a bit of code size. */
|
||||
+ xorq %rdi, %rax
|
||||
+ VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
|
||||
+ VPTESTN %YMM1, %YMM1, %k0
|
||||
+ kmovd %k0, %ecx
|
||||
+
|
||||
+ /* Shift out zero CHAR matches that are before the begining of
|
||||
+ src (rdi). */
|
||||
# ifdef USE_AS_WCSRCHR
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- leaq -VEC_SIZE(%rdi, %rax, 4), %rax
|
||||
-# else
|
||||
- leaq -VEC_SIZE(%rdi, %rax), %rax
|
||||
+ movl %edi, %esi
|
||||
+ andl $(VEC_SIZE - 1), %esi
|
||||
+ shrl $2, %esi
|
||||
# endif
|
||||
- ret
|
||||
+ shrxl %SHIFT_REG, %ecx, %ecx
|
||||
|
||||
- .p2align 4
|
||||
-L(char_and_nul):
|
||||
- /* Find both a CHAR and a null byte. */
|
||||
- addq %rcx, %rdi
|
||||
- movl %edx, %ecx
|
||||
-L(char_and_nul_in_first_vec):
|
||||
- /* Mask out any matching bits after the null byte. */
|
||||
- movl %ecx, %r8d
|
||||
- subl $1, %r8d
|
||||
- xorl %ecx, %r8d
|
||||
- andl %r8d, %eax
|
||||
- testl %eax, %eax
|
||||
- /* Return null pointer if the null byte comes first. */
|
||||
- jz L(return_null)
|
||||
+ testl %ecx, %ecx
|
||||
+ jz L(page_cross_continue)
|
||||
+
|
||||
+ /* Found zero CHAR so need to test for search CHAR. */
|
||||
+ VPCMP $0, %YMMMATCH, %YMM1, %k1
|
||||
+ kmovd %k1, %eax
|
||||
+ /* Shift out search CHAR matches that are before the begining of
|
||||
+ src (rdi). */
|
||||
+ shrxl %SHIFT_REG, %eax, %eax
|
||||
+
|
||||
+ /* Check if any search CHAR match in range. */
|
||||
+ blsmskl %ecx, %ecx
|
||||
+ andl %ecx, %eax
|
||||
+ jz L(ret3)
|
||||
bsrl %eax, %eax
|
||||
# ifdef USE_AS_WCSRCHR
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- leaq -VEC_SIZE(%rdi, %rax, 4), %rax
|
||||
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
|
||||
# else
|
||||
- leaq -VEC_SIZE(%rdi, %rax), %rax
|
||||
+ addq %rdi, %rax
|
||||
# endif
|
||||
+L(ret3):
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
-L(return_null):
|
||||
- xorl %eax, %eax
|
||||
- ret
|
||||
-
|
||||
-END (STRRCHR)
|
||||
+END(STRRCHR)
|
||||
#endif
|
||||
--
|
||||
GitLab
|
||||
|
|
@ -1,73 +0,0 @@
|
|||
From 911c63a51c690dd1a97dfc587097277029baf00f Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Wed, 27 Apr 2022 15:13:02 -0500
|
||||
Subject: [PATCH] sysdeps: Add 'get_fast_jitter' interace in fast-jitter.h
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
'get_fast_jitter' is meant to be used purely for performance
|
||||
purposes. In all cases it's used it should be acceptable to get no
|
||||
randomness (see default case). An example use case is in setting
|
||||
jitter for retries between threads at a lock. There is a
|
||||
performance benefit to having jitter, but only if the jitter can
|
||||
be generated very quickly and ultimately there is no serious issue
|
||||
if no jitter is generated.
|
||||
|
||||
The implementation generally uses 'HP_TIMING_NOW' iff it is
|
||||
inlined (avoid any potential syscall paths).
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/generic/fast-jitter.h | 42 +++++++++++++++++++++++++++++++++++
|
||||
1 file changed, 42 insertions(+)
|
||||
create mode 100644 sysdeps/generic/fast-jitter.h
|
||||
|
||||
diff --git a/sysdeps/generic/fast-jitter.h b/sysdeps/generic/fast-jitter.h
|
||||
new file mode 100644
|
||||
index 00000000..4dd53e34
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/generic/fast-jitter.h
|
||||
@@ -0,0 +1,42 @@
|
||||
+/* Fallback for fast jitter just return 0.
|
||||
+ Copyright (C) 2019-2022 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#ifndef _FAST_JITTER_H
|
||||
+# define _FAST_JITTER_H
|
||||
+
|
||||
+# include <stdint.h>
|
||||
+# include <hp-timing.h>
|
||||
+
|
||||
+/* Baseline just return 0. We could create jitter using a clock or
|
||||
+ 'random_bits' but that may imply a syscall and the goal of
|
||||
+ 'get_fast_jitter' is minimal overhead "randomness" when such
|
||||
+ randomness helps performance. Adding high overhead the function
|
||||
+ defeats the purpose. */
|
||||
+static inline uint32_t
|
||||
+get_fast_jitter (void)
|
||||
+{
|
||||
+# if HP_TIMING_INLINE
|
||||
+ hp_timing_t jitter;
|
||||
+ HP_TIMING_NOW (jitter);
|
||||
+ return (uint32_t) jitter;
|
||||
+# else
|
||||
+ return 0;
|
||||
+# endif
|
||||
+}
|
||||
+
|
||||
+#endif
|
||||
--
|
||||
GitLab
|
||||
|
|
@ -1,226 +0,0 @@
|
|||
From 8162147872491bb5b48e91543b19c49a29ae6b6d Mon Sep 17 00:00:00 2001
|
||||
From: Wangyang Guo <wangyang.guo@intel.com>
|
||||
Date: Fri, 6 May 2022 01:50:10 +0000
|
||||
Subject: [PATCH] nptl: Add backoff mechanism to spinlock loop
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
When mutiple threads waiting for lock at the same time, once lock owner
|
||||
releases the lock, waiters will see lock available and all try to lock,
|
||||
which may cause an expensive CAS storm.
|
||||
|
||||
Binary exponential backoff with random jitter is introduced. As try-lock
|
||||
attempt increases, there is more likely that a larger number threads
|
||||
compete for adaptive mutex lock, so increase wait time in exponential.
|
||||
A random jitter is also added to avoid synchronous try-lock from other
|
||||
threads.
|
||||
|
||||
v2: Remove read-check before try-lock for performance.
|
||||
|
||||
v3:
|
||||
1. Restore read-check since it works well in some platform.
|
||||
2. Make backoff arch dependent, and enable it for x86_64.
|
||||
3. Limit max backoff to reduce latency in large critical section.
|
||||
|
||||
v4: Fix strict-prototypes error in sysdeps/nptl/pthread_mutex_backoff.h
|
||||
|
||||
v5: Commit log updated for regression in large critical section.
|
||||
|
||||
Result of pthread-mutex-locks bench
|
||||
|
||||
Test Platform: Xeon 8280L (2 socket, 112 CPUs in total)
|
||||
First Row: thread number
|
||||
First Col: critical section length
|
||||
Values: backoff vs upstream, time based, low is better
|
||||
|
||||
non-critical-length: 1
|
||||
1 2 4 8 16 32 64 112 140
|
||||
0 0.99 0.58 0.52 0.49 0.43 0.44 0.46 0.52 0.54
|
||||
1 0.98 0.43 0.56 0.50 0.44 0.45 0.50 0.56 0.57
|
||||
2 0.99 0.41 0.57 0.51 0.45 0.47 0.48 0.60 0.61
|
||||
4 0.99 0.45 0.59 0.53 0.48 0.49 0.52 0.64 0.65
|
||||
8 1.00 0.66 0.71 0.63 0.56 0.59 0.66 0.72 0.71
|
||||
16 0.97 0.78 0.91 0.73 0.67 0.70 0.79 0.80 0.80
|
||||
32 0.95 1.17 0.98 0.87 0.82 0.86 0.89 0.90 0.90
|
||||
64 0.96 0.95 1.01 1.01 0.98 1.00 1.03 0.99 0.99
|
||||
128 0.99 1.01 1.01 1.17 1.08 1.12 1.02 0.97 1.02
|
||||
|
||||
non-critical-length: 32
|
||||
1 2 4 8 16 32 64 112 140
|
||||
0 1.03 0.97 0.75 0.65 0.58 0.58 0.56 0.70 0.70
|
||||
1 0.94 0.95 0.76 0.65 0.58 0.58 0.61 0.71 0.72
|
||||
2 0.97 0.96 0.77 0.66 0.58 0.59 0.62 0.74 0.74
|
||||
4 0.99 0.96 0.78 0.66 0.60 0.61 0.66 0.76 0.77
|
||||
8 0.99 0.99 0.84 0.70 0.64 0.66 0.71 0.80 0.80
|
||||
16 0.98 0.97 0.95 0.76 0.70 0.73 0.81 0.85 0.84
|
||||
32 1.04 1.12 1.04 0.89 0.82 0.86 0.93 0.91 0.91
|
||||
64 0.99 1.15 1.07 1.00 0.99 1.01 1.05 0.99 0.99
|
||||
128 1.00 1.21 1.20 1.22 1.25 1.31 1.12 1.10 0.99
|
||||
|
||||
non-critical-length: 128
|
||||
1 2 4 8 16 32 64 112 140
|
||||
0 1.02 1.00 0.99 0.67 0.61 0.61 0.61 0.74 0.73
|
||||
1 0.95 0.99 1.00 0.68 0.61 0.60 0.60 0.74 0.74
|
||||
2 1.00 1.04 1.00 0.68 0.59 0.61 0.65 0.76 0.76
|
||||
4 1.00 0.96 0.98 0.70 0.63 0.63 0.67 0.78 0.77
|
||||
8 1.01 1.02 0.89 0.73 0.65 0.67 0.71 0.81 0.80
|
||||
16 0.99 0.96 0.96 0.79 0.71 0.73 0.80 0.84 0.84
|
||||
32 0.99 0.95 1.05 0.89 0.84 0.85 0.94 0.92 0.91
|
||||
64 1.00 0.99 1.16 1.04 1.00 1.02 1.06 0.99 0.99
|
||||
128 1.00 1.06 0.98 1.14 1.39 1.26 1.08 1.02 0.98
|
||||
|
||||
There is regression in large critical section. But adaptive mutex is
|
||||
aimed for "quick" locks. Small critical section is more common when
|
||||
users choose to use adaptive pthread_mutex.
|
||||
|
||||
Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
|
||||
Conflicts:
|
||||
pthreadP.h
|
||||
(had been moved)
|
||||
nptl/pthread_mutex_lock.c
|
||||
(max_adaptive_count renamed)
|
||||
|
||||
---
|
||||
nptl/pthreadP.h | 1 +
|
||||
nptl/pthread_mutex_lock.c | 16 +++++++--
|
||||
sysdeps/nptl/pthread_mutex_backoff.h | 35 ++++++++++++++++++
|
||||
sysdeps/x86_64/nptl/pthread_mutex_backoff.h | 39 +++++++++++++++++++++
|
||||
4 files changed, 89 insertions(+), 2 deletions(-)
|
||||
create mode 100644 sysdeps/nptl/pthread_mutex_backoff.h
|
||||
create mode 100644 sysdeps/x86_64/nptl/pthread_mutex_backoff.h
|
||||
|
||||
diff --git a/nptl/pthreadP.h b/nptl/pthreadP.h
|
||||
index 7ddc166c..1550e3b6 100644
|
||||
--- a/nptl/pthreadP.h
|
||||
+++ b/nptl/pthreadP.h
|
||||
@@ -33,6 +33,7 @@
|
||||
#include <kernel-features.h>
|
||||
#include <errno.h>
|
||||
#include <internal-signals.h>
|
||||
+#include <pthread_mutex_backoff.h>
|
||||
|
||||
|
||||
/* Atomic operations on TLS memory. */
|
||||
diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
|
||||
index d96a9933..c7770fc9 100644
|
||||
--- a/nptl/pthread_mutex_lock.c
|
||||
+++ b/nptl/pthread_mutex_lock.c
|
||||
@@ -133,14 +133,26 @@ __pthread_mutex_lock (pthread_mutex_t *mutex)
|
||||
int cnt = 0;
|
||||
int max_cnt = MIN (MAX_ADAPTIVE_COUNT,
|
||||
mutex->__data.__spins * 2 + 10);
|
||||
+ int spin_count, exp_backoff = 1;
|
||||
+ unsigned int jitter = get_jitter ();
|
||||
do
|
||||
{
|
||||
- if (cnt++ >= max_cnt)
|
||||
+ /* In each loop, spin count is exponential backoff plus
|
||||
+ random jitter, random range is [0, exp_backoff-1]. */
|
||||
+ spin_count = exp_backoff + (jitter & (exp_backoff - 1));
|
||||
+ cnt += spin_count;
|
||||
+ if (cnt >= max_cnt)
|
||||
{
|
||||
+ /* If cnt exceeds max spin count, just go to wait
|
||||
+ queue. */
|
||||
LLL_MUTEX_LOCK (mutex);
|
||||
break;
|
||||
}
|
||||
- atomic_spin_nop ();
|
||||
+ do
|
||||
+ atomic_spin_nop ();
|
||||
+ while (--spin_count > 0);
|
||||
+ /* Prepare for next loop. */
|
||||
+ exp_backoff = get_next_backoff (exp_backoff);
|
||||
}
|
||||
while (LLL_MUTEX_READ_LOCK (mutex) != 0
|
||||
|| LLL_MUTEX_TRYLOCK (mutex) != 0);
|
||||
diff --git a/sysdeps/nptl/pthread_mutex_backoff.h b/sysdeps/nptl/pthread_mutex_backoff.h
|
||||
new file mode 100644
|
||||
index 00000000..5b26c22a
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/nptl/pthread_mutex_backoff.h
|
||||
@@ -0,0 +1,35 @@
|
||||
+/* Pthread mutex backoff configuration.
|
||||
+ Copyright (C) 2022 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+#ifndef _PTHREAD_MUTEX_BACKOFF_H
|
||||
+#define _PTHREAD_MUTEX_BACKOFF_H 1
|
||||
+
|
||||
+static inline unsigned int
|
||||
+get_jitter (void)
|
||||
+{
|
||||
+ /* Arch dependent random jitter, return 0 disables random. */
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+static inline int
|
||||
+get_next_backoff (int backoff)
|
||||
+{
|
||||
+ /* Next backoff, return 1 disables mutex backoff. */
|
||||
+ return 1;
|
||||
+}
|
||||
+
|
||||
+#endif
|
||||
diff --git a/sysdeps/x86_64/nptl/pthread_mutex_backoff.h b/sysdeps/x86_64/nptl/pthread_mutex_backoff.h
|
||||
new file mode 100644
|
||||
index 00000000..ec74c3d9
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/nptl/pthread_mutex_backoff.h
|
||||
@@ -0,0 +1,39 @@
|
||||
+/* Pthread mutex backoff configuration.
|
||||
+ Copyright (C) 2022 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+#ifndef _PTHREAD_MUTEX_BACKOFF_H
|
||||
+#define _PTHREAD_MUTEX_BACKOFF_H 1
|
||||
+
|
||||
+#include <fast-jitter.h>
|
||||
+
|
||||
+static inline unsigned int
|
||||
+get_jitter (void)
|
||||
+{
|
||||
+ return get_fast_jitter ();
|
||||
+}
|
||||
+
|
||||
+#define MAX_BACKOFF 16
|
||||
+
|
||||
+static inline int
|
||||
+get_next_backoff (int backoff)
|
||||
+{
|
||||
+ /* Binary expontial backoff. Limiting max backoff
|
||||
+ can reduce latency in large critical section. */
|
||||
+ return (backoff < MAX_BACKOFF) ? backoff << 1 : backoff;
|
||||
+}
|
||||
+
|
||||
+#endif
|
||||
--
|
||||
GitLab
|
||||
|
|
@ -1,55 +0,0 @@
|
|||
From c6272098323153db373f2986c67786ea8c85f1cf Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Tue, 15 Feb 2022 08:18:15 -0600
|
||||
Subject: [PATCH] x86: Fallback {str|wcs}cmp RTM in the ncmp overflow case [BZ
|
||||
#28896]
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would
|
||||
call strcmp-avx2 and wcscmp-avx2 respectively. This would have
|
||||
not checks around vzeroupper and would trigger spurious
|
||||
aborts. This commit fixes that.
|
||||
|
||||
test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on
|
||||
AVX2 machines with and without RTM.
|
||||
|
||||
Co-authored-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/strcmp-avx2.S | 8 ++------
|
||||
1 file changed, 2 insertions(+), 6 deletions(-)
|
||||
|
||||
Conflicts:
|
||||
sysdeps/x86_64/multiarch/strcmp-avx2.S
|
||||
(split into two patches due to upstream bug differences)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
||||
index 28cc98b6..e267c6cb 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
||||
@@ -345,10 +345,10 @@ L(one_or_less):
|
||||
movq %LOCALE_REG, %rdx
|
||||
# endif
|
||||
jb L(ret_zero)
|
||||
-# ifdef USE_AS_WCSCMP
|
||||
/* 'nbe' covers the case where length is negative (large
|
||||
unsigned). */
|
||||
- jnbe __wcscmp_avx2
|
||||
+ jnbe OVERFLOW_STRCMP
|
||||
+# ifdef USE_AS_WCSCMP
|
||||
movl (%rdi), %edx
|
||||
xorl %eax, %eax
|
||||
cmpl (%rsi), %edx
|
||||
@@ -357,10 +357,6 @@ L(one_or_less):
|
||||
negl %eax
|
||||
orl $1, %eax
|
||||
# else
|
||||
- /* 'nbe' covers the case where length is negative (large
|
||||
- unsigned). */
|
||||
-
|
||||
- jnbe __strcmp_avx2
|
||||
movzbl (%rdi), %eax
|
||||
movzbl (%rsi), %ecx
|
||||
TOLOWER_gpr (%rax, %eax)
|
||||
--
|
||||
GitLab
|
||||
|
|
@ -1,60 +0,0 @@
|
|||
From 259a17cc98058d2576511201f85d28cb5d9de2a2 Mon Sep 17 00:00:00 2001
|
||||
From: Stefan Liebler <stli@linux.ibm.com>
|
||||
Date: Mon, 28 Jun 2021 13:01:07 +0200
|
||||
Subject: s390x: Update math: redirect roundeven function
|
||||
|
||||
After recent commit
|
||||
447954a206837b5f153869cfeeeab44631c3fac9
|
||||
"math: redirect roundeven function", building on
|
||||
s390x fails with:
|
||||
Error: symbol `__roundevenl' is already defined
|
||||
|
||||
Similar to aarch64/riscv fix, this patch redirects target
|
||||
specific functions for s390x:
|
||||
commit 3213ed770cbc5821920d16caa93c85e92dd7b9f6
|
||||
"Update math: redirect roundeven function"
|
||||
|
||||
diff --git a/sysdeps/s390/fpu/s_roundeven.c b/sysdeps/s390/fpu/s_roundeven.c
|
||||
index 40b07e054b..0773adfed0 100644
|
||||
--- a/sysdeps/s390/fpu/s_roundeven.c
|
||||
+++ b/sysdeps/s390/fpu/s_roundeven.c
|
||||
@@ -18,6 +18,7 @@
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT
|
||||
+# define NO_MATH_REDIRECT
|
||||
# include <math.h>
|
||||
# include <libm-alias-double.h>
|
||||
|
||||
@@ -31,7 +32,6 @@ __roundeven (double x)
|
||||
__asm__ ("fidbra %0,4,%1,4" : "=f" (y) : "f" (x));
|
||||
return y;
|
||||
}
|
||||
-hidden_def (__roundeven)
|
||||
libm_alias_double (__roundeven, roundeven)
|
||||
|
||||
#else
|
||||
diff --git a/sysdeps/s390/fpu/s_roundevenf.c b/sysdeps/s390/fpu/s_roundevenf.c
|
||||
index d2fbf3d2b6..289785bc4a 100644
|
||||
--- a/sysdeps/s390/fpu/s_roundevenf.c
|
||||
+++ b/sysdeps/s390/fpu/s_roundevenf.c
|
||||
@@ -18,6 +18,7 @@
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT
|
||||
+# define NO_MATH_REDIRECT
|
||||
# include <math.h>
|
||||
# include <libm-alias-float.h>
|
||||
|
||||
diff --git a/sysdeps/s390/fpu/s_roundevenl.c b/sysdeps/s390/fpu/s_roundevenl.c
|
||||
index 29ab7a8616..94b6459ab4 100644
|
||||
--- a/sysdeps/s390/fpu/s_roundevenl.c
|
||||
+++ b/sysdeps/s390/fpu/s_roundevenl.c
|
||||
@@ -18,6 +18,7 @@
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT
|
||||
+# define NO_MATH_REDIRECT
|
||||
# include <math.h>
|
||||
# include <math_private.h>
|
||||
# include <libm-alias-ldouble.h>
|
|
@ -1,74 +0,0 @@
|
|||
From 1da50d4bda07f04135dca39f40e79fc9eabed1f8 Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Fri, 26 Feb 2021 05:36:59 -0800
|
||||
Subject: [PATCH] x86: Set Prefer_No_VZEROUPPER and add Prefer_AVX2_STRCMP
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
1. Set Prefer_No_VZEROUPPER if RTM is usable to avoid RTM abort triggered
|
||||
by VZEROUPPER inside a transactionally executing RTM region.
|
||||
2. Since to compare 2 32-byte strings, 256-bit EVEX strcmp requires 2
|
||||
loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp requires 1 load, 2 VPCMPEQs,
|
||||
1 VPMINU and 1 VPMOVMSKB, AVX2 strcmp is faster than EVEX strcmp. Add
|
||||
Prefer_AVX2_STRCMP to prefer AVX2 strcmp family functions.
|
||||
---
|
||||
sysdeps/x86/cpu-features.c | 20 +++++++++++++++++--
|
||||
sysdeps/x86/cpu-tunables.c | 2 ++
|
||||
...cpu-features-preferred_feature_index_1.def | 1 +
|
||||
3 files changed, 21 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
|
||||
index 91042505..3610ee5c 100644
|
||||
--- a/sysdeps/x86/cpu-features.c
|
||||
+++ b/sysdeps/x86/cpu-features.c
|
||||
@@ -524,8 +524,24 @@ init_cpu_features (struct cpu_features *cpu_features)
|
||||
cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER]
|
||||
|= bit_arch_Prefer_No_VZEROUPPER;
|
||||
else
|
||||
- cpu_features->preferred[index_arch_Prefer_No_AVX512]
|
||||
- |= bit_arch_Prefer_No_AVX512;
|
||||
+ {
|
||||
+ cpu_features->preferred[index_arch_Prefer_No_AVX512]
|
||||
+ |= bit_arch_Prefer_No_AVX512;
|
||||
+
|
||||
+ /* Avoid RTM abort triggered by VZEROUPPER inside a
|
||||
+ transactionally executing RTM region. */
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
|
||||
+ cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER]
|
||||
+ |= bit_arch_Prefer_No_VZEROUPPER;
|
||||
+
|
||||
+ /* Since to compare 2 32-byte strings, 256-bit EVEX strcmp
|
||||
+ requires 2 loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp
|
||||
+ requires 1 load, 2 VPCMPEQs, 1 VPMINU and 1 VPMOVMSKB,
|
||||
+ AVX2 strcmp is faster than EVEX strcmp. */
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
|
||||
+ cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP]
|
||||
+ |= bit_arch_Prefer_AVX2_STRCMP;
|
||||
+ }
|
||||
}
|
||||
/* This spells out "AuthenticAMD". */
|
||||
else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
|
||||
diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
|
||||
index 3173b2b9..73adbaba 100644
|
||||
--- a/sysdeps/x86/cpu-tunables.c
|
||||
+++ b/sysdeps/x86/cpu-tunables.c
|
||||
@@ -239,6 +239,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
|
||||
CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
|
||||
Fast_Copy_Backward,
|
||||
disable, 18);
|
||||
+ CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH
|
||||
+ (n, cpu_features, Prefer_AVX2_STRCMP, AVX2, disable, 18);
|
||||
}
|
||||
break;
|
||||
case 19:
|
||||
diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
|
||||
index 17a5cc42..4ca70b40 100644
|
||||
--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
|
||||
+++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
|
||||
@@ -32,3 +32,4 @@ BIT (Prefer_ERMS)
|
||||
BIT (Prefer_FSRM)
|
||||
BIT (Prefer_No_AVX512)
|
||||
BIT (MathVec_Prefer_No_AVX512)
|
||||
+BIT (Prefer_AVX2_STRCMP)
|
||||
--
|
||||
GitLab
|
||||
|
|
@ -1,26 +0,0 @@
|
|||
From 3213ed770cbc5821920d16caa93c85e92dd7b9f6 Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Wed, 23 Jun 2021 13:29:41 -0700
|
||||
Subject: Update math: redirect roundeven function
|
||||
|
||||
Redirect target specific roundeven functions for aarch64, ldbl-128ibm
|
||||
and riscv.
|
||||
|
||||
Conflicts:
|
||||
sysdeps/aarch64/*
|
||||
(not needed)
|
||||
sysdeps/riscv/*
|
||||
(not supported)
|
||||
|
||||
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c b/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c
|
||||
index 6701970f4a..90eecf496b 100644
|
||||
--- a/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c
|
||||
+++ b/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c
|
||||
@@ -17,6 +17,7 @@
|
||||
License along with the GNU C Library; if not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
+#define NO_MATH_REDIRECT
|
||||
#include <math.h>
|
||||
#include <math_private.h>
|
||||
|
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
|
@ -1,242 +0,0 @@
|
|||
From 63ad43566f7a25d140dc723598aeb441ad657eed Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Fri, 5 Mar 2021 06:46:08 -0800
|
||||
Subject: [PATCH] x86-64: Add memmove family functions with 256-bit EVEX
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Update ifunc-memmove.h to select the function optimized with 256-bit EVEX
|
||||
instructions using YMM16-YMM31 registers to avoid RTM abort with usable
|
||||
AVX512VL since VZEROUPPER isn't needed at function exit.
|
||||
---
|
||||
sysdeps/x86_64/multiarch/Makefile | 1 +
|
||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 36 +++++++++++++++++++
|
||||
sysdeps/x86_64/multiarch/ifunc-memmove.h | 21 +++++++++--
|
||||
.../multiarch/memmove-evex-unaligned-erms.S | 33 +++++++++++++++++
|
||||
.../multiarch/memmove-vec-unaligned-erms.S | 24 ++++++++-----
|
||||
5 files changed, 104 insertions(+), 11 deletions(-)
|
||||
create mode 100644 sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
|
||||
index 46783cd1..4563fc56 100644
|
||||
--- a/sysdeps/x86_64/multiarch/Makefile
|
||||
+++ b/sysdeps/x86_64/multiarch/Makefile
|
||||
@@ -41,6 +41,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
|
||||
memset-avx2-unaligned-erms \
|
||||
memset-avx512-unaligned-erms \
|
||||
memchr-evex \
|
||||
+ memmove-evex-unaligned-erms \
|
||||
memrchr-evex \
|
||||
rawmemchr-evex \
|
||||
stpcpy-evex \
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
index 082e4da3..6bd3abfc 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
@@ -80,6 +80,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
IFUNC_IMPL_ADD (array, i, __memmove_chk,
|
||||
CPU_FEATURE_USABLE (AVX),
|
||||
__memmove_chk_avx_unaligned_erms)
|
||||
+ IFUNC_IMPL_ADD (array, i, __memmove_chk,
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
+ __memmove_chk_evex_unaligned)
|
||||
+ IFUNC_IMPL_ADD (array, i, __memmove_chk,
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
+ __memmove_chk_evex_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, __memmove_chk,
|
||||
CPU_FEATURE_USABLE (SSSE3),
|
||||
__memmove_chk_ssse3_back)
|
||||
@@ -102,6 +108,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
IFUNC_IMPL_ADD (array, i, memmove,
|
||||
CPU_FEATURE_USABLE (AVX),
|
||||
__memmove_avx_unaligned_erms)
|
||||
+ IFUNC_IMPL_ADD (array, i, memmove,
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
+ __memmove_evex_unaligned)
|
||||
+ IFUNC_IMPL_ADD (array, i, memmove,
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
+ __memmove_evex_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, memmove,
|
||||
CPU_FEATURE_USABLE (AVX512F),
|
||||
__memmove_avx512_no_vzeroupper)
|
||||
@@ -565,6 +577,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
|
||||
CPU_FEATURE_USABLE (AVX),
|
||||
__memcpy_chk_avx_unaligned_erms)
|
||||
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
+ __memcpy_chk_evex_unaligned)
|
||||
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
+ __memcpy_chk_evex_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
|
||||
CPU_FEATURE_USABLE (SSSE3),
|
||||
__memcpy_chk_ssse3_back)
|
||||
@@ -587,6 +605,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
IFUNC_IMPL_ADD (array, i, memcpy,
|
||||
CPU_FEATURE_USABLE (AVX),
|
||||
__memcpy_avx_unaligned_erms)
|
||||
+ IFUNC_IMPL_ADD (array, i, memcpy,
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
+ __memcpy_evex_unaligned)
|
||||
+ IFUNC_IMPL_ADD (array, i, memcpy,
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
+ __memcpy_evex_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
|
||||
__memcpy_ssse3_back)
|
||||
IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
|
||||
@@ -623,6 +647,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
|
||||
CPU_FEATURE_USABLE (AVX),
|
||||
__mempcpy_chk_avx_unaligned_erms)
|
||||
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
+ __mempcpy_chk_evex_unaligned)
|
||||
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
+ __mempcpy_chk_evex_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
|
||||
CPU_FEATURE_USABLE (SSSE3),
|
||||
__mempcpy_chk_ssse3_back)
|
||||
@@ -654,6 +684,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
IFUNC_IMPL_ADD (array, i, mempcpy,
|
||||
CPU_FEATURE_USABLE (AVX),
|
||||
__mempcpy_avx_unaligned_erms)
|
||||
+ IFUNC_IMPL_ADD (array, i, mempcpy,
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
+ __mempcpy_evex_unaligned)
|
||||
+ IFUNC_IMPL_ADD (array, i, mempcpy,
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
+ __mempcpy_evex_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
|
||||
__mempcpy_ssse3_back)
|
||||
IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
|
||||
index 5e5f0299..6f8bce5f 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
|
||||
@@ -29,6 +29,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
|
||||
attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
|
||||
+ attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
|
||||
+ attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
|
||||
attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
|
||||
@@ -59,10 +63,21 @@ IFUNC_SELECTOR (void)
|
||||
|
||||
if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
||||
{
|
||||
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||||
- return OPTIMIZE (avx_unaligned_erms);
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
|
||||
+ {
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||||
+ return OPTIMIZE (evex_unaligned_erms);
|
||||
+
|
||||
+ return OPTIMIZE (evex_unaligned);
|
||||
+ }
|
||||
+
|
||||
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
||||
+ {
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||||
+ return OPTIMIZE (avx_unaligned_erms);
|
||||
|
||||
- return OPTIMIZE (avx_unaligned);
|
||||
+ return OPTIMIZE (avx_unaligned);
|
||||
+ }
|
||||
}
|
||||
|
||||
if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
|
||||
diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
|
||||
new file mode 100644
|
||||
index 00000000..0cbce8f9
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
|
||||
@@ -0,0 +1,33 @@
|
||||
+#if IS_IN (libc)
|
||||
+# define VEC_SIZE 32
|
||||
+# define XMM0 xmm16
|
||||
+# define XMM1 xmm17
|
||||
+# define YMM0 ymm16
|
||||
+# define YMM1 ymm17
|
||||
+# define VEC0 ymm16
|
||||
+# define VEC1 ymm17
|
||||
+# define VEC2 ymm18
|
||||
+# define VEC3 ymm19
|
||||
+# define VEC4 ymm20
|
||||
+# define VEC5 ymm21
|
||||
+# define VEC6 ymm22
|
||||
+# define VEC7 ymm23
|
||||
+# define VEC8 ymm24
|
||||
+# define VEC9 ymm25
|
||||
+# define VEC10 ymm26
|
||||
+# define VEC11 ymm27
|
||||
+# define VEC12 ymm28
|
||||
+# define VEC13 ymm29
|
||||
+# define VEC14 ymm30
|
||||
+# define VEC15 ymm31
|
||||
+# define VEC(i) VEC##i
|
||||
+# define VMOVNT vmovntdq
|
||||
+# define VMOVU vmovdqu64
|
||||
+# define VMOVA vmovdqa64
|
||||
+# define VZEROUPPER
|
||||
+
|
||||
+# define SECTION(p) p##.evex
|
||||
+# define MEMMOVE_SYMBOL(p,s) p##_evex_##s
|
||||
+
|
||||
+# include "memmove-vec-unaligned-erms.S"
|
||||
+#endif
|
||||
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
||||
index 274aa1c7..08e21692 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
||||
@@ -48,6 +48,14 @@
|
||||
# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
|
||||
#endif
|
||||
|
||||
+#ifndef XMM0
|
||||
+# define XMM0 xmm0
|
||||
+#endif
|
||||
+
|
||||
+#ifndef YMM0
|
||||
+# define YMM0 ymm0
|
||||
+#endif
|
||||
+
|
||||
#ifndef VZEROUPPER
|
||||
# if VEC_SIZE > 16
|
||||
# define VZEROUPPER vzeroupper
|
||||
@@ -277,20 +285,20 @@ L(less_vec):
|
||||
#if VEC_SIZE > 32
|
||||
L(between_32_63):
|
||||
/* From 32 to 63. No branch when size == 32. */
|
||||
- vmovdqu (%rsi), %ymm0
|
||||
- vmovdqu -32(%rsi,%rdx), %ymm1
|
||||
- vmovdqu %ymm0, (%rdi)
|
||||
- vmovdqu %ymm1, -32(%rdi,%rdx)
|
||||
+ VMOVU (%rsi), %YMM0
|
||||
+ VMOVU -32(%rsi,%rdx), %YMM1
|
||||
+ VMOVU %YMM0, (%rdi)
|
||||
+ VMOVU %YMM1, -32(%rdi,%rdx)
|
||||
VZEROUPPER
|
||||
ret
|
||||
#endif
|
||||
#if VEC_SIZE > 16
|
||||
/* From 16 to 31. No branch when size == 16. */
|
||||
L(between_16_31):
|
||||
- vmovdqu (%rsi), %xmm0
|
||||
- vmovdqu -16(%rsi,%rdx), %xmm1
|
||||
- vmovdqu %xmm0, (%rdi)
|
||||
- vmovdqu %xmm1, -16(%rdi,%rdx)
|
||||
+ VMOVU (%rsi), %XMM0
|
||||
+ VMOVU -16(%rsi,%rdx), %XMM1
|
||||
+ VMOVU %XMM0, (%rdi)
|
||||
+ VMOVU %XMM1, -16(%rdi,%rdx)
|
||||
ret
|
||||
#endif
|
||||
L(between_8_15):
|
||||
--
|
||||
GitLab
|
||||
|
|
@ -1,254 +0,0 @@
|
|||
From 1b968b6b9b3aac702ac2f133e0dd16cfdbb415ee Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Fri, 5 Mar 2021 07:15:03 -0800
|
||||
Subject: [PATCH] x86-64: Add memset family functions with 256-bit EVEX
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Update ifunc-memset.h/ifunc-wmemset.h to select the function optimized
|
||||
with 256-bit EVEX instructions using YMM16-YMM31 registers to avoid RTM
|
||||
abort with usable AVX512VL and AVX512BW since VZEROUPPER isn't needed at
|
||||
function exit.
|
||||
---
|
||||
sysdeps/x86_64/multiarch/Makefile | 1 +
|
||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 22 +++++++++++++++++
|
||||
sysdeps/x86_64/multiarch/ifunc-memset.h | 24 +++++++++++++++----
|
||||
sysdeps/x86_64/multiarch/ifunc-wmemset.h | 13 ++++++----
|
||||
.../multiarch/memset-evex-unaligned-erms.S | 24 +++++++++++++++++++
|
||||
.../multiarch/memset-vec-unaligned-erms.S | 20 +++++++++++-----
|
||||
6 files changed, 90 insertions(+), 14 deletions(-)
|
||||
create mode 100644 sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
|
||||
index 4563fc56..1cc0a10e 100644
|
||||
--- a/sysdeps/x86_64/multiarch/Makefile
|
||||
+++ b/sysdeps/x86_64/multiarch/Makefile
|
||||
@@ -43,6 +43,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
|
||||
memchr-evex \
|
||||
memmove-evex-unaligned-erms \
|
||||
memrchr-evex \
|
||||
+ memset-evex-unaligned-erms \
|
||||
rawmemchr-evex \
|
||||
stpcpy-evex \
|
||||
stpncpy-evex \
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
index 6bd3abfc..7cf83485 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
@@ -160,6 +160,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
IFUNC_IMPL_ADD (array, i, __memset_chk,
|
||||
CPU_FEATURE_USABLE (AVX2),
|
||||
__memset_chk_avx2_unaligned_erms)
|
||||
+ IFUNC_IMPL_ADD (array, i, __memset_chk,
|
||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)),
|
||||
+ __memset_chk_evex_unaligned)
|
||||
+ IFUNC_IMPL_ADD (array, i, __memset_chk,
|
||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)),
|
||||
+ __memset_chk_evex_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, __memset_chk,
|
||||
CPU_FEATURE_USABLE (AVX512F),
|
||||
__memset_chk_avx512_unaligned_erms)
|
||||
@@ -185,6 +193,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
IFUNC_IMPL_ADD (array, i, memset,
|
||||
CPU_FEATURE_USABLE (AVX2),
|
||||
__memset_avx2_unaligned_erms)
|
||||
+ IFUNC_IMPL_ADD (array, i, memset,
|
||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)),
|
||||
+ __memset_evex_unaligned)
|
||||
+ IFUNC_IMPL_ADD (array, i, memset,
|
||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)),
|
||||
+ __memset_evex_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, memset,
|
||||
CPU_FEATURE_USABLE (AVX512F),
|
||||
__memset_avx512_unaligned_erms)
|
||||
@@ -555,6 +571,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
IFUNC_IMPL_ADD (array, i, wmemset,
|
||||
CPU_FEATURE_USABLE (AVX2),
|
||||
__wmemset_avx2_unaligned)
|
||||
+ IFUNC_IMPL_ADD (array, i, wmemset,
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
+ __wmemset_evex_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, wmemset,
|
||||
CPU_FEATURE_USABLE (AVX512F),
|
||||
__wmemset_avx512_unaligned))
|
||||
@@ -723,6 +742,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
IFUNC_IMPL_ADD (array, i, __wmemset_chk,
|
||||
CPU_FEATURE_USABLE (AVX2),
|
||||
__wmemset_chk_avx2_unaligned)
|
||||
+ IFUNC_IMPL_ADD (array, i, __wmemset_chk,
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
+ __wmemset_chk_evex_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, __wmemset_chk,
|
||||
CPU_FEATURE_USABLE (AVX512F),
|
||||
__wmemset_chk_avx512_unaligned))
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
|
||||
index 708bd72e..6f31f4dc 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
|
||||
@@ -27,6 +27,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms)
|
||||
attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
|
||||
+ attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
|
||||
+ attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
|
||||
attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
|
||||
@@ -56,10 +60,22 @@ IFUNC_SELECTOR (void)
|
||||
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
|
||||
{
|
||||
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||||
- return OPTIMIZE (avx2_unaligned_erms);
|
||||
- else
|
||||
- return OPTIMIZE (avx2_unaligned);
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
|
||||
+ {
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||||
+ return OPTIMIZE (evex_unaligned_erms);
|
||||
+
|
||||
+ return OPTIMIZE (evex_unaligned);
|
||||
+ }
|
||||
+
|
||||
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
||||
+ {
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||||
+ return OPTIMIZE (avx2_unaligned_erms);
|
||||
+
|
||||
+ return OPTIMIZE (avx2_unaligned);
|
||||
+ }
|
||||
}
|
||||
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
|
||||
index eb242210..9290c4bf 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
|
||||
@@ -20,6 +20,7 @@
|
||||
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden;
|
||||
|
||||
static inline void *
|
||||
@@ -27,14 +28,18 @@ IFUNC_SELECTOR (void)
|
||||
{
|
||||
const struct cpu_features* cpu_features = __get_cpu_features ();
|
||||
|
||||
- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
|
||||
- && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
||||
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
||||
{
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
|
||||
- && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
|
||||
+ && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
|
||||
+ && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
||||
return OPTIMIZE (avx512_unaligned);
|
||||
- else
|
||||
+
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
|
||||
+ return OPTIMIZE (evex_unaligned);
|
||||
+
|
||||
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
||||
return OPTIMIZE (avx2_unaligned);
|
||||
}
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
||||
new file mode 100644
|
||||
index 00000000..ae0a4d6e
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
||||
@@ -0,0 +1,24 @@
|
||||
+#if IS_IN (libc)
|
||||
+# define VEC_SIZE 32
|
||||
+# define XMM0 xmm16
|
||||
+# define YMM0 ymm16
|
||||
+# define VEC0 ymm16
|
||||
+# define VEC(i) VEC##i
|
||||
+# define VMOVU vmovdqu64
|
||||
+# define VMOVA vmovdqa64
|
||||
+# define VZEROUPPER
|
||||
+
|
||||
+# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||||
+ movq r, %rax; \
|
||||
+ vpbroadcastb d, %VEC0
|
||||
+
|
||||
+# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||||
+ movq r, %rax; \
|
||||
+ vpbroadcastd d, %VEC0
|
||||
+
|
||||
+# define SECTION(p) p##.evex
|
||||
+# define MEMSET_SYMBOL(p,s) p##_evex_##s
|
||||
+# define WMEMSET_SYMBOL(p,s) p##_evex_##s
|
||||
+
|
||||
+# include "memset-vec-unaligned-erms.S"
|
||||
+#endif
|
||||
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||
index 9a0fd818..71e91a8f 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||
@@ -34,6 +34,14 @@
|
||||
# define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s)
|
||||
#endif
|
||||
|
||||
+#ifndef XMM0
|
||||
+# define XMM0 xmm0
|
||||
+#endif
|
||||
+
|
||||
+#ifndef YMM0
|
||||
+# define YMM0 ymm0
|
||||
+#endif
|
||||
+
|
||||
#ifndef VZEROUPPER
|
||||
# if VEC_SIZE > 16
|
||||
# define VZEROUPPER vzeroupper
|
||||
@@ -67,7 +75,7 @@
|
||||
ENTRY (__bzero)
|
||||
mov %RDI_LP, %RAX_LP /* Set return value. */
|
||||
mov %RSI_LP, %RDX_LP /* Set n. */
|
||||
- pxor %xmm0, %xmm0
|
||||
+ pxor %XMM0, %XMM0
|
||||
jmp L(entry_from_bzero)
|
||||
END (__bzero)
|
||||
weak_alias (__bzero, bzero)
|
||||
@@ -223,7 +231,7 @@ L(less_vec):
|
||||
cmpb $16, %dl
|
||||
jae L(between_16_31)
|
||||
# endif
|
||||
- MOVQ %xmm0, %rcx
|
||||
+ MOVQ %XMM0, %rcx
|
||||
cmpb $8, %dl
|
||||
jae L(between_8_15)
|
||||
cmpb $4, %dl
|
||||
@@ -238,16 +246,16 @@ L(less_vec):
|
||||
# if VEC_SIZE > 32
|
||||
/* From 32 to 63. No branch when size == 32. */
|
||||
L(between_32_63):
|
||||
- vmovdqu %ymm0, -32(%rdi,%rdx)
|
||||
- vmovdqu %ymm0, (%rdi)
|
||||
+ VMOVU %YMM0, -32(%rdi,%rdx)
|
||||
+ VMOVU %YMM0, (%rdi)
|
||||
VZEROUPPER
|
||||
ret
|
||||
# endif
|
||||
# if VEC_SIZE > 16
|
||||
/* From 16 to 31. No branch when size == 16. */
|
||||
L(between_16_31):
|
||||
- vmovdqu %xmm0, -16(%rdi,%rdx)
|
||||
- vmovdqu %xmm0, (%rdi)
|
||||
+ VMOVU %XMM0, -16(%rdi,%rdx)
|
||||
+ VMOVU %XMM0, (%rdi)
|
||||
VZEROUPPER
|
||||
ret
|
||||
# endif
|
||||
--
|
||||
GitLab
|
||||
|
|
@ -1,561 +0,0 @@
|
|||
From 91264fe3577fe887b4860923fa6142b5274c8965 Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Fri, 5 Mar 2021 07:20:28 -0800
|
||||
Subject: [PATCH] x86-64: Add memcmp family functions with 256-bit EVEX
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Update ifunc-memcmp.h to select the function optimized with 256-bit EVEX
|
||||
instructions using YMM16-YMM31 registers to avoid RTM abort with usable
|
||||
AVX512VL, AVX512BW and MOVBE since VZEROUPPER isn't needed at function
|
||||
exit.
|
||||
---
|
||||
sysdeps/x86_64/multiarch/Makefile | 4 +-
|
||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 10 +
|
||||
sysdeps/x86_64/multiarch/ifunc-memcmp.h | 13 +-
|
||||
sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 440 ++++++++++++++++++
|
||||
sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S | 4 +
|
||||
5 files changed, 467 insertions(+), 4 deletions(-)
|
||||
create mode 100644 sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
|
||||
create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
|
||||
index 1cc0a10e..9d79b138 100644
|
||||
--- a/sysdeps/x86_64/multiarch/Makefile
|
||||
+++ b/sysdeps/x86_64/multiarch/Makefile
|
||||
@@ -41,6 +41,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
|
||||
memset-avx2-unaligned-erms \
|
||||
memset-avx512-unaligned-erms \
|
||||
memchr-evex \
|
||||
+ memcmp-evex-movbe \
|
||||
memmove-evex-unaligned-erms \
|
||||
memrchr-evex \
|
||||
memset-evex-unaligned-erms \
|
||||
@@ -81,7 +82,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
|
||||
wcsncmp-evex \
|
||||
wcsnlen-evex \
|
||||
wcsrchr-evex \
|
||||
- wmemchr-evex
|
||||
+ wmemchr-evex \
|
||||
+ wmemcmp-evex-movbe
|
||||
endif
|
||||
|
||||
ifeq ($(subdir),debug)
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
index 7cf83485..c8da910e 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
@@ -56,6 +56,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
(CPU_FEATURE_USABLE (AVX2)
|
||||
&& CPU_FEATURE_USABLE (MOVBE)),
|
||||
__memcmp_avx2_movbe)
|
||||
+ IFUNC_IMPL_ADD (array, i, memcmp,
|
||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
||||
+ && CPU_FEATURE_USABLE (MOVBE)),
|
||||
+ __memcmp_evex_movbe)
|
||||
IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
|
||||
__memcmp_sse4_1)
|
||||
IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3),
|
||||
@@ -558,6 +563,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
(CPU_FEATURE_USABLE (AVX2)
|
||||
&& CPU_FEATURE_USABLE (MOVBE)),
|
||||
__wmemcmp_avx2_movbe)
|
||||
+ IFUNC_IMPL_ADD (array, i, wmemcmp,
|
||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
||||
+ && CPU_FEATURE_USABLE (MOVBE)),
|
||||
+ __wmemcmp_evex_movbe)
|
||||
IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
|
||||
__wmemcmp_sse4_1)
|
||||
IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3),
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
|
||||
index 6c1f3153..3ca1f0a6 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
|
||||
@@ -23,17 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
|
||||
|
||||
static inline void *
|
||||
IFUNC_SELECTOR (void)
|
||||
{
|
||||
const struct cpu_features* cpu_features = __get_cpu_features ();
|
||||
|
||||
- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
|
||||
- && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
||||
&& CPU_FEATURE_USABLE_P (cpu_features, MOVBE)
|
||||
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
||||
- return OPTIMIZE (avx2_movbe);
|
||||
+ {
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
|
||||
+ return OPTIMIZE (evex_movbe);
|
||||
+
|
||||
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
||||
+ return OPTIMIZE (avx2_movbe);
|
||||
+ }
|
||||
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
|
||||
return OPTIMIZE (sse4_1);
|
||||
diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
|
||||
new file mode 100644
|
||||
index 00000000..9c093972
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
|
||||
@@ -0,0 +1,440 @@
|
||||
+/* memcmp/wmemcmp optimized with 256-bit EVEX instructions.
|
||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#if IS_IN (libc)
|
||||
+
|
||||
+/* memcmp/wmemcmp is implemented as:
|
||||
+ 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
|
||||
+ to avoid branches.
|
||||
+ 2. Use overlapping compare to avoid branch.
|
||||
+ 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
|
||||
+ bytes for wmemcmp.
|
||||
+ 4. If size is 8 * VEC_SIZE or less, unroll the loop.
|
||||
+ 5. Compare 4 * VEC_SIZE at a time with the aligned first memory
|
||||
+ area.
|
||||
+ 6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
|
||||
+ 7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
|
||||
+ 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */
|
||||
+
|
||||
+# include <sysdep.h>
|
||||
+
|
||||
+# ifndef MEMCMP
|
||||
+# define MEMCMP __memcmp_evex_movbe
|
||||
+# endif
|
||||
+
|
||||
+# define VMOVU vmovdqu64
|
||||
+
|
||||
+# ifdef USE_AS_WMEMCMP
|
||||
+# define VPCMPEQ vpcmpeqd
|
||||
+# else
|
||||
+# define VPCMPEQ vpcmpeqb
|
||||
+# endif
|
||||
+
|
||||
+# define XMM1 xmm17
|
||||
+# define XMM2 xmm18
|
||||
+# define YMM1 ymm17
|
||||
+# define YMM2 ymm18
|
||||
+# define YMM3 ymm19
|
||||
+# define YMM4 ymm20
|
||||
+# define YMM5 ymm21
|
||||
+# define YMM6 ymm22
|
||||
+
|
||||
+# define VEC_SIZE 32
|
||||
+# ifdef USE_AS_WMEMCMP
|
||||
+# define VEC_MASK 0xff
|
||||
+# define XMM_MASK 0xf
|
||||
+# else
|
||||
+# define VEC_MASK 0xffffffff
|
||||
+# define XMM_MASK 0xffff
|
||||
+# endif
|
||||
+
|
||||
+/* Warning!
|
||||
+ wmemcmp has to use SIGNED comparison for elements.
|
||||
+ memcmp has to use UNSIGNED comparison for elemnts.
|
||||
+*/
|
||||
+
|
||||
+ .section .text.evex,"ax",@progbits
|
||||
+ENTRY (MEMCMP)
|
||||
+# ifdef USE_AS_WMEMCMP
|
||||
+ shl $2, %RDX_LP
|
||||
+# elif defined __ILP32__
|
||||
+ /* Clear the upper 32 bits. */
|
||||
+ movl %edx, %edx
|
||||
+# endif
|
||||
+ cmp $VEC_SIZE, %RDX_LP
|
||||
+ jb L(less_vec)
|
||||
+
|
||||
+ /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
|
||||
+ VMOVU (%rsi), %YMM2
|
||||
+ VPCMPEQ (%rdi), %YMM2, %k1
|
||||
+ kmovd %k1, %eax
|
||||
+ subl $VEC_MASK, %eax
|
||||
+ jnz L(first_vec)
|
||||
+
|
||||
+ cmpq $(VEC_SIZE * 2), %rdx
|
||||
+ jbe L(last_vec)
|
||||
+
|
||||
+ /* More than 2 * VEC. */
|
||||
+ cmpq $(VEC_SIZE * 8), %rdx
|
||||
+ ja L(more_8x_vec)
|
||||
+ cmpq $(VEC_SIZE * 4), %rdx
|
||||
+ jb L(last_4x_vec)
|
||||
+
|
||||
+ /* From 4 * VEC to 8 * VEC, inclusively. */
|
||||
+ VMOVU (%rsi), %YMM1
|
||||
+ VPCMPEQ (%rdi), %YMM1, %k1
|
||||
+
|
||||
+ VMOVU VEC_SIZE(%rsi), %YMM2
|
||||
+ VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
|
||||
+
|
||||
+ VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
|
||||
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
|
||||
+
|
||||
+ VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
|
||||
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
|
||||
+
|
||||
+ kandd %k1, %k2, %k5
|
||||
+ kandd %k3, %k4, %k6
|
||||
+ kandd %k5, %k6, %k6
|
||||
+
|
||||
+ kmovd %k6, %eax
|
||||
+ cmpl $VEC_MASK, %eax
|
||||
+ jne L(4x_vec_end)
|
||||
+
|
||||
+ leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi
|
||||
+ leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi
|
||||
+ VMOVU (%rsi), %YMM1
|
||||
+ VPCMPEQ (%rdi), %YMM1, %k1
|
||||
+
|
||||
+ VMOVU VEC_SIZE(%rsi), %YMM2
|
||||
+ VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
|
||||
+ kandd %k1, %k2, %k5
|
||||
+
|
||||
+ VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
|
||||
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
|
||||
+ kandd %k3, %k5, %k5
|
||||
+
|
||||
+ VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
|
||||
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
|
||||
+ kandd %k4, %k5, %k5
|
||||
+
|
||||
+ kmovd %k5, %eax
|
||||
+ cmpl $VEC_MASK, %eax
|
||||
+ jne L(4x_vec_end)
|
||||
+ xorl %eax, %eax
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(last_2x_vec):
|
||||
+ /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
|
||||
+ VMOVU (%rsi), %YMM2
|
||||
+ VPCMPEQ (%rdi), %YMM2, %k2
|
||||
+ kmovd %k2, %eax
|
||||
+ subl $VEC_MASK, %eax
|
||||
+ jnz L(first_vec)
|
||||
+
|
||||
+L(last_vec):
|
||||
+ /* Use overlapping loads to avoid branches. */
|
||||
+ leaq -VEC_SIZE(%rdi, %rdx), %rdi
|
||||
+ leaq -VEC_SIZE(%rsi, %rdx), %rsi
|
||||
+ VMOVU (%rsi), %YMM2
|
||||
+ VPCMPEQ (%rdi), %YMM2, %k2
|
||||
+ kmovd %k2, %eax
|
||||
+ subl $VEC_MASK, %eax
|
||||
+ jnz L(first_vec)
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(first_vec):
|
||||
+ /* A byte or int32 is different within 16 or 32 bytes. */
|
||||
+ tzcntl %eax, %ecx
|
||||
+# ifdef USE_AS_WMEMCMP
|
||||
+ xorl %eax, %eax
|
||||
+ movl (%rdi, %rcx, 4), %edx
|
||||
+ cmpl (%rsi, %rcx, 4), %edx
|
||||
+L(wmemcmp_return):
|
||||
+ setl %al
|
||||
+ negl %eax
|
||||
+ orl $1, %eax
|
||||
+# else
|
||||
+ movzbl (%rdi, %rcx), %eax
|
||||
+ movzbl (%rsi, %rcx), %edx
|
||||
+ sub %edx, %eax
|
||||
+# endif
|
||||
+ ret
|
||||
+
|
||||
+# ifdef USE_AS_WMEMCMP
|
||||
+ .p2align 4
|
||||
+L(4):
|
||||
+ xorl %eax, %eax
|
||||
+ movl (%rdi), %edx
|
||||
+ cmpl (%rsi), %edx
|
||||
+ jne L(wmemcmp_return)
|
||||
+ ret
|
||||
+# else
|
||||
+ .p2align 4
|
||||
+L(between_4_7):
|
||||
+ /* Load as big endian with overlapping movbe to avoid branches. */
|
||||
+ movbe (%rdi), %eax
|
||||
+ movbe (%rsi), %ecx
|
||||
+ shlq $32, %rax
|
||||
+ shlq $32, %rcx
|
||||
+ movbe -4(%rdi, %rdx), %edi
|
||||
+ movbe -4(%rsi, %rdx), %esi
|
||||
+ orq %rdi, %rax
|
||||
+ orq %rsi, %rcx
|
||||
+ subq %rcx, %rax
|
||||
+ je L(exit)
|
||||
+ sbbl %eax, %eax
|
||||
+ orl $1, %eax
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(exit):
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(between_2_3):
|
||||
+ /* Load as big endian to avoid branches. */
|
||||
+ movzwl (%rdi), %eax
|
||||
+ movzwl (%rsi), %ecx
|
||||
+ shll $8, %eax
|
||||
+ shll $8, %ecx
|
||||
+ bswap %eax
|
||||
+ bswap %ecx
|
||||
+ movb -1(%rdi, %rdx), %al
|
||||
+ movb -1(%rsi, %rdx), %cl
|
||||
+ /* Subtraction is okay because the upper 8 bits are zero. */
|
||||
+ subl %ecx, %eax
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(1):
|
||||
+ movzbl (%rdi), %eax
|
||||
+ movzbl (%rsi), %ecx
|
||||
+ subl %ecx, %eax
|
||||
+ ret
|
||||
+# endif
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(zero):
|
||||
+ xorl %eax, %eax
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(less_vec):
|
||||
+# ifdef USE_AS_WMEMCMP
|
||||
+ /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */
|
||||
+ cmpb $4, %dl
|
||||
+ je L(4)
|
||||
+ jb L(zero)
|
||||
+# else
|
||||
+ cmpb $1, %dl
|
||||
+ je L(1)
|
||||
+ jb L(zero)
|
||||
+ cmpb $4, %dl
|
||||
+ jb L(between_2_3)
|
||||
+ cmpb $8, %dl
|
||||
+ jb L(between_4_7)
|
||||
+# endif
|
||||
+ cmpb $16, %dl
|
||||
+ jae L(between_16_31)
|
||||
+ /* It is between 8 and 15 bytes. */
|
||||
+ vmovq (%rdi), %XMM1
|
||||
+ vmovq (%rsi), %XMM2
|
||||
+ VPCMPEQ %XMM1, %XMM2, %k2
|
||||
+ kmovw %k2, %eax
|
||||
+ subl $XMM_MASK, %eax
|
||||
+ jnz L(first_vec)
|
||||
+ /* Use overlapping loads to avoid branches. */
|
||||
+ leaq -8(%rdi, %rdx), %rdi
|
||||
+ leaq -8(%rsi, %rdx), %rsi
|
||||
+ vmovq (%rdi), %XMM1
|
||||
+ vmovq (%rsi), %XMM2
|
||||
+ VPCMPEQ %XMM1, %XMM2, %k2
|
||||
+ kmovw %k2, %eax
|
||||
+ subl $XMM_MASK, %eax
|
||||
+ jnz L(first_vec)
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(between_16_31):
|
||||
+ /* From 16 to 31 bytes. No branch when size == 16. */
|
||||
+ VMOVU (%rsi), %XMM2
|
||||
+ VPCMPEQ (%rdi), %XMM2, %k2
|
||||
+ kmovw %k2, %eax
|
||||
+ subl $XMM_MASK, %eax
|
||||
+ jnz L(first_vec)
|
||||
+
|
||||
+ /* Use overlapping loads to avoid branches. */
|
||||
+ leaq -16(%rdi, %rdx), %rdi
|
||||
+ leaq -16(%rsi, %rdx), %rsi
|
||||
+ VMOVU (%rsi), %XMM2
|
||||
+ VPCMPEQ (%rdi), %XMM2, %k2
|
||||
+ kmovw %k2, %eax
|
||||
+ subl $XMM_MASK, %eax
|
||||
+ jnz L(first_vec)
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(more_8x_vec):
|
||||
+ /* More than 8 * VEC. Check the first VEC. */
|
||||
+ VMOVU (%rsi), %YMM2
|
||||
+ VPCMPEQ (%rdi), %YMM2, %k2
|
||||
+ kmovd %k2, %eax
|
||||
+ subl $VEC_MASK, %eax
|
||||
+ jnz L(first_vec)
|
||||
+
|
||||
+ /* Align the first memory area for aligned loads in the loop.
|
||||
+ Compute how much the first memory area is misaligned. */
|
||||
+ movq %rdi, %rcx
|
||||
+ andl $(VEC_SIZE - 1), %ecx
|
||||
+ /* Get the negative of offset for alignment. */
|
||||
+ subq $VEC_SIZE, %rcx
|
||||
+ /* Adjust the second memory area. */
|
||||
+ subq %rcx, %rsi
|
||||
+ /* Adjust the first memory area which should be aligned now. */
|
||||
+ subq %rcx, %rdi
|
||||
+ /* Adjust length. */
|
||||
+ addq %rcx, %rdx
|
||||
+
|
||||
+L(loop_4x_vec):
|
||||
+ /* Compare 4 * VEC at a time forward. */
|
||||
+ VMOVU (%rsi), %YMM1
|
||||
+ VPCMPEQ (%rdi), %YMM1, %k1
|
||||
+
|
||||
+ VMOVU VEC_SIZE(%rsi), %YMM2
|
||||
+ VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
|
||||
+ kandd %k2, %k1, %k5
|
||||
+
|
||||
+ VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
|
||||
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
|
||||
+ kandd %k3, %k5, %k5
|
||||
+
|
||||
+ VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
|
||||
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
|
||||
+ kandd %k4, %k5, %k5
|
||||
+
|
||||
+ kmovd %k5, %eax
|
||||
+ cmpl $VEC_MASK, %eax
|
||||
+ jne L(4x_vec_end)
|
||||
+
|
||||
+ addq $(VEC_SIZE * 4), %rdi
|
||||
+ addq $(VEC_SIZE * 4), %rsi
|
||||
+
|
||||
+ subq $(VEC_SIZE * 4), %rdx
|
||||
+ cmpq $(VEC_SIZE * 4), %rdx
|
||||
+ jae L(loop_4x_vec)
|
||||
+
|
||||
+ /* Less than 4 * VEC. */
|
||||
+ cmpq $VEC_SIZE, %rdx
|
||||
+ jbe L(last_vec)
|
||||
+ cmpq $(VEC_SIZE * 2), %rdx
|
||||
+ jbe L(last_2x_vec)
|
||||
+
|
||||
+L(last_4x_vec):
|
||||
+ /* From 2 * VEC to 4 * VEC. */
|
||||
+ VMOVU (%rsi), %YMM2
|
||||
+ VPCMPEQ (%rdi), %YMM2, %k2
|
||||
+ kmovd %k2, %eax
|
||||
+ subl $VEC_MASK, %eax
|
||||
+ jnz L(first_vec)
|
||||
+
|
||||
+ addq $VEC_SIZE, %rdi
|
||||
+ addq $VEC_SIZE, %rsi
|
||||
+ VMOVU (%rsi), %YMM2
|
||||
+ VPCMPEQ (%rdi), %YMM2, %k2
|
||||
+ kmovd %k2, %eax
|
||||
+ subl $VEC_MASK, %eax
|
||||
+ jnz L(first_vec)
|
||||
+
|
||||
+ /* Use overlapping loads to avoid branches. */
|
||||
+ leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi
|
||||
+ leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi
|
||||
+ VMOVU (%rsi), %YMM2
|
||||
+ VPCMPEQ (%rdi), %YMM2, %k2
|
||||
+ kmovd %k2, %eax
|
||||
+ subl $VEC_MASK, %eax
|
||||
+ jnz L(first_vec)
|
||||
+
|
||||
+ addq $VEC_SIZE, %rdi
|
||||
+ addq $VEC_SIZE, %rsi
|
||||
+ VMOVU (%rsi), %YMM2
|
||||
+ VPCMPEQ (%rdi), %YMM2, %k2
|
||||
+ kmovd %k2, %eax
|
||||
+ subl $VEC_MASK, %eax
|
||||
+ jnz L(first_vec)
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(4x_vec_end):
|
||||
+ kmovd %k1, %eax
|
||||
+ subl $VEC_MASK, %eax
|
||||
+ jnz L(first_vec)
|
||||
+ kmovd %k2, %eax
|
||||
+ subl $VEC_MASK, %eax
|
||||
+ jnz L(first_vec_x1)
|
||||
+ kmovd %k3, %eax
|
||||
+ subl $VEC_MASK, %eax
|
||||
+ jnz L(first_vec_x2)
|
||||
+ kmovd %k4, %eax
|
||||
+ subl $VEC_MASK, %eax
|
||||
+ tzcntl %eax, %ecx
|
||||
+# ifdef USE_AS_WMEMCMP
|
||||
+ xorl %eax, %eax
|
||||
+ movl (VEC_SIZE * 3)(%rdi, %rcx, 4), %edx
|
||||
+ cmpl (VEC_SIZE * 3)(%rsi, %rcx, 4), %edx
|
||||
+ jmp L(wmemcmp_return)
|
||||
+# else
|
||||
+ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
|
||||
+ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx
|
||||
+ sub %edx, %eax
|
||||
+# endif
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(first_vec_x1):
|
||||
+ tzcntl %eax, %ecx
|
||||
+# ifdef USE_AS_WMEMCMP
|
||||
+ xorl %eax, %eax
|
||||
+ movl VEC_SIZE(%rdi, %rcx, 4), %edx
|
||||
+ cmpl VEC_SIZE(%rsi, %rcx, 4), %edx
|
||||
+ jmp L(wmemcmp_return)
|
||||
+# else
|
||||
+ movzbl VEC_SIZE(%rdi, %rcx), %eax
|
||||
+ movzbl VEC_SIZE(%rsi, %rcx), %edx
|
||||
+ sub %edx, %eax
|
||||
+# endif
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(first_vec_x2):
|
||||
+ tzcntl %eax, %ecx
|
||||
+# ifdef USE_AS_WMEMCMP
|
||||
+ xorl %eax, %eax
|
||||
+ movl (VEC_SIZE * 2)(%rdi, %rcx, 4), %edx
|
||||
+ cmpl (VEC_SIZE * 2)(%rsi, %rcx, 4), %edx
|
||||
+ jmp L(wmemcmp_return)
|
||||
+# else
|
||||
+ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
|
||||
+ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx
|
||||
+ sub %edx, %eax
|
||||
+# endif
|
||||
+ ret
|
||||
+END (MEMCMP)
|
||||
+#endif
|
||||
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
|
||||
new file mode 100644
|
||||
index 00000000..4726d74a
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
|
||||
@@ -0,0 +1,4 @@
|
||||
+#define MEMCMP __wmemcmp_evex_movbe
|
||||
+#define USE_AS_WMEMCMP 1
|
||||
+
|
||||
+#include "memcmp-evex-movbe.S"
|
||||
--
|
||||
GitLab
|
||||
|
File diff suppressed because it is too large
Load diff
|
@ -1,735 +0,0 @@
|
|||
From 4bd660be40967cd69072f69ebc2ad32bfcc1f206 Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Tue, 23 Feb 2021 06:33:10 -0800
|
||||
Subject: [PATCH] x86: Add string/memory function tests in RTM region
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
At function exit, AVX optimized string/memory functions have VZEROUPPER
|
||||
which triggers RTM abort. When such functions are called inside a
|
||||
transactionally executing RTM region, RTM abort causes severe performance
|
||||
degradation. Add tests to verify that string/memory functions won't
|
||||
cause RTM abort in RTM region.
|
||||
---
|
||||
sysdeps/x86/Makefile | 23 +++++++++++
|
||||
sysdeps/x86/tst-memchr-rtm.c | 54 ++++++++++++++++++++++++++
|
||||
sysdeps/x86/tst-memcmp-rtm.c | 52 +++++++++++++++++++++++++
|
||||
sysdeps/x86/tst-memmove-rtm.c | 53 ++++++++++++++++++++++++++
|
||||
sysdeps/x86/tst-memrchr-rtm.c | 54 ++++++++++++++++++++++++++
|
||||
sysdeps/x86/tst-memset-rtm.c | 45 ++++++++++++++++++++++
|
||||
sysdeps/x86/tst-strchr-rtm.c | 54 ++++++++++++++++++++++++++
|
||||
sysdeps/x86/tst-strcpy-rtm.c | 53 ++++++++++++++++++++++++++
|
||||
sysdeps/x86/tst-string-rtm.h | 72 +++++++++++++++++++++++++++++++++++
|
||||
sysdeps/x86/tst-strlen-rtm.c | 53 ++++++++++++++++++++++++++
|
||||
sysdeps/x86/tst-strncmp-rtm.c | 52 +++++++++++++++++++++++++
|
||||
sysdeps/x86/tst-strrchr-rtm.c | 53 ++++++++++++++++++++++++++
|
||||
12 files changed, 618 insertions(+)
|
||||
create mode 100644 sysdeps/x86/tst-memchr-rtm.c
|
||||
create mode 100644 sysdeps/x86/tst-memcmp-rtm.c
|
||||
create mode 100644 sysdeps/x86/tst-memmove-rtm.c
|
||||
create mode 100644 sysdeps/x86/tst-memrchr-rtm.c
|
||||
create mode 100644 sysdeps/x86/tst-memset-rtm.c
|
||||
create mode 100644 sysdeps/x86/tst-strchr-rtm.c
|
||||
create mode 100644 sysdeps/x86/tst-strcpy-rtm.c
|
||||
create mode 100644 sysdeps/x86/tst-string-rtm.h
|
||||
create mode 100644 sysdeps/x86/tst-strlen-rtm.c
|
||||
create mode 100644 sysdeps/x86/tst-strncmp-rtm.c
|
||||
create mode 100644 sysdeps/x86/tst-strrchr-rtm.c
|
||||
|
||||
diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
|
||||
index 59e928e9..5be71ada 100644
|
||||
--- a/sysdeps/x86/Makefile
|
||||
+++ b/sysdeps/x86/Makefile
|
||||
@@ -17,6 +17,29 @@ endif
|
||||
|
||||
ifeq ($(subdir),string)
|
||||
sysdep_routines += cacheinfo
|
||||
+
|
||||
+tests += \
|
||||
+ tst-memchr-rtm \
|
||||
+ tst-memcmp-rtm \
|
||||
+ tst-memmove-rtm \
|
||||
+ tst-memrchr-rtm \
|
||||
+ tst-memset-rtm \
|
||||
+ tst-strchr-rtm \
|
||||
+ tst-strcpy-rtm \
|
||||
+ tst-strlen-rtm \
|
||||
+ tst-strncmp-rtm \
|
||||
+ tst-strrchr-rtm
|
||||
+
|
||||
+CFLAGS-tst-memchr-rtm.c += -mrtm
|
||||
+CFLAGS-tst-memcmp-rtm.c += -mrtm
|
||||
+CFLAGS-tst-memmove-rtm.c += -mrtm
|
||||
+CFLAGS-tst-memrchr-rtm.c += -mrtm
|
||||
+CFLAGS-tst-memset-rtm.c += -mrtm
|
||||
+CFLAGS-tst-strchr-rtm.c += -mrtm
|
||||
+CFLAGS-tst-strcpy-rtm.c += -mrtm
|
||||
+CFLAGS-tst-strlen-rtm.c += -mrtm
|
||||
+CFLAGS-tst-strncmp-rtm.c += -mrtm
|
||||
+CFLAGS-tst-strrchr-rtm.c += -mrtm
|
||||
endif
|
||||
|
||||
ifneq ($(enable-cet),no)
|
||||
diff --git a/sysdeps/x86/tst-memchr-rtm.c b/sysdeps/x86/tst-memchr-rtm.c
|
||||
new file mode 100644
|
||||
index 00000000..e4749401
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86/tst-memchr-rtm.c
|
||||
@@ -0,0 +1,54 @@
|
||||
+/* Test case for memchr inside a transactionally executing RTM region.
|
||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <tst-string-rtm.h>
|
||||
+
|
||||
+#define LOOP 3000
|
||||
+#define STRING_SIZE 1024
|
||||
+char string1[STRING_SIZE];
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+prepare (void)
|
||||
+{
|
||||
+ memset (string1, 'a', STRING_SIZE);
|
||||
+ string1[100] = 'c';
|
||||
+ string1[STRING_SIZE - 100] = 'c';
|
||||
+ char *p = memchr (string1, 'c', STRING_SIZE);
|
||||
+ if (p == &string1[100])
|
||||
+ return EXIT_SUCCESS;
|
||||
+ else
|
||||
+ return EXIT_FAILURE;
|
||||
+}
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+function (void)
|
||||
+{
|
||||
+ char *p = memchr (string1, 'c', STRING_SIZE);
|
||||
+ if (p == &string1[100])
|
||||
+ return 0;
|
||||
+ else
|
||||
+ return 1;
|
||||
+}
|
||||
+
|
||||
+static int
|
||||
+do_test (void)
|
||||
+{
|
||||
+ return do_test_1 ("memchr", LOOP, prepare, function);
|
||||
+}
|
||||
diff --git a/sysdeps/x86/tst-memcmp-rtm.c b/sysdeps/x86/tst-memcmp-rtm.c
|
||||
new file mode 100644
|
||||
index 00000000..e4c8a623
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86/tst-memcmp-rtm.c
|
||||
@@ -0,0 +1,52 @@
|
||||
+/* Test case for memcmp inside a transactionally executing RTM region.
|
||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <tst-string-rtm.h>
|
||||
+
|
||||
+#define LOOP 3000
|
||||
+#define STRING_SIZE 1024
|
||||
+char string1[STRING_SIZE];
|
||||
+char string2[STRING_SIZE];
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+prepare (void)
|
||||
+{
|
||||
+ memset (string1, 'a', STRING_SIZE);
|
||||
+ memset (string2, 'a', STRING_SIZE);
|
||||
+ if (memcmp (string1, string2, STRING_SIZE) == 0)
|
||||
+ return EXIT_SUCCESS;
|
||||
+ else
|
||||
+ return EXIT_FAILURE;
|
||||
+}
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+function (void)
|
||||
+{
|
||||
+ if (memcmp (string1, string2, STRING_SIZE) == 0)
|
||||
+ return 0;
|
||||
+ else
|
||||
+ return 1;
|
||||
+}
|
||||
+
|
||||
+static int
|
||||
+do_test (void)
|
||||
+{
|
||||
+ return do_test_1 ("memcmp", LOOP, prepare, function);
|
||||
+}
|
||||
diff --git a/sysdeps/x86/tst-memmove-rtm.c b/sysdeps/x86/tst-memmove-rtm.c
|
||||
new file mode 100644
|
||||
index 00000000..4bf97ef1
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86/tst-memmove-rtm.c
|
||||
@@ -0,0 +1,53 @@
|
||||
+/* Test case for memmove inside a transactionally executing RTM region.
|
||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <tst-string-rtm.h>
|
||||
+
|
||||
+#define LOOP 3000
|
||||
+#define STRING_SIZE 1024
|
||||
+char string1[STRING_SIZE];
|
||||
+char string2[STRING_SIZE];
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+prepare (void)
|
||||
+{
|
||||
+ memset (string1, 'a', STRING_SIZE);
|
||||
+ if (memmove (string2, string1, STRING_SIZE) == string2
|
||||
+ && memcmp (string2, string1, STRING_SIZE) == 0)
|
||||
+ return EXIT_SUCCESS;
|
||||
+ else
|
||||
+ return EXIT_FAILURE;
|
||||
+}
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+function (void)
|
||||
+{
|
||||
+ if (memmove (string2, string1, STRING_SIZE) == string2
|
||||
+ && memcmp (string2, string1, STRING_SIZE) == 0)
|
||||
+ return 0;
|
||||
+ else
|
||||
+ return 1;
|
||||
+}
|
||||
+
|
||||
+static int
|
||||
+do_test (void)
|
||||
+{
|
||||
+ return do_test_1 ("memmove", LOOP, prepare, function);
|
||||
+}
|
||||
diff --git a/sysdeps/x86/tst-memrchr-rtm.c b/sysdeps/x86/tst-memrchr-rtm.c
|
||||
new file mode 100644
|
||||
index 00000000..a57a5a8e
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86/tst-memrchr-rtm.c
|
||||
@@ -0,0 +1,54 @@
|
||||
+/* Test case for memrchr inside a transactionally executing RTM region.
|
||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <tst-string-rtm.h>
|
||||
+
|
||||
+#define LOOP 3000
|
||||
+#define STRING_SIZE 1024
|
||||
+char string1[STRING_SIZE];
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+prepare (void)
|
||||
+{
|
||||
+ memset (string1, 'a', STRING_SIZE);
|
||||
+ string1[100] = 'c';
|
||||
+ string1[STRING_SIZE - 100] = 'c';
|
||||
+ char *p = memrchr (string1, 'c', STRING_SIZE);
|
||||
+ if (p == &string1[STRING_SIZE - 100])
|
||||
+ return EXIT_SUCCESS;
|
||||
+ else
|
||||
+ return EXIT_FAILURE;
|
||||
+}
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+function (void)
|
||||
+{
|
||||
+ char *p = memrchr (string1, 'c', STRING_SIZE);
|
||||
+ if (p == &string1[STRING_SIZE - 100])
|
||||
+ return 0;
|
||||
+ else
|
||||
+ return 1;
|
||||
+}
|
||||
+
|
||||
+static int
|
||||
+do_test (void)
|
||||
+{
|
||||
+ return do_test_1 ("memrchr", LOOP, prepare, function);
|
||||
+}
|
||||
diff --git a/sysdeps/x86/tst-memset-rtm.c b/sysdeps/x86/tst-memset-rtm.c
|
||||
new file mode 100644
|
||||
index 00000000..bf343a4d
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86/tst-memset-rtm.c
|
||||
@@ -0,0 +1,45 @@
|
||||
+/* Test case for memset inside a transactionally executing RTM region.
|
||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <tst-string-rtm.h>
|
||||
+
|
||||
+#define LOOP 3000
|
||||
+#define STRING_SIZE 1024
|
||||
+char string1[STRING_SIZE];
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+prepare (void)
|
||||
+{
|
||||
+ memset (string1, 'a', STRING_SIZE);
|
||||
+ return EXIT_SUCCESS;
|
||||
+}
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+function (void)
|
||||
+{
|
||||
+ memset (string1, 'a', STRING_SIZE);
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+static int
|
||||
+do_test (void)
|
||||
+{
|
||||
+ return do_test_1 ("memset", LOOP, prepare, function);
|
||||
+}
|
||||
diff --git a/sysdeps/x86/tst-strchr-rtm.c b/sysdeps/x86/tst-strchr-rtm.c
|
||||
new file mode 100644
|
||||
index 00000000..a82e29c0
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86/tst-strchr-rtm.c
|
||||
@@ -0,0 +1,54 @@
|
||||
+/* Test case for strchr inside a transactionally executing RTM region.
|
||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <tst-string-rtm.h>
|
||||
+
|
||||
+#define LOOP 3000
|
||||
+#define STRING_SIZE 1024
|
||||
+char string1[STRING_SIZE];
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+prepare (void)
|
||||
+{
|
||||
+ memset (string1, 'a', STRING_SIZE - 1);
|
||||
+ string1[100] = 'c';
|
||||
+ string1[STRING_SIZE - 100] = 'c';
|
||||
+ char *p = strchr (string1, 'c');
|
||||
+ if (p == &string1[100])
|
||||
+ return EXIT_SUCCESS;
|
||||
+ else
|
||||
+ return EXIT_FAILURE;
|
||||
+}
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+function (void)
|
||||
+{
|
||||
+ char *p = strchr (string1, 'c');
|
||||
+ if (p == &string1[100])
|
||||
+ return 0;
|
||||
+ else
|
||||
+ return 1;
|
||||
+}
|
||||
+
|
||||
+static int
|
||||
+do_test (void)
|
||||
+{
|
||||
+ return do_test_1 ("strchr", LOOP, prepare, function);
|
||||
+}
|
||||
diff --git a/sysdeps/x86/tst-strcpy-rtm.c b/sysdeps/x86/tst-strcpy-rtm.c
|
||||
new file mode 100644
|
||||
index 00000000..2b2a583f
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86/tst-strcpy-rtm.c
|
||||
@@ -0,0 +1,53 @@
|
||||
+/* Test case for strcpy inside a transactionally executing RTM region.
|
||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <tst-string-rtm.h>
|
||||
+
|
||||
+#define LOOP 3000
|
||||
+#define STRING_SIZE 1024
|
||||
+char string1[STRING_SIZE];
|
||||
+char string2[STRING_SIZE];
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+prepare (void)
|
||||
+{
|
||||
+ memset (string1, 'a', STRING_SIZE - 1);
|
||||
+ if (strcpy (string2, string1) == string2
|
||||
+ && strcmp (string2, string1) == 0)
|
||||
+ return EXIT_SUCCESS;
|
||||
+ else
|
||||
+ return EXIT_FAILURE;
|
||||
+}
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+function (void)
|
||||
+{
|
||||
+ if (strcpy (string2, string1) == string2
|
||||
+ && strcmp (string2, string1) == 0)
|
||||
+ return 0;
|
||||
+ else
|
||||
+ return 1;
|
||||
+}
|
||||
+
|
||||
+static int
|
||||
+do_test (void)
|
||||
+{
|
||||
+ return do_test_1 ("strcpy", LOOP, prepare, function);
|
||||
+}
|
||||
diff --git a/sysdeps/x86/tst-string-rtm.h b/sysdeps/x86/tst-string-rtm.h
|
||||
new file mode 100644
|
||||
index 00000000..d2470afa
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86/tst-string-rtm.h
|
||||
@@ -0,0 +1,72 @@
|
||||
+/* Test string function in a transactionally executing RTM region.
|
||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <string.h>
|
||||
+#include <x86intrin.h>
|
||||
+#include <sys/platform/x86.h>
|
||||
+#include <support/check.h>
|
||||
+#include <support/test-driver.h>
|
||||
+
|
||||
+static int
|
||||
+do_test_1 (const char *name, unsigned int loop, int (*prepare) (void),
|
||||
+ int (*function) (void))
|
||||
+{
|
||||
+ if (!CPU_FEATURE_USABLE (RTM))
|
||||
+ return EXIT_UNSUPPORTED;
|
||||
+
|
||||
+ int status = prepare ();
|
||||
+ if (status != EXIT_SUCCESS)
|
||||
+ return status;
|
||||
+
|
||||
+ unsigned int i;
|
||||
+ unsigned int naborts = 0;
|
||||
+ unsigned int failed = 0;
|
||||
+ for (i = 0; i < loop; i++)
|
||||
+ {
|
||||
+ failed |= function ();
|
||||
+ if (_xbegin() == _XBEGIN_STARTED)
|
||||
+ {
|
||||
+ failed |= function ();
|
||||
+ _xend();
|
||||
+ }
|
||||
+ else
|
||||
+ {
|
||||
+ failed |= function ();
|
||||
+ ++naborts;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ if (failed)
|
||||
+ FAIL_EXIT1 ("%s() failed", name);
|
||||
+
|
||||
+ if (naborts)
|
||||
+ {
|
||||
+ /* NB: Low single digit (<= 5%) noise-level aborts are normal for
|
||||
+ TSX. */
|
||||
+ double rate = 100 * ((double) naborts) / ((double) loop);
|
||||
+ if (rate > 5)
|
||||
+ FAIL_EXIT1 ("TSX abort rate: %.2f%% (%d out of %d)",
|
||||
+ rate, naborts, loop);
|
||||
+ }
|
||||
+
|
||||
+ return EXIT_SUCCESS;
|
||||
+}
|
||||
+
|
||||
+static int do_test (void);
|
||||
+
|
||||
+#include <support/test-driver.c>
|
||||
diff --git a/sysdeps/x86/tst-strlen-rtm.c b/sysdeps/x86/tst-strlen-rtm.c
|
||||
new file mode 100644
|
||||
index 00000000..0dcf14db
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86/tst-strlen-rtm.c
|
||||
@@ -0,0 +1,53 @@
|
||||
+/* Test case for strlen inside a transactionally executing RTM region.
|
||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <tst-string-rtm.h>
|
||||
+
|
||||
+#define LOOP 3000
|
||||
+#define STRING_SIZE 1024
|
||||
+char string1[STRING_SIZE];
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+prepare (void)
|
||||
+{
|
||||
+ memset (string1, 'a', STRING_SIZE - 1);
|
||||
+ string1[STRING_SIZE - 100] = '\0';
|
||||
+ size_t len = strlen (string1);
|
||||
+ if (len == STRING_SIZE - 100)
|
||||
+ return EXIT_SUCCESS;
|
||||
+ else
|
||||
+ return EXIT_FAILURE;
|
||||
+}
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+function (void)
|
||||
+{
|
||||
+ size_t len = strlen (string1);
|
||||
+ if (len == STRING_SIZE - 100)
|
||||
+ return 0;
|
||||
+ else
|
||||
+ return 1;
|
||||
+}
|
||||
+
|
||||
+static int
|
||||
+do_test (void)
|
||||
+{
|
||||
+ return do_test_1 ("strlen", LOOP, prepare, function);
|
||||
+}
|
||||
diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
|
||||
new file mode 100644
|
||||
index 00000000..236ad951
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86/tst-strncmp-rtm.c
|
||||
@@ -0,0 +1,52 @@
|
||||
+/* Test case for strncmp inside a transactionally executing RTM region.
|
||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <tst-string-rtm.h>
|
||||
+
|
||||
+#define LOOP 3000
|
||||
+#define STRING_SIZE 1024
|
||||
+char string1[STRING_SIZE];
|
||||
+char string2[STRING_SIZE];
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+prepare (void)
|
||||
+{
|
||||
+ memset (string1, 'a', STRING_SIZE - 1);
|
||||
+ memset (string2, 'a', STRING_SIZE - 1);
|
||||
+ if (strncmp (string1, string2, STRING_SIZE) == 0)
|
||||
+ return EXIT_SUCCESS;
|
||||
+ else
|
||||
+ return EXIT_FAILURE;
|
||||
+}
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+function (void)
|
||||
+{
|
||||
+ if (strncmp (string1, string2, STRING_SIZE) == 0)
|
||||
+ return 0;
|
||||
+ else
|
||||
+ return 1;
|
||||
+}
|
||||
+
|
||||
+static int
|
||||
+do_test (void)
|
||||
+{
|
||||
+ return do_test_1 ("strncmp", LOOP, prepare, function);
|
||||
+}
|
||||
diff --git a/sysdeps/x86/tst-strrchr-rtm.c b/sysdeps/x86/tst-strrchr-rtm.c
|
||||
new file mode 100644
|
||||
index 00000000..e32bfaf5
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86/tst-strrchr-rtm.c
|
||||
@@ -0,0 +1,53 @@
|
||||
+/* Test case for strrchr inside a transactionally executing RTM region.
|
||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <tst-string-rtm.h>
|
||||
+
|
||||
+#define LOOP 3000
|
||||
+#define STRING_SIZE 1024
|
||||
+char string1[STRING_SIZE];
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+prepare (void)
|
||||
+{
|
||||
+ memset (string1, 'a', STRING_SIZE - 1);
|
||||
+ string1[STRING_SIZE - 100] = 'c';
|
||||
+ char *p = strrchr (string1, 'c');
|
||||
+ if (p == &string1[STRING_SIZE - 100])
|
||||
+ return EXIT_SUCCESS;
|
||||
+ else
|
||||
+ return EXIT_FAILURE;
|
||||
+}
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+function (void)
|
||||
+{
|
||||
+ char *p = strrchr (string1, 'c');
|
||||
+ if (p == &string1[STRING_SIZE - 100])
|
||||
+ return 0;
|
||||
+ else
|
||||
+ return 1;
|
||||
+}
|
||||
+
|
||||
+static int
|
||||
+do_test (void)
|
||||
+{
|
||||
+ return do_test_1 ("strrchr", LOOP, prepare, function);
|
||||
+}
|
||||
--
|
||||
GitLab
|
||||
|
|
@ -1,148 +0,0 @@
|
|||
From 4e2d8f352774b56078c34648b14a2412c38384f4 Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Sun, 7 Mar 2021 09:44:18 -0800
|
||||
Subject: [PATCH] x86-64: Use ZMM16-ZMM31 in AVX512 memset family functions
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Update ifunc-memset.h/ifunc-wmemset.h to select the function optimized
|
||||
with AVX512 instructions using ZMM16-ZMM31 registers to avoid RTM abort
|
||||
with usable AVX512VL and AVX512BW since VZEROUPPER isn't needed at
|
||||
function exit.
|
||||
---
|
||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 14 +++++++++-----
|
||||
sysdeps/x86_64/multiarch/ifunc-memset.h | 13 ++++++++-----
|
||||
sysdeps/x86_64/multiarch/ifunc-wmemset.h | 12 ++++++------
|
||||
.../multiarch/memset-avx512-unaligned-erms.S | 16 ++++++++--------
|
||||
4 files changed, 31 insertions(+), 24 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
index c1efeec0..d969a156 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
@@ -211,10 +211,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
&& CPU_FEATURE_USABLE (AVX512BW)),
|
||||
__memset_chk_evex_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, __memset_chk,
|
||||
- CPU_FEATURE_USABLE (AVX512F),
|
||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)),
|
||||
__memset_chk_avx512_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, __memset_chk,
|
||||
- CPU_FEATURE_USABLE (AVX512F),
|
||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)),
|
||||
__memset_chk_avx512_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, __memset_chk,
|
||||
CPU_FEATURE_USABLE (AVX512F),
|
||||
@@ -252,10 +254,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
&& CPU_FEATURE_USABLE (AVX512BW)),
|
||||
__memset_evex_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, memset,
|
||||
- CPU_FEATURE_USABLE (AVX512F),
|
||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)),
|
||||
__memset_avx512_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, memset,
|
||||
- CPU_FEATURE_USABLE (AVX512F),
|
||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)),
|
||||
__memset_avx512_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, memset,
|
||||
CPU_FEATURE_USABLE (AVX512F),
|
||||
@@ -719,7 +723,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
CPU_FEATURE_USABLE (AVX512VL),
|
||||
__wmemset_evex_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, wmemset,
|
||||
- CPU_FEATURE_USABLE (AVX512F),
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
__wmemset_avx512_unaligned))
|
||||
|
||||
#ifdef SHARED
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
|
||||
index 6f3375cc..19795938 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
|
||||
@@ -53,13 +53,16 @@ IFUNC_SELECTOR (void)
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
|
||||
&& !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
|
||||
{
|
||||
- if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
||||
- return OPTIMIZE (avx512_no_vzeroupper);
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
|
||||
+ {
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||||
+ return OPTIMIZE (avx512_unaligned_erms);
|
||||
|
||||
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||||
- return OPTIMIZE (avx512_unaligned_erms);
|
||||
+ return OPTIMIZE (avx512_unaligned);
|
||||
+ }
|
||||
|
||||
- return OPTIMIZE (avx512_unaligned);
|
||||
+ return OPTIMIZE (avx512_no_vzeroupper);
|
||||
}
|
||||
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
|
||||
index bdc94c6c..98c5d406 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
|
||||
@@ -33,13 +33,13 @@ IFUNC_SELECTOR (void)
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
||||
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
||||
{
|
||||
- if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
|
||||
- && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
|
||||
- && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
||||
- return OPTIMIZE (avx512_unaligned);
|
||||
-
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
|
||||
- return OPTIMIZE (evex_unaligned);
|
||||
+ {
|
||||
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
|
||||
+ return OPTIMIZE (avx512_unaligned);
|
||||
+
|
||||
+ return OPTIMIZE (evex_unaligned);
|
||||
+ }
|
||||
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
|
||||
return OPTIMIZE (avx2_unaligned_rtm);
|
||||
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
||||
index 0783979c..22e7b187 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
||||
@@ -1,22 +1,22 @@
|
||||
#if IS_IN (libc)
|
||||
# define VEC_SIZE 64
|
||||
-# define VEC(i) zmm##i
|
||||
+# define XMM0 xmm16
|
||||
+# define YMM0 ymm16
|
||||
+# define VEC0 zmm16
|
||||
+# define VEC(i) VEC##i
|
||||
# define VMOVU vmovdqu64
|
||||
# define VMOVA vmovdqa64
|
||||
+# define VZEROUPPER
|
||||
|
||||
# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||||
- vmovd d, %xmm0; \
|
||||
movq r, %rax; \
|
||||
- vpbroadcastb %xmm0, %xmm0; \
|
||||
- vpbroadcastq %xmm0, %zmm0
|
||||
+ vpbroadcastb d, %VEC0
|
||||
|
||||
# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||||
- vmovd d, %xmm0; \
|
||||
movq r, %rax; \
|
||||
- vpbroadcastd %xmm0, %xmm0; \
|
||||
- vpbroadcastq %xmm0, %zmm0
|
||||
+ vpbroadcastd d, %VEC0
|
||||
|
||||
-# define SECTION(p) p##.avx512
|
||||
+# define SECTION(p) p##.evex512
|
||||
# define MEMSET_SYMBOL(p,s) p##_avx512_##s
|
||||
# define WMEMSET_SYMBOL(p,s) p##_avx512_##s
|
||||
|
||||
--
|
||||
GitLab
|
||||
|
|
@ -1,230 +0,0 @@
|
|||
From b304fc201d2f6baf52ea790df8643e99772243cd Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Mon, 21 Jan 2019 11:25:56 -0800
|
||||
Subject: [PATCH] x86-64 memcmp/wmemcmp: Properly handle the length parameter
|
||||
[BZ# 24097]
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
On x32, the size_t parameter may be passed in the lower 32 bits of a
|
||||
64-bit register with the non-zero upper 32 bits. The string/memory
|
||||
functions written in assembly can only use the lower 32 bits of a
|
||||
64-bit register as length or must clear the upper 32 bits before using
|
||||
the full 64-bit register for length.
|
||||
|
||||
This pach fixes memcmp/wmemcmp for x32. Tested on x86-64 and x32. On
|
||||
x86-64, libc.so is the same with and withou the fix.
|
||||
|
||||
[BZ# 24097]
|
||||
CVE-2019-6488
|
||||
* sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S: Use RDX_LP for
|
||||
length. Clear the upper 32 bits of RDX register.
|
||||
* sysdeps/x86_64/multiarch/memcmp-sse4.S: Likewise.
|
||||
* sysdeps/x86_64/multiarch/memcmp-ssse3.S: Likewise.
|
||||
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcmp and
|
||||
tst-size_t-wmemcmp.
|
||||
* sysdeps/x86_64/x32/tst-size_t-memcmp.c: New file.
|
||||
* sysdeps/x86_64/x32/tst-size_t-wmemcmp.c: Likewise.
|
||||
---
|
||||
sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 7 +-
|
||||
sysdeps/x86_64/multiarch/memcmp-sse4.S | 9 ++-
|
||||
sysdeps/x86_64/multiarch/memcmp-ssse3.S | 7 +-
|
||||
sysdeps/x86_64/x32/Makefile | 4 +-
|
||||
sysdeps/x86_64/x32/tst-size_t-memcmp.c | 76 ++++++++++++++++++++
|
||||
sysdeps/x86_64/x32/tst-size_t-wmemcmp.c | 20 ++++++
|
||||
6 files changed, 114 insertions(+), 9 deletions(-)
|
||||
create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcmp.c
|
||||
create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
|
||||
|
||||
Conflicts:
|
||||
ChangeLog
|
||||
(removed)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
|
||||
index 30f764c3..e3a35b89 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
|
||||
@@ -58,9 +58,12 @@
|
||||
.section .text.avx,"ax",@progbits
|
||||
ENTRY (MEMCMP)
|
||||
# ifdef USE_AS_WMEMCMP
|
||||
- shl $2, %rdx
|
||||
+ shl $2, %RDX_LP
|
||||
+# elif defined __ILP32__
|
||||
+ /* Clear the upper 32 bits. */
|
||||
+ movl %edx, %edx
|
||||
# endif
|
||||
- cmpq $VEC_SIZE, %rdx
|
||||
+ cmp $VEC_SIZE, %RDX_LP
|
||||
jb L(less_vec)
|
||||
|
||||
/* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
|
||||
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
|
||||
index 8e164f2c..302900f5 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S
|
||||
@@ -42,13 +42,16 @@
|
||||
.section .text.sse4.1,"ax",@progbits
|
||||
ENTRY (MEMCMP)
|
||||
# ifdef USE_AS_WMEMCMP
|
||||
- shl $2, %rdx
|
||||
+ shl $2, %RDX_LP
|
||||
+# elif defined __ILP32__
|
||||
+ /* Clear the upper 32 bits. */
|
||||
+ mov %edx, %edx
|
||||
# endif
|
||||
pxor %xmm0, %xmm0
|
||||
- cmp $79, %rdx
|
||||
+ cmp $79, %RDX_LP
|
||||
ja L(79bytesormore)
|
||||
# ifndef USE_AS_WMEMCMP
|
||||
- cmp $1, %rdx
|
||||
+ cmp $1, %RDX_LP
|
||||
je L(firstbyte)
|
||||
# endif
|
||||
add %rdx, %rsi
|
||||
diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
|
||||
index 6f76c641..69d030fc 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
|
||||
@@ -33,9 +33,12 @@
|
||||
atom_text_section
|
||||
ENTRY (MEMCMP)
|
||||
# ifdef USE_AS_WMEMCMP
|
||||
- shl $2, %rdx
|
||||
- test %rdx, %rdx
|
||||
+ shl $2, %RDX_LP
|
||||
+ test %RDX_LP, %RDX_LP
|
||||
jz L(equal)
|
||||
+# elif defined __ILP32__
|
||||
+ /* Clear the upper 32 bits. */
|
||||
+ mov %edx, %edx
|
||||
# endif
|
||||
mov %rdx, %rcx
|
||||
mov %rdi, %rdx
|
||||
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
|
||||
index 7d528889..ddec7f04 100644
|
||||
--- a/sysdeps/x86_64/x32/Makefile
|
||||
+++ b/sysdeps/x86_64/x32/Makefile
|
||||
@@ -6,9 +6,9 @@ CFLAGS-s_llround.c += -fno-builtin-lround
|
||||
endif
|
||||
|
||||
ifeq ($(subdir),string)
|
||||
-tests += tst-size_t-memchr
|
||||
+tests += tst-size_t-memchr tst-size_t-memcmp
|
||||
endif
|
||||
|
||||
ifeq ($(subdir),wcsmbs)
|
||||
-tests += tst-size_t-wmemchr
|
||||
+tests += tst-size_t-wmemchr tst-size_t-wmemcmp
|
||||
endif
|
||||
diff --git a/sysdeps/x86_64/x32/tst-size_t-memcmp.c b/sysdeps/x86_64/x32/tst-size_t-memcmp.c
|
||||
new file mode 100644
|
||||
index 00000000..9bd6fdb4
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/x32/tst-size_t-memcmp.c
|
||||
@@ -0,0 +1,76 @@
|
||||
+/* Test memcmp with size_t in the lower 32 bits of 64-bit register.
|
||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <http://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#define TEST_MAIN
|
||||
+#ifdef WIDE
|
||||
+# define TEST_NAME "wmemcmp"
|
||||
+#else
|
||||
+# define TEST_NAME "memcmp"
|
||||
+#endif
|
||||
+
|
||||
+#include "test-size_t.h"
|
||||
+
|
||||
+#ifdef WIDE
|
||||
+# include <inttypes.h>
|
||||
+# include <wchar.h>
|
||||
+
|
||||
+# define MEMCMP wmemcmp
|
||||
+# define CHAR wchar_t
|
||||
+#else
|
||||
+# define MEMCMP memcmp
|
||||
+# define CHAR char
|
||||
+#endif
|
||||
+
|
||||
+IMPL (MEMCMP, 1)
|
||||
+
|
||||
+typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
|
||||
+
|
||||
+static int
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+do_memcmp (parameter_t a, parameter_t b)
|
||||
+{
|
||||
+ return CALL (&b, a.p, b.p, a.len);
|
||||
+}
|
||||
+
|
||||
+static int
|
||||
+test_main (void)
|
||||
+{
|
||||
+ test_init ();
|
||||
+
|
||||
+ parameter_t dest = { { page_size / sizeof (CHAR) }, buf1 };
|
||||
+ parameter_t src = { { 0 }, buf2 };
|
||||
+
|
||||
+ memcpy (buf1, buf2, page_size);
|
||||
+
|
||||
+ int ret = 0;
|
||||
+ FOR_EACH_IMPL (impl, 0)
|
||||
+ {
|
||||
+ src.fn = impl->fn;
|
||||
+ int res = do_memcmp (dest, src);
|
||||
+ if (res)
|
||||
+ {
|
||||
+ error (0, 0, "Wrong result in function %s: %i != 0",
|
||||
+ impl->name, res);
|
||||
+ ret = 1;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
|
||||
+}
|
||||
+
|
||||
+#include <support/test-driver.c>
|
||||
diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
|
||||
new file mode 100644
|
||||
index 00000000..e8b5ffd0
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
|
||||
@@ -0,0 +1,20 @@
|
||||
+/* Test wmemcmp with size_t in the lower 32 bits of 64-bit register.
|
||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <http://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#define WIDE 1
|
||||
+#include "tst-size_t-memcmp.c"
|
||||
--
|
||||
GitLab
|
||||
|
|
@ -1,164 +0,0 @@
|
|||
From e4fda4631017e49d4ee5a2755db34289b6860fa4 Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Sun, 7 Mar 2021 09:45:23 -0800
|
||||
Subject: [PATCH] x86-64: Use ZMM16-ZMM31 in AVX512 memmove family functions
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Update ifunc-memmove.h to select the function optimized with AVX512
|
||||
instructions using ZMM16-ZMM31 registers to avoid RTM abort with usable
|
||||
AVX512VL since VZEROUPPER isn't needed at function exit.
|
||||
---
|
||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 24 +++++++++---------
|
||||
sysdeps/x86_64/multiarch/ifunc-memmove.h | 12 +++++----
|
||||
.../multiarch/memmove-avx512-unaligned-erms.S | 25 +++++++++++++++++--
|
||||
3 files changed, 42 insertions(+), 19 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
index d969a156..fec384f6 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
@@ -83,10 +83,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
CPU_FEATURE_USABLE (AVX512F),
|
||||
__memmove_chk_avx512_no_vzeroupper)
|
||||
IFUNC_IMPL_ADD (array, i, __memmove_chk,
|
||||
- CPU_FEATURE_USABLE (AVX512F),
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
__memmove_chk_avx512_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, __memmove_chk,
|
||||
- CPU_FEATURE_USABLE (AVX512F),
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
__memmove_chk_avx512_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, __memmove_chk,
|
||||
CPU_FEATURE_USABLE (AVX),
|
||||
@@ -148,10 +148,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
CPU_FEATURE_USABLE (AVX512F),
|
||||
__memmove_avx512_no_vzeroupper)
|
||||
IFUNC_IMPL_ADD (array, i, memmove,
|
||||
- CPU_FEATURE_USABLE (AVX512F),
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
__memmove_avx512_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, memmove,
|
||||
- CPU_FEATURE_USABLE (AVX512F),
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
__memmove_avx512_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
|
||||
__memmove_ssse3_back)
|
||||
@@ -733,10 +733,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
CPU_FEATURE_USABLE (AVX512F),
|
||||
__memcpy_chk_avx512_no_vzeroupper)
|
||||
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
|
||||
- CPU_FEATURE_USABLE (AVX512F),
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
__memcpy_chk_avx512_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
|
||||
- CPU_FEATURE_USABLE (AVX512F),
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
__memcpy_chk_avx512_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
|
||||
CPU_FEATURE_USABLE (AVX),
|
||||
@@ -802,10 +802,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
CPU_FEATURE_USABLE (AVX512F),
|
||||
__memcpy_avx512_no_vzeroupper)
|
||||
IFUNC_IMPL_ADD (array, i, memcpy,
|
||||
- CPU_FEATURE_USABLE (AVX512F),
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
__memcpy_avx512_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, memcpy,
|
||||
- CPU_FEATURE_USABLE (AVX512F),
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
__memcpy_avx512_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, memcpy, 1,
|
||||
@@ -819,10 +819,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
CPU_FEATURE_USABLE (AVX512F),
|
||||
__mempcpy_chk_avx512_no_vzeroupper)
|
||||
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
|
||||
- CPU_FEATURE_USABLE (AVX512F),
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
__mempcpy_chk_avx512_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
|
||||
- CPU_FEATURE_USABLE (AVX512F),
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
__mempcpy_chk_avx512_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
|
||||
CPU_FEATURE_USABLE (AVX),
|
||||
@@ -864,10 +864,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
CPU_FEATURE_USABLE (AVX512F),
|
||||
__mempcpy_avx512_no_vzeroupper)
|
||||
IFUNC_IMPL_ADD (array, i, mempcpy,
|
||||
- CPU_FEATURE_USABLE (AVX512F),
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
__mempcpy_avx512_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, mempcpy,
|
||||
- CPU_FEATURE_USABLE (AVX512F),
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
__mempcpy_avx512_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, mempcpy,
|
||||
CPU_FEATURE_USABLE (AVX),
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
|
||||
index fa09b9fb..014e95c7 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
|
||||
@@ -56,13 +56,15 @@ IFUNC_SELECTOR (void)
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
|
||||
&& !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
|
||||
{
|
||||
- if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
||||
- return OPTIMIZE (avx512_no_vzeroupper);
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
|
||||
+ {
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||||
+ return OPTIMIZE (avx512_unaligned_erms);
|
||||
|
||||
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||||
- return OPTIMIZE (avx512_unaligned_erms);
|
||||
+ return OPTIMIZE (avx512_unaligned);
|
||||
+ }
|
||||
|
||||
- return OPTIMIZE (avx512_unaligned);
|
||||
+ return OPTIMIZE (avx512_no_vzeroupper);
|
||||
}
|
||||
|
||||
if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
||||
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
|
||||
index aac1515c..848848ab 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
|
||||
@@ -1,11 +1,32 @@
|
||||
#if IS_IN (libc)
|
||||
# define VEC_SIZE 64
|
||||
-# define VEC(i) zmm##i
|
||||
+# define XMM0 xmm16
|
||||
+# define XMM1 xmm17
|
||||
+# define YMM0 ymm16
|
||||
+# define YMM1 ymm17
|
||||
+# define VEC0 zmm16
|
||||
+# define VEC1 zmm17
|
||||
+# define VEC2 zmm18
|
||||
+# define VEC3 zmm19
|
||||
+# define VEC4 zmm20
|
||||
+# define VEC5 zmm21
|
||||
+# define VEC6 zmm22
|
||||
+# define VEC7 zmm23
|
||||
+# define VEC8 zmm24
|
||||
+# define VEC9 zmm25
|
||||
+# define VEC10 zmm26
|
||||
+# define VEC11 zmm27
|
||||
+# define VEC12 zmm28
|
||||
+# define VEC13 zmm29
|
||||
+# define VEC14 zmm30
|
||||
+# define VEC15 zmm31
|
||||
+# define VEC(i) VEC##i
|
||||
# define VMOVNT vmovntdq
|
||||
# define VMOVU vmovdqu64
|
||||
# define VMOVA vmovdqa64
|
||||
+# define VZEROUPPER
|
||||
|
||||
-# define SECTION(p) p##.avx512
|
||||
+# define SECTION(p) p##.evex512
|
||||
# define MEMMOVE_SYMBOL(p,s) p##_avx512_##s
|
||||
|
||||
# include "memmove-vec-unaligned-erms.S"
|
||||
--
|
||||
GitLab
|
||||
|
|
@ -1,71 +0,0 @@
|
|||
From 595c22ecd8e87a27fd19270ed30fdbae9ad25426 Mon Sep 17 00:00:00 2001
|
||||
From: Sunil K Pandey <skpgkp2@gmail.com>
|
||||
Date: Thu, 1 Apr 2021 15:47:04 -0700
|
||||
Subject: [PATCH] x86-64: Fix ifdef indentation in strlen-evex.S
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Fix some indentations of ifdef in file strlen-evex.S which are off by 1
|
||||
and confusing to read.
|
||||
---
|
||||
sysdeps/x86_64/multiarch/strlen-evex.S | 16 ++++++++--------
|
||||
1 file changed, 8 insertions(+), 8 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
|
||||
index cd022509..05838190 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strlen-evex.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strlen-evex.S
|
||||
@@ -276,10 +276,10 @@ L(last_2x_vec):
|
||||
.p2align 4
|
||||
L(first_vec_x0_check):
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
sall $2, %eax
|
||||
-# endif
|
||||
+# endif
|
||||
/* Check the end of data. */
|
||||
cmpq %rax, %rsi
|
||||
jbe L(max)
|
||||
@@ -293,10 +293,10 @@ L(first_vec_x0_check):
|
||||
.p2align 4
|
||||
L(first_vec_x1_check):
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
sall $2, %eax
|
||||
-# endif
|
||||
+# endif
|
||||
/* Check the end of data. */
|
||||
cmpq %rax, %rsi
|
||||
jbe L(max)
|
||||
@@ -311,10 +311,10 @@ L(first_vec_x1_check):
|
||||
.p2align 4
|
||||
L(first_vec_x2_check):
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
sall $2, %eax
|
||||
-# endif
|
||||
+# endif
|
||||
/* Check the end of data. */
|
||||
cmpq %rax, %rsi
|
||||
jbe L(max)
|
||||
@@ -329,10 +329,10 @@ L(first_vec_x2_check):
|
||||
.p2align 4
|
||||
L(first_vec_x3_check):
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
sall $2, %eax
|
||||
-# endif
|
||||
+# endif
|
||||
/* Check the end of data. */
|
||||
cmpq %rax, %rsi
|
||||
jbe L(max)
|
||||
--
|
||||
GitLab
|
||||
|
|
@ -1,51 +0,0 @@
|
|||
From 55bf411b451c13f0fb7ff3d3bf9a820020b45df1 Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Mon, 19 Apr 2021 07:07:21 -0700
|
||||
Subject: [PATCH] x86-64: Require BMI2 for __strlen_evex and __strnlen_evex
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Since __strlen_evex and __strnlen_evex added by
|
||||
|
||||
commit 1fd8c163a83d96ace1ff78fa6bac7aee084f6f77
|
||||
Author: H.J. Lu <hjl.tools@gmail.com>
|
||||
Date: Fri Mar 5 06:24:52 2021 -0800
|
||||
|
||||
x86-64: Add ifunc-avx2.h functions with 256-bit EVEX
|
||||
|
||||
use sarx:
|
||||
|
||||
c4 e2 6a f7 c0 sarx %edx,%eax,%eax
|
||||
|
||||
require BMI2 for __strlen_evex and __strnlen_evex in ifunc-impl-list.c.
|
||||
ifunc-avx2.h already requires BMI2 for EVEX implementation.
|
||||
---
|
||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 ++++--
|
||||
1 file changed, 4 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
index fec384f6..cbfc1a5d 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
@@ -293,7 +293,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
__strlen_avx2_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, strlen,
|
||||
(CPU_FEATURE_USABLE (AVX512VL)
|
||||
- && CPU_FEATURE_USABLE (AVX512BW)),
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
||||
__strlen_evex)
|
||||
IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
|
||||
|
||||
@@ -308,7 +309,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
__strnlen_avx2_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, strnlen,
|
||||
(CPU_FEATURE_USABLE (AVX512VL)
|
||||
- && CPU_FEATURE_USABLE (AVX512BW)),
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
||||
__strnlen_evex)
|
||||
IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
|
||||
|
||||
--
|
||||
GitLab
|
||||
|
|
@ -1,584 +0,0 @@
|
|||
From acfd088a1963ba51cd83c78f95c0ab25ead79e04 Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Mon, 3 May 2021 03:01:58 -0400
|
||||
Subject: [PATCH] x86: Optimize memchr-avx2.S
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
No bug. This commit optimizes memchr-avx2.S. The optimizations include
|
||||
replacing some branches with cmovcc, avoiding some branches entirely
|
||||
in the less_4x_vec case, making the page cross logic less strict,
|
||||
saving a few instructions in the loop return path. test-memchr,
|
||||
test-rawmemchr, and test-wmemchr are all passing.
|
||||
|
||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/memchr-avx2.S | 425 ++++++++++++++-----------
|
||||
1 file changed, 247 insertions(+), 178 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
|
||||
index cf893e77..b377f22e 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
|
||||
@@ -26,8 +26,22 @@
|
||||
|
||||
# ifdef USE_AS_WMEMCHR
|
||||
# define VPCMPEQ vpcmpeqd
|
||||
+# define VPBROADCAST vpbroadcastd
|
||||
+# define CHAR_SIZE 4
|
||||
# else
|
||||
# define VPCMPEQ vpcmpeqb
|
||||
+# define VPBROADCAST vpbroadcastb
|
||||
+# define CHAR_SIZE 1
|
||||
+# endif
|
||||
+
|
||||
+# ifdef USE_AS_RAWMEMCHR
|
||||
+# define ERAW_PTR_REG ecx
|
||||
+# define RRAW_PTR_REG rcx
|
||||
+# define ALGN_PTR_REG rdi
|
||||
+# else
|
||||
+# define ERAW_PTR_REG edi
|
||||
+# define RRAW_PTR_REG rdi
|
||||
+# define ALGN_PTR_REG rcx
|
||||
# endif
|
||||
|
||||
# ifndef VZEROUPPER
|
||||
@@ -39,6 +53,7 @@
|
||||
# endif
|
||||
|
||||
# define VEC_SIZE 32
|
||||
+# define PAGE_SIZE 4096
|
||||
|
||||
.section SECTION(.text),"ax",@progbits
|
||||
ENTRY (MEMCHR)
|
||||
@@ -47,295 +62,349 @@ ENTRY (MEMCHR)
|
||||
test %RDX_LP, %RDX_LP
|
||||
jz L(null)
|
||||
# endif
|
||||
- movl %edi, %ecx
|
||||
- /* Broadcast CHAR to YMM0. */
|
||||
- vmovd %esi, %xmm0
|
||||
# ifdef USE_AS_WMEMCHR
|
||||
shl $2, %RDX_LP
|
||||
- vpbroadcastd %xmm0, %ymm0
|
||||
# else
|
||||
# ifdef __ILP32__
|
||||
/* Clear the upper 32 bits. */
|
||||
movl %edx, %edx
|
||||
# endif
|
||||
- vpbroadcastb %xmm0, %ymm0
|
||||
# endif
|
||||
+ /* Broadcast CHAR to YMMMATCH. */
|
||||
+ vmovd %esi, %xmm0
|
||||
+ VPBROADCAST %xmm0, %ymm0
|
||||
/* Check if we may cross page boundary with one vector load. */
|
||||
- andl $(2 * VEC_SIZE - 1), %ecx
|
||||
- cmpl $VEC_SIZE, %ecx
|
||||
- ja L(cros_page_boundary)
|
||||
+ movl %edi, %eax
|
||||
+ andl $(PAGE_SIZE - 1), %eax
|
||||
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
||||
+ ja L(cross_page_boundary)
|
||||
|
||||
/* Check the first VEC_SIZE bytes. */
|
||||
- VPCMPEQ (%rdi), %ymm0, %ymm1
|
||||
+ VPCMPEQ (%rdi), %ymm0, %ymm1
|
||||
vpmovmskb %ymm1, %eax
|
||||
- testl %eax, %eax
|
||||
-
|
||||
# ifndef USE_AS_RAWMEMCHR
|
||||
- jnz L(first_vec_x0_check)
|
||||
- /* Adjust length and check the end of data. */
|
||||
- subq $VEC_SIZE, %rdx
|
||||
- jbe L(zero)
|
||||
-# else
|
||||
- jnz L(first_vec_x0)
|
||||
+ /* If length < CHAR_PER_VEC handle special. */
|
||||
+ cmpq $VEC_SIZE, %rdx
|
||||
+ jbe L(first_vec_x0)
|
||||
# endif
|
||||
-
|
||||
- /* Align data for aligned loads in the loop. */
|
||||
- addq $VEC_SIZE, %rdi
|
||||
- andl $(VEC_SIZE - 1), %ecx
|
||||
- andq $-VEC_SIZE, %rdi
|
||||
+ testl %eax, %eax
|
||||
+ jz L(aligned_more)
|
||||
+ tzcntl %eax, %eax
|
||||
+ addq %rdi, %rax
|
||||
+ VZEROUPPER_RETURN
|
||||
|
||||
# ifndef USE_AS_RAWMEMCHR
|
||||
- /* Adjust length. */
|
||||
- addq %rcx, %rdx
|
||||
+ .p2align 5
|
||||
+L(first_vec_x0):
|
||||
+ /* Check if first match was before length. */
|
||||
+ tzcntl %eax, %eax
|
||||
+ xorl %ecx, %ecx
|
||||
+ cmpl %eax, %edx
|
||||
+ leaq (%rdi, %rax), %rax
|
||||
+ cmovle %rcx, %rax
|
||||
+ VZEROUPPER_RETURN
|
||||
|
||||
- subq $(VEC_SIZE * 4), %rdx
|
||||
- jbe L(last_4x_vec_or_less)
|
||||
+L(null):
|
||||
+ xorl %eax, %eax
|
||||
+ ret
|
||||
# endif
|
||||
- jmp L(more_4x_vec)
|
||||
-
|
||||
.p2align 4
|
||||
-L(cros_page_boundary):
|
||||
- andl $(VEC_SIZE - 1), %ecx
|
||||
- andq $-VEC_SIZE, %rdi
|
||||
- VPCMPEQ (%rdi), %ymm0, %ymm1
|
||||
+L(cross_page_boundary):
|
||||
+ /* Save pointer before aligning as its original value is necessary
|
||||
+ for computer return address if byte is found or adjusting length
|
||||
+ if it is not and this is memchr. */
|
||||
+ movq %rdi, %rcx
|
||||
+ /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
|
||||
+ rdi for rawmemchr. */
|
||||
+ orq $(VEC_SIZE - 1), %ALGN_PTR_REG
|
||||
+ VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
|
||||
vpmovmskb %ymm1, %eax
|
||||
+# ifndef USE_AS_RAWMEMCHR
|
||||
+ /* Calculate length until end of page (length checked for a
|
||||
+ match). */
|
||||
+ leaq 1(%ALGN_PTR_REG), %rsi
|
||||
+ subq %RRAW_PTR_REG, %rsi
|
||||
+# endif
|
||||
/* Remove the leading bytes. */
|
||||
- sarl %cl, %eax
|
||||
- testl %eax, %eax
|
||||
- jz L(aligned_more)
|
||||
- tzcntl %eax, %eax
|
||||
+ sarxl %ERAW_PTR_REG, %eax, %eax
|
||||
# ifndef USE_AS_RAWMEMCHR
|
||||
/* Check the end of data. */
|
||||
- cmpq %rax, %rdx
|
||||
- jbe L(zero)
|
||||
+ cmpq %rsi, %rdx
|
||||
+ jbe L(first_vec_x0)
|
||||
# endif
|
||||
- addq %rdi, %rax
|
||||
- addq %rcx, %rax
|
||||
+ testl %eax, %eax
|
||||
+ jz L(cross_page_continue)
|
||||
+ tzcntl %eax, %eax
|
||||
+ addq %RRAW_PTR_REG, %rax
|
||||
L(return_vzeroupper):
|
||||
ZERO_UPPER_VEC_REGISTERS_RETURN
|
||||
|
||||
.p2align 4
|
||||
-L(aligned_more):
|
||||
-# ifndef USE_AS_RAWMEMCHR
|
||||
- /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
|
||||
- instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
|
||||
- overflow. */
|
||||
- negq %rcx
|
||||
- addq $VEC_SIZE, %rcx
|
||||
+L(first_vec_x1):
|
||||
+ tzcntl %eax, %eax
|
||||
+ incq %rdi
|
||||
+ addq %rdi, %rax
|
||||
+ VZEROUPPER_RETURN
|
||||
|
||||
- /* Check the end of data. */
|
||||
- subq %rcx, %rdx
|
||||
- jbe L(zero)
|
||||
-# endif
|
||||
+ .p2align 4
|
||||
+L(first_vec_x2):
|
||||
+ tzcntl %eax, %eax
|
||||
+ addq $(VEC_SIZE + 1), %rdi
|
||||
+ addq %rdi, %rax
|
||||
+ VZEROUPPER_RETURN
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(first_vec_x3):
|
||||
+ tzcntl %eax, %eax
|
||||
+ addq $(VEC_SIZE * 2 + 1), %rdi
|
||||
+ addq %rdi, %rax
|
||||
+ VZEROUPPER_RETURN
|
||||
|
||||
- addq $VEC_SIZE, %rdi
|
||||
|
||||
-# ifndef USE_AS_RAWMEMCHR
|
||||
- subq $(VEC_SIZE * 4), %rdx
|
||||
- jbe L(last_4x_vec_or_less)
|
||||
-# endif
|
||||
+ .p2align 4
|
||||
+L(first_vec_x4):
|
||||
+ tzcntl %eax, %eax
|
||||
+ addq $(VEC_SIZE * 3 + 1), %rdi
|
||||
+ addq %rdi, %rax
|
||||
+ VZEROUPPER_RETURN
|
||||
|
||||
-L(more_4x_vec):
|
||||
+ .p2align 4
|
||||
+L(aligned_more):
|
||||
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
|
||||
since data is only aligned to VEC_SIZE. */
|
||||
- VPCMPEQ (%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x0)
|
||||
|
||||
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
|
||||
+# ifndef USE_AS_RAWMEMCHR
|
||||
+L(cross_page_continue):
|
||||
+ /* Align data to VEC_SIZE - 1. */
|
||||
+ xorl %ecx, %ecx
|
||||
+ subl %edi, %ecx
|
||||
+ orq $(VEC_SIZE - 1), %rdi
|
||||
+ /* esi is for adjusting length to see if near the end. */
|
||||
+ leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
|
||||
+# else
|
||||
+ orq $(VEC_SIZE - 1), %rdi
|
||||
+L(cross_page_continue):
|
||||
+# endif
|
||||
+ /* Load first VEC regardless. */
|
||||
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
|
||||
vpmovmskb %ymm1, %eax
|
||||
+# ifndef USE_AS_RAWMEMCHR
|
||||
+ /* Adjust length. If near end handle specially. */
|
||||
+ subq %rsi, %rdx
|
||||
+ jbe L(last_4x_vec_or_less)
|
||||
+# endif
|
||||
testl %eax, %eax
|
||||
jnz L(first_vec_x1)
|
||||
|
||||
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
|
||||
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
|
||||
vpmovmskb %ymm1, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(first_vec_x2)
|
||||
|
||||
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
|
||||
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
|
||||
vpmovmskb %ymm1, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(first_vec_x3)
|
||||
|
||||
- addq $(VEC_SIZE * 4), %rdi
|
||||
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(first_vec_x4)
|
||||
|
||||
# ifndef USE_AS_RAWMEMCHR
|
||||
+ /* Check if at last VEC_SIZE * 4 length. */
|
||||
subq $(VEC_SIZE * 4), %rdx
|
||||
- jbe L(last_4x_vec_or_less)
|
||||
-# endif
|
||||
-
|
||||
- /* Align data to 4 * VEC_SIZE. */
|
||||
- movq %rdi, %rcx
|
||||
- andl $(4 * VEC_SIZE - 1), %ecx
|
||||
- andq $-(4 * VEC_SIZE), %rdi
|
||||
-
|
||||
-# ifndef USE_AS_RAWMEMCHR
|
||||
- /* Adjust length. */
|
||||
+ jbe L(last_4x_vec_or_less_cmpeq)
|
||||
+ /* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
|
||||
+ length. */
|
||||
+ incq %rdi
|
||||
+ movl %edi, %ecx
|
||||
+ orq $(VEC_SIZE * 4 - 1), %rdi
|
||||
+ andl $(VEC_SIZE * 4 - 1), %ecx
|
||||
addq %rcx, %rdx
|
||||
+# else
|
||||
+ /* Align data to VEC_SIZE * 4 - 1 for loop. */
|
||||
+ incq %rdi
|
||||
+ orq $(VEC_SIZE * 4 - 1), %rdi
|
||||
# endif
|
||||
|
||||
+ /* Compare 4 * VEC at a time forward. */
|
||||
.p2align 4
|
||||
L(loop_4x_vec):
|
||||
- /* Compare 4 * VEC at a time forward. */
|
||||
- VPCMPEQ (%rdi), %ymm0, %ymm1
|
||||
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
|
||||
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
|
||||
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
|
||||
-
|
||||
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
|
||||
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
|
||||
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
|
||||
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
|
||||
vpor %ymm1, %ymm2, %ymm5
|
||||
vpor %ymm3, %ymm4, %ymm6
|
||||
vpor %ymm5, %ymm6, %ymm5
|
||||
|
||||
- vpmovmskb %ymm5, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(4x_vec_end)
|
||||
-
|
||||
- addq $(VEC_SIZE * 4), %rdi
|
||||
-
|
||||
+ vpmovmskb %ymm5, %ecx
|
||||
# ifdef USE_AS_RAWMEMCHR
|
||||
- jmp L(loop_4x_vec)
|
||||
+ subq $-(VEC_SIZE * 4), %rdi
|
||||
+ testl %ecx, %ecx
|
||||
+ jz L(loop_4x_vec)
|
||||
# else
|
||||
- subq $(VEC_SIZE * 4), %rdx
|
||||
- ja L(loop_4x_vec)
|
||||
+ testl %ecx, %ecx
|
||||
+ jnz L(loop_4x_vec_end)
|
||||
|
||||
-L(last_4x_vec_or_less):
|
||||
- /* Less than 4 * VEC and aligned to VEC_SIZE. */
|
||||
- addl $(VEC_SIZE * 2), %edx
|
||||
- jle L(last_2x_vec)
|
||||
+ subq $-(VEC_SIZE * 4), %rdi
|
||||
|
||||
- VPCMPEQ (%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x0)
|
||||
+ subq $(VEC_SIZE * 4), %rdx
|
||||
+ ja L(loop_4x_vec)
|
||||
|
||||
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
|
||||
+ /* Fall through into less than 4 remaining vectors of length case.
|
||||
+ */
|
||||
+ VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
|
||||
vpmovmskb %ymm1, %eax
|
||||
+ .p2align 4
|
||||
+L(last_4x_vec_or_less):
|
||||
+ /* Check if first VEC contained match. */
|
||||
testl %eax, %eax
|
||||
- jnz L(first_vec_x1)
|
||||
+ jnz L(first_vec_x1_check)
|
||||
|
||||
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
- testl %eax, %eax
|
||||
+ /* If remaining length > VEC_SIZE * 2. */
|
||||
+ addl $(VEC_SIZE * 2), %edx
|
||||
+ jg L(last_4x_vec)
|
||||
|
||||
- jnz L(first_vec_x2_check)
|
||||
- subl $VEC_SIZE, %edx
|
||||
- jle L(zero)
|
||||
+L(last_2x_vec):
|
||||
+ /* If remaining length < VEC_SIZE. */
|
||||
+ addl $VEC_SIZE, %edx
|
||||
+ jle L(zero_end)
|
||||
|
||||
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
|
||||
+ /* Check VEC2 and compare any match with remaining length. */
|
||||
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
|
||||
vpmovmskb %ymm1, %eax
|
||||
- testl %eax, %eax
|
||||
-
|
||||
- jnz L(first_vec_x3_check)
|
||||
- xorl %eax, %eax
|
||||
+ tzcntl %eax, %eax
|
||||
+ cmpl %eax, %edx
|
||||
+ jbe L(set_zero_end)
|
||||
+ addq $(VEC_SIZE + 1), %rdi
|
||||
+ addq %rdi, %rax
|
||||
+L(zero_end):
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
-L(last_2x_vec):
|
||||
- addl $(VEC_SIZE * 2), %edx
|
||||
- VPCMPEQ (%rdi), %ymm0, %ymm1
|
||||
+L(loop_4x_vec_end):
|
||||
+# endif
|
||||
+ /* rawmemchr will fall through into this if match was found in
|
||||
+ loop. */
|
||||
+
|
||||
vpmovmskb %ymm1, %eax
|
||||
testl %eax, %eax
|
||||
+ jnz L(last_vec_x1_return)
|
||||
|
||||
- jnz L(first_vec_x0_check)
|
||||
- subl $VEC_SIZE, %edx
|
||||
- jle L(zero)
|
||||
-
|
||||
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
+ vpmovmskb %ymm2, %eax
|
||||
testl %eax, %eax
|
||||
- jnz L(first_vec_x1_check)
|
||||
- xorl %eax, %eax
|
||||
- VZEROUPPER_RETURN
|
||||
+ jnz L(last_vec_x2_return)
|
||||
|
||||
- .p2align 4
|
||||
-L(first_vec_x0_check):
|
||||
- tzcntl %eax, %eax
|
||||
- /* Check the end of data. */
|
||||
- cmpq %rax, %rdx
|
||||
- jbe L(zero)
|
||||
+ vpmovmskb %ymm3, %eax
|
||||
+ /* Combine VEC3 matches (eax) with VEC4 matches (ecx). */
|
||||
+ salq $32, %rcx
|
||||
+ orq %rcx, %rax
|
||||
+ tzcntq %rax, %rax
|
||||
+# ifdef USE_AS_RAWMEMCHR
|
||||
+ subq $(VEC_SIZE * 2 - 1), %rdi
|
||||
+# else
|
||||
+ subq $-(VEC_SIZE * 2 + 1), %rdi
|
||||
+# endif
|
||||
addq %rdi, %rax
|
||||
VZEROUPPER_RETURN
|
||||
+# ifndef USE_AS_RAWMEMCHR
|
||||
|
||||
.p2align 4
|
||||
L(first_vec_x1_check):
|
||||
tzcntl %eax, %eax
|
||||
- /* Check the end of data. */
|
||||
- cmpq %rax, %rdx
|
||||
- jbe L(zero)
|
||||
- addq $VEC_SIZE, %rax
|
||||
+ /* Adjust length. */
|
||||
+ subl $-(VEC_SIZE * 4), %edx
|
||||
+ /* Check if match within remaining length. */
|
||||
+ cmpl %eax, %edx
|
||||
+ jbe L(set_zero_end)
|
||||
+ incq %rdi
|
||||
addq %rdi, %rax
|
||||
VZEROUPPER_RETURN
|
||||
+ .p2align 4
|
||||
+L(set_zero_end):
|
||||
+ xorl %eax, %eax
|
||||
+ VZEROUPPER_RETURN
|
||||
+# endif
|
||||
|
||||
.p2align 4
|
||||
-L(first_vec_x2_check):
|
||||
+L(last_vec_x1_return):
|
||||
tzcntl %eax, %eax
|
||||
- /* Check the end of data. */
|
||||
- cmpq %rax, %rdx
|
||||
- jbe L(zero)
|
||||
- addq $(VEC_SIZE * 2), %rax
|
||||
+# ifdef USE_AS_RAWMEMCHR
|
||||
+ subq $(VEC_SIZE * 4 - 1), %rdi
|
||||
+# else
|
||||
+ incq %rdi
|
||||
+# endif
|
||||
addq %rdi, %rax
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
-L(first_vec_x3_check):
|
||||
+L(last_vec_x2_return):
|
||||
tzcntl %eax, %eax
|
||||
- /* Check the end of data. */
|
||||
- cmpq %rax, %rdx
|
||||
- jbe L(zero)
|
||||
- addq $(VEC_SIZE * 3), %rax
|
||||
+# ifdef USE_AS_RAWMEMCHR
|
||||
+ subq $(VEC_SIZE * 3 - 1), %rdi
|
||||
+# else
|
||||
+ subq $-(VEC_SIZE + 1), %rdi
|
||||
+# endif
|
||||
addq %rdi, %rax
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
+# ifndef USE_AS_RAWMEMCHR
|
||||
.p2align 4
|
||||
-L(zero):
|
||||
- xorl %eax, %eax
|
||||
- jmp L(return_vzeroupper)
|
||||
+L(last_4x_vec_or_less_cmpeq):
|
||||
+ VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
+ subq $-(VEC_SIZE * 4), %rdi
|
||||
+ /* Check first VEC regardless. */
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(first_vec_x1_check)
|
||||
|
||||
+ /* If remaining length <= CHAR_PER_VEC * 2. */
|
||||
+ addl $(VEC_SIZE * 2), %edx
|
||||
+ jle L(last_2x_vec)
|
||||
.p2align 4
|
||||
-L(null):
|
||||
- xorl %eax, %eax
|
||||
- ret
|
||||
-# endif
|
||||
+L(last_4x_vec):
|
||||
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(last_vec_x2_return)
|
||||
|
||||
- .p2align 4
|
||||
-L(first_vec_x0):
|
||||
- tzcntl %eax, %eax
|
||||
- addq %rdi, %rax
|
||||
- VZEROUPPER_RETURN
|
||||
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
|
||||
- .p2align 4
|
||||
-L(first_vec_x1):
|
||||
- tzcntl %eax, %eax
|
||||
- addq $VEC_SIZE, %rax
|
||||
- addq %rdi, %rax
|
||||
- VZEROUPPER_RETURN
|
||||
+ /* Create mask for possible matches within remaining length. */
|
||||
+ movq $-1, %rcx
|
||||
+ bzhiq %rdx, %rcx, %rcx
|
||||
|
||||
- .p2align 4
|
||||
-L(first_vec_x2):
|
||||
+ /* Test matches in data against length match. */
|
||||
+ andl %ecx, %eax
|
||||
+ jnz L(last_vec_x3)
|
||||
+
|
||||
+ /* if remaining length <= VEC_SIZE * 3 (Note this is after
|
||||
+ remaining length was found to be > VEC_SIZE * 2. */
|
||||
+ subl $VEC_SIZE, %edx
|
||||
+ jbe L(zero_end2)
|
||||
+
|
||||
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
+ /* Shift remaining length mask for last VEC. */
|
||||
+ shrq $32, %rcx
|
||||
+ andl %ecx, %eax
|
||||
+ jz L(zero_end2)
|
||||
tzcntl %eax, %eax
|
||||
- addq $(VEC_SIZE * 2), %rax
|
||||
+ addq $(VEC_SIZE * 3 + 1), %rdi
|
||||
addq %rdi, %rax
|
||||
+L(zero_end2):
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
-L(4x_vec_end):
|
||||
- vpmovmskb %ymm1, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x0)
|
||||
- vpmovmskb %ymm2, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x1)
|
||||
- vpmovmskb %ymm3, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x2)
|
||||
- vpmovmskb %ymm4, %eax
|
||||
- testl %eax, %eax
|
||||
-L(first_vec_x3):
|
||||
+L(last_vec_x3):
|
||||
tzcntl %eax, %eax
|
||||
- addq $(VEC_SIZE * 3), %rax
|
||||
+ subq $-(VEC_SIZE * 2 + 1), %rdi
|
||||
addq %rdi, %rax
|
||||
VZEROUPPER_RETURN
|
||||
+# endif
|
||||
|
||||
END (MEMCHR)
|
||||
#endif
|
||||
--
|
||||
GitLab
|
||||
|
|
@ -1,388 +0,0 @@
|
|||
From 645a158978f9520e74074e8c14047503be4db0f0 Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Wed, 9 Jun 2021 16:25:32 -0400
|
||||
Subject: [PATCH] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 [BZ
|
||||
#27974]
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
This commit fixes the bug mentioned in the previous commit.
|
||||
|
||||
The previous implementations of wmemchr in these files relied
|
||||
on n * sizeof(wchar_t) which was not guranteed by the standard.
|
||||
|
||||
The new overflow tests added in the previous commit now
|
||||
pass (As well as all the other tests).
|
||||
|
||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/memchr.S | 77 +++++++++++++++++++-------
|
||||
sysdeps/x86_64/multiarch/memchr-avx2.S | 58 +++++++++++++------
|
||||
2 files changed, 98 insertions(+), 37 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
|
||||
index cb320257..24f9a0c5 100644
|
||||
--- a/sysdeps/x86_64/memchr.S
|
||||
+++ b/sysdeps/x86_64/memchr.S
|
||||
@@ -21,9 +21,11 @@
|
||||
#ifdef USE_AS_WMEMCHR
|
||||
# define MEMCHR wmemchr
|
||||
# define PCMPEQ pcmpeqd
|
||||
+# define CHAR_PER_VEC 4
|
||||
#else
|
||||
# define MEMCHR memchr
|
||||
# define PCMPEQ pcmpeqb
|
||||
+# define CHAR_PER_VEC 16
|
||||
#endif
|
||||
|
||||
/* fast SSE2 version with using pmaxub and 64 byte loop */
|
||||
@@ -33,15 +35,14 @@ ENTRY(MEMCHR)
|
||||
movd %esi, %xmm1
|
||||
mov %edi, %ecx
|
||||
|
||||
+#ifdef __ILP32__
|
||||
+ /* Clear the upper 32 bits. */
|
||||
+ movl %edx, %edx
|
||||
+#endif
|
||||
#ifdef USE_AS_WMEMCHR
|
||||
test %RDX_LP, %RDX_LP
|
||||
jz L(return_null)
|
||||
- shl $2, %RDX_LP
|
||||
#else
|
||||
-# ifdef __ILP32__
|
||||
- /* Clear the upper 32 bits. */
|
||||
- movl %edx, %edx
|
||||
-# endif
|
||||
punpcklbw %xmm1, %xmm1
|
||||
test %RDX_LP, %RDX_LP
|
||||
jz L(return_null)
|
||||
@@ -60,13 +61,16 @@ ENTRY(MEMCHR)
|
||||
test %eax, %eax
|
||||
|
||||
jnz L(matches_1)
|
||||
- sub $16, %rdx
|
||||
+ sub $CHAR_PER_VEC, %rdx
|
||||
jbe L(return_null)
|
||||
add $16, %rdi
|
||||
and $15, %ecx
|
||||
and $-16, %rdi
|
||||
+#ifdef USE_AS_WMEMCHR
|
||||
+ shr $2, %ecx
|
||||
+#endif
|
||||
add %rcx, %rdx
|
||||
- sub $64, %rdx
|
||||
+ sub $(CHAR_PER_VEC * 4), %rdx
|
||||
jbe L(exit_loop)
|
||||
jmp L(loop_prolog)
|
||||
|
||||
@@ -77,16 +81,21 @@ L(crosscache):
|
||||
movdqa (%rdi), %xmm0
|
||||
|
||||
PCMPEQ %xmm1, %xmm0
|
||||
-/* Check if there is a match. */
|
||||
+ /* Check if there is a match. */
|
||||
pmovmskb %xmm0, %eax
|
||||
-/* Remove the leading bytes. */
|
||||
+ /* Remove the leading bytes. */
|
||||
sar %cl, %eax
|
||||
test %eax, %eax
|
||||
je L(unaligned_no_match)
|
||||
-/* Check which byte is a match. */
|
||||
+ /* Check which byte is a match. */
|
||||
bsf %eax, %eax
|
||||
-
|
||||
+#ifdef USE_AS_WMEMCHR
|
||||
+ mov %eax, %esi
|
||||
+ shr $2, %esi
|
||||
+ sub %rsi, %rdx
|
||||
+#else
|
||||
sub %rax, %rdx
|
||||
+#endif
|
||||
jbe L(return_null)
|
||||
add %rdi, %rax
|
||||
add %rcx, %rax
|
||||
@@ -94,15 +103,18 @@ L(crosscache):
|
||||
|
||||
.p2align 4
|
||||
L(unaligned_no_match):
|
||||
- /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using
|
||||
+ /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using
|
||||
"rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void
|
||||
possible addition overflow. */
|
||||
neg %rcx
|
||||
add $16, %rcx
|
||||
+#ifdef USE_AS_WMEMCHR
|
||||
+ shr $2, %ecx
|
||||
+#endif
|
||||
sub %rcx, %rdx
|
||||
jbe L(return_null)
|
||||
add $16, %rdi
|
||||
- sub $64, %rdx
|
||||
+ sub $(CHAR_PER_VEC * 4), %rdx
|
||||
jbe L(exit_loop)
|
||||
|
||||
.p2align 4
|
||||
@@ -135,7 +147,7 @@ L(loop_prolog):
|
||||
test $0x3f, %rdi
|
||||
jz L(align64_loop)
|
||||
|
||||
- sub $64, %rdx
|
||||
+ sub $(CHAR_PER_VEC * 4), %rdx
|
||||
jbe L(exit_loop)
|
||||
|
||||
movdqa (%rdi), %xmm0
|
||||
@@ -167,11 +179,14 @@ L(loop_prolog):
|
||||
mov %rdi, %rcx
|
||||
and $-64, %rdi
|
||||
and $63, %ecx
|
||||
+#ifdef USE_AS_WMEMCHR
|
||||
+ shr $2, %ecx
|
||||
+#endif
|
||||
add %rcx, %rdx
|
||||
|
||||
.p2align 4
|
||||
L(align64_loop):
|
||||
- sub $64, %rdx
|
||||
+ sub $(CHAR_PER_VEC * 4), %rdx
|
||||
jbe L(exit_loop)
|
||||
movdqa (%rdi), %xmm0
|
||||
movdqa 16(%rdi), %xmm2
|
||||
@@ -218,7 +233,7 @@ L(align64_loop):
|
||||
|
||||
.p2align 4
|
||||
L(exit_loop):
|
||||
- add $32, %edx
|
||||
+ add $(CHAR_PER_VEC * 2), %edx
|
||||
jle L(exit_loop_32)
|
||||
|
||||
movdqa (%rdi), %xmm0
|
||||
@@ -238,7 +253,7 @@ L(exit_loop):
|
||||
pmovmskb %xmm3, %eax
|
||||
test %eax, %eax
|
||||
jnz L(matches32_1)
|
||||
- sub $16, %edx
|
||||
+ sub $CHAR_PER_VEC, %edx
|
||||
jle L(return_null)
|
||||
|
||||
PCMPEQ 48(%rdi), %xmm1
|
||||
@@ -250,13 +265,13 @@ L(exit_loop):
|
||||
|
||||
.p2align 4
|
||||
L(exit_loop_32):
|
||||
- add $32, %edx
|
||||
+ add $(CHAR_PER_VEC * 2), %edx
|
||||
movdqa (%rdi), %xmm0
|
||||
PCMPEQ %xmm1, %xmm0
|
||||
pmovmskb %xmm0, %eax
|
||||
test %eax, %eax
|
||||
jnz L(matches_1)
|
||||
- sub $16, %edx
|
||||
+ sub $CHAR_PER_VEC, %edx
|
||||
jbe L(return_null)
|
||||
|
||||
PCMPEQ 16(%rdi), %xmm1
|
||||
@@ -293,7 +308,13 @@ L(matches32):
|
||||
.p2align 4
|
||||
L(matches_1):
|
||||
bsf %eax, %eax
|
||||
+#ifdef USE_AS_WMEMCHR
|
||||
+ mov %eax, %esi
|
||||
+ shr $2, %esi
|
||||
+ sub %rsi, %rdx
|
||||
+#else
|
||||
sub %rax, %rdx
|
||||
+#endif
|
||||
jbe L(return_null)
|
||||
add %rdi, %rax
|
||||
ret
|
||||
@@ -301,7 +322,13 @@ L(matches_1):
|
||||
.p2align 4
|
||||
L(matches16_1):
|
||||
bsf %eax, %eax
|
||||
+#ifdef USE_AS_WMEMCHR
|
||||
+ mov %eax, %esi
|
||||
+ shr $2, %esi
|
||||
+ sub %rsi, %rdx
|
||||
+#else
|
||||
sub %rax, %rdx
|
||||
+#endif
|
||||
jbe L(return_null)
|
||||
lea 16(%rdi, %rax), %rax
|
||||
ret
|
||||
@@ -309,7 +336,13 @@ L(matches16_1):
|
||||
.p2align 4
|
||||
L(matches32_1):
|
||||
bsf %eax, %eax
|
||||
+#ifdef USE_AS_WMEMCHR
|
||||
+ mov %eax, %esi
|
||||
+ shr $2, %esi
|
||||
+ sub %rsi, %rdx
|
||||
+#else
|
||||
sub %rax, %rdx
|
||||
+#endif
|
||||
jbe L(return_null)
|
||||
lea 32(%rdi, %rax), %rax
|
||||
ret
|
||||
@@ -317,7 +350,13 @@ L(matches32_1):
|
||||
.p2align 4
|
||||
L(matches48_1):
|
||||
bsf %eax, %eax
|
||||
+#ifdef USE_AS_WMEMCHR
|
||||
+ mov %eax, %esi
|
||||
+ shr $2, %esi
|
||||
+ sub %rsi, %rdx
|
||||
+#else
|
||||
sub %rax, %rdx
|
||||
+#endif
|
||||
jbe L(return_null)
|
||||
lea 48(%rdi, %rax), %rax
|
||||
ret
|
||||
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
|
||||
index b377f22e..16027abb 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
|
||||
@@ -54,21 +54,19 @@
|
||||
|
||||
# define VEC_SIZE 32
|
||||
# define PAGE_SIZE 4096
|
||||
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
|
||||
|
||||
.section SECTION(.text),"ax",@progbits
|
||||
ENTRY (MEMCHR)
|
||||
# ifndef USE_AS_RAWMEMCHR
|
||||
/* Check for zero length. */
|
||||
- test %RDX_LP, %RDX_LP
|
||||
- jz L(null)
|
||||
-# endif
|
||||
-# ifdef USE_AS_WMEMCHR
|
||||
- shl $2, %RDX_LP
|
||||
-# else
|
||||
# ifdef __ILP32__
|
||||
- /* Clear the upper 32 bits. */
|
||||
- movl %edx, %edx
|
||||
+ /* Clear upper bits. */
|
||||
+ and %RDX_LP, %RDX_LP
|
||||
+# else
|
||||
+ test %RDX_LP, %RDX_LP
|
||||
# endif
|
||||
+ jz L(null)
|
||||
# endif
|
||||
/* Broadcast CHAR to YMMMATCH. */
|
||||
vmovd %esi, %xmm0
|
||||
@@ -84,7 +82,7 @@ ENTRY (MEMCHR)
|
||||
vpmovmskb %ymm1, %eax
|
||||
# ifndef USE_AS_RAWMEMCHR
|
||||
/* If length < CHAR_PER_VEC handle special. */
|
||||
- cmpq $VEC_SIZE, %rdx
|
||||
+ cmpq $CHAR_PER_VEC, %rdx
|
||||
jbe L(first_vec_x0)
|
||||
# endif
|
||||
testl %eax, %eax
|
||||
@@ -98,6 +96,10 @@ ENTRY (MEMCHR)
|
||||
L(first_vec_x0):
|
||||
/* Check if first match was before length. */
|
||||
tzcntl %eax, %eax
|
||||
+# ifdef USE_AS_WMEMCHR
|
||||
+ /* NB: Multiply length by 4 to get byte count. */
|
||||
+ sall $2, %edx
|
||||
+# endif
|
||||
xorl %ecx, %ecx
|
||||
cmpl %eax, %edx
|
||||
leaq (%rdi, %rax), %rax
|
||||
@@ -110,12 +112,12 @@ L(null):
|
||||
# endif
|
||||
.p2align 4
|
||||
L(cross_page_boundary):
|
||||
- /* Save pointer before aligning as its original value is necessary
|
||||
- for computer return address if byte is found or adjusting length
|
||||
- if it is not and this is memchr. */
|
||||
+ /* Save pointer before aligning as its original value is
|
||||
+ necessary for computer return address if byte is found or
|
||||
+ adjusting length if it is not and this is memchr. */
|
||||
movq %rdi, %rcx
|
||||
- /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
|
||||
- rdi for rawmemchr. */
|
||||
+ /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
|
||||
+ and rdi for rawmemchr. */
|
||||
orq $(VEC_SIZE - 1), %ALGN_PTR_REG
|
||||
VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
|
||||
vpmovmskb %ymm1, %eax
|
||||
@@ -124,6 +126,10 @@ L(cross_page_boundary):
|
||||
match). */
|
||||
leaq 1(%ALGN_PTR_REG), %rsi
|
||||
subq %RRAW_PTR_REG, %rsi
|
||||
+# ifdef USE_AS_WMEMCHR
|
||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
||||
+ shrl $2, %esi
|
||||
+# endif
|
||||
# endif
|
||||
/* Remove the leading bytes. */
|
||||
sarxl %ERAW_PTR_REG, %eax, %eax
|
||||
@@ -181,6 +187,10 @@ L(cross_page_continue):
|
||||
orq $(VEC_SIZE - 1), %rdi
|
||||
/* esi is for adjusting length to see if near the end. */
|
||||
leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
|
||||
+# ifdef USE_AS_WMEMCHR
|
||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
||||
+ sarl $2, %esi
|
||||
+# endif
|
||||
# else
|
||||
orq $(VEC_SIZE - 1), %rdi
|
||||
L(cross_page_continue):
|
||||
@@ -213,7 +223,7 @@ L(cross_page_continue):
|
||||
|
||||
# ifndef USE_AS_RAWMEMCHR
|
||||
/* Check if at last VEC_SIZE * 4 length. */
|
||||
- subq $(VEC_SIZE * 4), %rdx
|
||||
+ subq $(CHAR_PER_VEC * 4), %rdx
|
||||
jbe L(last_4x_vec_or_less_cmpeq)
|
||||
/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
|
||||
length. */
|
||||
@@ -221,6 +231,10 @@ L(cross_page_continue):
|
||||
movl %edi, %ecx
|
||||
orq $(VEC_SIZE * 4 - 1), %rdi
|
||||
andl $(VEC_SIZE * 4 - 1), %ecx
|
||||
+# ifdef USE_AS_WMEMCHR
|
||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
||||
+ sarl $2, %ecx
|
||||
+# endif
|
||||
addq %rcx, %rdx
|
||||
# else
|
||||
/* Align data to VEC_SIZE * 4 - 1 for loop. */
|
||||
@@ -250,15 +264,19 @@ L(loop_4x_vec):
|
||||
|
||||
subq $-(VEC_SIZE * 4), %rdi
|
||||
|
||||
- subq $(VEC_SIZE * 4), %rdx
|
||||
+ subq $(CHAR_PER_VEC * 4), %rdx
|
||||
ja L(loop_4x_vec)
|
||||
|
||||
- /* Fall through into less than 4 remaining vectors of length case.
|
||||
- */
|
||||
+ /* Fall through into less than 4 remaining vectors of length
|
||||
+ case. */
|
||||
VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
|
||||
vpmovmskb %ymm1, %eax
|
||||
.p2align 4
|
||||
L(last_4x_vec_or_less):
|
||||
+# ifdef USE_AS_WMEMCHR
|
||||
+ /* NB: Multiply length by 4 to get byte count. */
|
||||
+ sall $2, %edx
|
||||
+# endif
|
||||
/* Check if first VEC contained match. */
|
||||
testl %eax, %eax
|
||||
jnz L(first_vec_x1_check)
|
||||
@@ -355,6 +373,10 @@ L(last_vec_x2_return):
|
||||
L(last_4x_vec_or_less_cmpeq):
|
||||
VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
|
||||
vpmovmskb %ymm1, %eax
|
||||
+# ifdef USE_AS_WMEMCHR
|
||||
+ /* NB: Multiply length by 4 to get byte count. */
|
||||
+ sall $2, %edx
|
||||
+# endif
|
||||
subq $-(VEC_SIZE * 4), %rdi
|
||||
/* Check first VEC regardless. */
|
||||
testl %eax, %eax
|
||||
--
|
||||
GitLab
|
||||
|
|
@ -1,767 +0,0 @@
|
|||
From aaa23c35071537e2dcf5807e956802ed215210aa Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Mon, 19 Apr 2021 19:36:07 -0400
|
||||
Subject: [PATCH] x86: Optimize strlen-avx2.S
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
No bug. This commit optimizes strlen-avx2.S. The optimizations are
|
||||
mostly small things but they add up to roughly 10-30% performance
|
||||
improvement for strlen. The results for strnlen are bit more
|
||||
ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen
|
||||
are all passing.
|
||||
|
||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 +-
|
||||
sysdeps/x86_64/multiarch/strlen-avx2.S | 532 +++++++++++++--------
|
||||
2 files changed, 334 insertions(+), 214 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
index cbfc1a5d..f1a6460a 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
@@ -285,10 +285,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
/* Support sysdeps/x86_64/multiarch/strlen.c. */
|
||||
IFUNC_IMPL (i, name, strlen,
|
||||
IFUNC_IMPL_ADD (array, i, strlen,
|
||||
- CPU_FEATURE_USABLE (AVX2),
|
||||
+ (CPU_FEATURE_USABLE (AVX2)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
||||
__strlen_avx2)
|
||||
IFUNC_IMPL_ADD (array, i, strlen,
|
||||
(CPU_FEATURE_USABLE (AVX2)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)
|
||||
&& CPU_FEATURE_USABLE (RTM)),
|
||||
__strlen_avx2_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, strlen,
|
||||
@@ -301,10 +303,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
/* Support sysdeps/x86_64/multiarch/strnlen.c. */
|
||||
IFUNC_IMPL (i, name, strnlen,
|
||||
IFUNC_IMPL_ADD (array, i, strnlen,
|
||||
- CPU_FEATURE_USABLE (AVX2),
|
||||
+ (CPU_FEATURE_USABLE (AVX2)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
||||
__strnlen_avx2)
|
||||
IFUNC_IMPL_ADD (array, i, strnlen,
|
||||
(CPU_FEATURE_USABLE (AVX2)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)
|
||||
&& CPU_FEATURE_USABLE (RTM)),
|
||||
__strnlen_avx2_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, strnlen,
|
||||
@@ -640,10 +644,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
/* Support sysdeps/x86_64/multiarch/wcslen.c. */
|
||||
IFUNC_IMPL (i, name, wcslen,
|
||||
IFUNC_IMPL_ADD (array, i, wcslen,
|
||||
- CPU_FEATURE_USABLE (AVX2),
|
||||
+ (CPU_FEATURE_USABLE (AVX2)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
||||
__wcslen_avx2)
|
||||
IFUNC_IMPL_ADD (array, i, wcslen,
|
||||
(CPU_FEATURE_USABLE (AVX2)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)
|
||||
&& CPU_FEATURE_USABLE (RTM)),
|
||||
__wcslen_avx2_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, wcslen,
|
||||
@@ -656,10 +662,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
/* Support sysdeps/x86_64/multiarch/wcsnlen.c. */
|
||||
IFUNC_IMPL (i, name, wcsnlen,
|
||||
IFUNC_IMPL_ADD (array, i, wcsnlen,
|
||||
- CPU_FEATURE_USABLE (AVX2),
|
||||
+ (CPU_FEATURE_USABLE (AVX2)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
||||
__wcsnlen_avx2)
|
||||
IFUNC_IMPL_ADD (array, i, wcsnlen,
|
||||
(CPU_FEATURE_USABLE (AVX2)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)
|
||||
&& CPU_FEATURE_USABLE (RTM)),
|
||||
__wcsnlen_avx2_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, wcsnlen,
|
||||
diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
|
||||
index 82826e10..be8a5db5 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
|
||||
@@ -27,9 +27,11 @@
|
||||
# ifdef USE_AS_WCSLEN
|
||||
# define VPCMPEQ vpcmpeqd
|
||||
# define VPMINU vpminud
|
||||
+# define CHAR_SIZE 4
|
||||
# else
|
||||
# define VPCMPEQ vpcmpeqb
|
||||
# define VPMINU vpminub
|
||||
+# define CHAR_SIZE 1
|
||||
# endif
|
||||
|
||||
# ifndef VZEROUPPER
|
||||
@@ -41,349 +43,459 @@
|
||||
# endif
|
||||
|
||||
# define VEC_SIZE 32
|
||||
+# define PAGE_SIZE 4096
|
||||
|
||||
.section SECTION(.text),"ax",@progbits
|
||||
ENTRY (STRLEN)
|
||||
# ifdef USE_AS_STRNLEN
|
||||
- /* Check for zero length. */
|
||||
+ /* Check zero length. */
|
||||
test %RSI_LP, %RSI_LP
|
||||
jz L(zero)
|
||||
+ /* Store max len in R8_LP before adjusting if using WCSLEN. */
|
||||
+ mov %RSI_LP, %R8_LP
|
||||
# ifdef USE_AS_WCSLEN
|
||||
shl $2, %RSI_LP
|
||||
# elif defined __ILP32__
|
||||
/* Clear the upper 32 bits. */
|
||||
movl %esi, %esi
|
||||
# endif
|
||||
- mov %RSI_LP, %R8_LP
|
||||
# endif
|
||||
- movl %edi, %ecx
|
||||
+ movl %edi, %eax
|
||||
movq %rdi, %rdx
|
||||
vpxor %xmm0, %xmm0, %xmm0
|
||||
-
|
||||
+ /* Clear high bits from edi. Only keeping bits relevant to page
|
||||
+ cross check. */
|
||||
+ andl $(PAGE_SIZE - 1), %eax
|
||||
/* Check if we may cross page boundary with one vector load. */
|
||||
- andl $(2 * VEC_SIZE - 1), %ecx
|
||||
- cmpl $VEC_SIZE, %ecx
|
||||
- ja L(cros_page_boundary)
|
||||
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
||||
+ ja L(cross_page_boundary)
|
||||
|
||||
/* Check the first VEC_SIZE bytes. */
|
||||
- VPCMPEQ (%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
- testl %eax, %eax
|
||||
-
|
||||
+ VPCMPEQ (%rdi), %ymm0, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
# ifdef USE_AS_STRNLEN
|
||||
- jnz L(first_vec_x0_check)
|
||||
- /* Adjust length and check the end of data. */
|
||||
- subq $VEC_SIZE, %rsi
|
||||
- jbe L(max)
|
||||
-# else
|
||||
- jnz L(first_vec_x0)
|
||||
+ /* If length < VEC_SIZE handle special. */
|
||||
+ cmpq $VEC_SIZE, %rsi
|
||||
+ jbe L(first_vec_x0)
|
||||
# endif
|
||||
-
|
||||
- /* Align data for aligned loads in the loop. */
|
||||
- addq $VEC_SIZE, %rdi
|
||||
- andl $(VEC_SIZE - 1), %ecx
|
||||
- andq $-VEC_SIZE, %rdi
|
||||
+ /* If empty continue to aligned_more. Otherwise return bit
|
||||
+ position of first match. */
|
||||
+ testl %eax, %eax
|
||||
+ jz L(aligned_more)
|
||||
+ tzcntl %eax, %eax
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ shrl $2, %eax
|
||||
+# endif
|
||||
+ VZEROUPPER_RETURN
|
||||
|
||||
# ifdef USE_AS_STRNLEN
|
||||
- /* Adjust length. */
|
||||
- addq %rcx, %rsi
|
||||
+L(zero):
|
||||
+ xorl %eax, %eax
|
||||
+ ret
|
||||
|
||||
- subq $(VEC_SIZE * 4), %rsi
|
||||
- jbe L(last_4x_vec_or_less)
|
||||
+ .p2align 4
|
||||
+L(first_vec_x0):
|
||||
+ /* Set bit for max len so that tzcnt will return min of max len
|
||||
+ and position of first match. */
|
||||
+ btsq %rsi, %rax
|
||||
+ tzcntl %eax, %eax
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ shrl $2, %eax
|
||||
+# endif
|
||||
+ VZEROUPPER_RETURN
|
||||
# endif
|
||||
- jmp L(more_4x_vec)
|
||||
|
||||
.p2align 4
|
||||
-L(cros_page_boundary):
|
||||
- andl $(VEC_SIZE - 1), %ecx
|
||||
- andq $-VEC_SIZE, %rdi
|
||||
- VPCMPEQ (%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
- /* Remove the leading bytes. */
|
||||
- sarl %cl, %eax
|
||||
- testl %eax, %eax
|
||||
- jz L(aligned_more)
|
||||
+L(first_vec_x1):
|
||||
tzcntl %eax, %eax
|
||||
+ /* Safe to use 32 bit instructions as these are only called for
|
||||
+ size = [1, 159]. */
|
||||
# ifdef USE_AS_STRNLEN
|
||||
- /* Check the end of data. */
|
||||
- cmpq %rax, %rsi
|
||||
- jbe L(max)
|
||||
+ /* Use ecx which was computed earlier to compute correct value.
|
||||
+ */
|
||||
+ subl $(VEC_SIZE * 4 + 1), %ecx
|
||||
+ addl %ecx, %eax
|
||||
+# else
|
||||
+ subl %edx, %edi
|
||||
+ incl %edi
|
||||
+ addl %edi, %eax
|
||||
# endif
|
||||
- addq %rdi, %rax
|
||||
- addq %rcx, %rax
|
||||
- subq %rdx, %rax
|
||||
# ifdef USE_AS_WCSLEN
|
||||
- shrq $2, %rax
|
||||
+ shrl $2, %eax
|
||||
# endif
|
||||
-L(return_vzeroupper):
|
||||
- ZERO_UPPER_VEC_REGISTERS_RETURN
|
||||
+ VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
-L(aligned_more):
|
||||
+L(first_vec_x2):
|
||||
+ tzcntl %eax, %eax
|
||||
+ /* Safe to use 32 bit instructions as these are only called for
|
||||
+ size = [1, 159]. */
|
||||
# ifdef USE_AS_STRNLEN
|
||||
- /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE"
|
||||
- with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
|
||||
- to void possible addition overflow. */
|
||||
- negq %rcx
|
||||
- addq $VEC_SIZE, %rcx
|
||||
-
|
||||
- /* Check the end of data. */
|
||||
- subq %rcx, %rsi
|
||||
- jbe L(max)
|
||||
+ /* Use ecx which was computed earlier to compute correct value.
|
||||
+ */
|
||||
+ subl $(VEC_SIZE * 3 + 1), %ecx
|
||||
+ addl %ecx, %eax
|
||||
+# else
|
||||
+ subl %edx, %edi
|
||||
+ addl $(VEC_SIZE + 1), %edi
|
||||
+ addl %edi, %eax
|
||||
# endif
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ shrl $2, %eax
|
||||
+# endif
|
||||
+ VZEROUPPER_RETURN
|
||||
|
||||
- addq $VEC_SIZE, %rdi
|
||||
+ .p2align 4
|
||||
+L(first_vec_x3):
|
||||
+ tzcntl %eax, %eax
|
||||
+ /* Safe to use 32 bit instructions as these are only called for
|
||||
+ size = [1, 159]. */
|
||||
+# ifdef USE_AS_STRNLEN
|
||||
+ /* Use ecx which was computed earlier to compute correct value.
|
||||
+ */
|
||||
+ subl $(VEC_SIZE * 2 + 1), %ecx
|
||||
+ addl %ecx, %eax
|
||||
+# else
|
||||
+ subl %edx, %edi
|
||||
+ addl $(VEC_SIZE * 2 + 1), %edi
|
||||
+ addl %edi, %eax
|
||||
+# endif
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ shrl $2, %eax
|
||||
+# endif
|
||||
+ VZEROUPPER_RETURN
|
||||
|
||||
+ .p2align 4
|
||||
+L(first_vec_x4):
|
||||
+ tzcntl %eax, %eax
|
||||
+ /* Safe to use 32 bit instructions as these are only called for
|
||||
+ size = [1, 159]. */
|
||||
# ifdef USE_AS_STRNLEN
|
||||
- subq $(VEC_SIZE * 4), %rsi
|
||||
- jbe L(last_4x_vec_or_less)
|
||||
+ /* Use ecx which was computed earlier to compute correct value.
|
||||
+ */
|
||||
+ subl $(VEC_SIZE + 1), %ecx
|
||||
+ addl %ecx, %eax
|
||||
+# else
|
||||
+ subl %edx, %edi
|
||||
+ addl $(VEC_SIZE * 3 + 1), %edi
|
||||
+ addl %edi, %eax
|
||||
# endif
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ shrl $2, %eax
|
||||
+# endif
|
||||
+ VZEROUPPER_RETURN
|
||||
|
||||
-L(more_4x_vec):
|
||||
+ .p2align 5
|
||||
+L(aligned_more):
|
||||
+ /* Align data to VEC_SIZE - 1. This is the same number of
|
||||
+ instructions as using andq with -VEC_SIZE but saves 4 bytes of
|
||||
+ code on the x4 check. */
|
||||
+ orq $(VEC_SIZE - 1), %rdi
|
||||
+L(cross_page_continue):
|
||||
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
|
||||
since data is only aligned to VEC_SIZE. */
|
||||
- VPCMPEQ (%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x0)
|
||||
-
|
||||
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
+# ifdef USE_AS_STRNLEN
|
||||
+ /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
|
||||
+ it simplies the logic in last_4x_vec_or_less. */
|
||||
+ leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
|
||||
+ subq %rdx, %rcx
|
||||
+# endif
|
||||
+ /* Load first VEC regardless. */
|
||||
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
|
||||
+# ifdef USE_AS_STRNLEN
|
||||
+ /* Adjust length. If near end handle specially. */
|
||||
+ subq %rcx, %rsi
|
||||
+ jb L(last_4x_vec_or_less)
|
||||
+# endif
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(first_vec_x1)
|
||||
|
||||
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(first_vec_x2)
|
||||
|
||||
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(first_vec_x3)
|
||||
|
||||
- addq $(VEC_SIZE * 4), %rdi
|
||||
-
|
||||
-# ifdef USE_AS_STRNLEN
|
||||
- subq $(VEC_SIZE * 4), %rsi
|
||||
- jbe L(last_4x_vec_or_less)
|
||||
-# endif
|
||||
-
|
||||
- /* Align data to 4 * VEC_SIZE. */
|
||||
- movq %rdi, %rcx
|
||||
- andl $(4 * VEC_SIZE - 1), %ecx
|
||||
- andq $-(4 * VEC_SIZE), %rdi
|
||||
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(first_vec_x4)
|
||||
|
||||
+ /* Align data to VEC_SIZE * 4 - 1. */
|
||||
# ifdef USE_AS_STRNLEN
|
||||
- /* Adjust length. */
|
||||
+ /* Before adjusting length check if at last VEC_SIZE * 4. */
|
||||
+ cmpq $(VEC_SIZE * 4 - 1), %rsi
|
||||
+ jbe L(last_4x_vec_or_less_load)
|
||||
+ incq %rdi
|
||||
+ movl %edi, %ecx
|
||||
+ orq $(VEC_SIZE * 4 - 1), %rdi
|
||||
+ andl $(VEC_SIZE * 4 - 1), %ecx
|
||||
+ /* Readjust length. */
|
||||
addq %rcx, %rsi
|
||||
+# else
|
||||
+ incq %rdi
|
||||
+ orq $(VEC_SIZE * 4 - 1), %rdi
|
||||
# endif
|
||||
-
|
||||
+ /* Compare 4 * VEC at a time forward. */
|
||||
.p2align 4
|
||||
L(loop_4x_vec):
|
||||
- /* Compare 4 * VEC at a time forward. */
|
||||
- vmovdqa (%rdi), %ymm1
|
||||
- vmovdqa VEC_SIZE(%rdi), %ymm2
|
||||
- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3
|
||||
- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4
|
||||
- VPMINU %ymm1, %ymm2, %ymm5
|
||||
- VPMINU %ymm3, %ymm4, %ymm6
|
||||
- VPMINU %ymm5, %ymm6, %ymm5
|
||||
-
|
||||
- VPCMPEQ %ymm5, %ymm0, %ymm5
|
||||
- vpmovmskb %ymm5, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(4x_vec_end)
|
||||
-
|
||||
- addq $(VEC_SIZE * 4), %rdi
|
||||
-
|
||||
-# ifndef USE_AS_STRNLEN
|
||||
- jmp L(loop_4x_vec)
|
||||
-# else
|
||||
+# ifdef USE_AS_STRNLEN
|
||||
+ /* Break if at end of length. */
|
||||
subq $(VEC_SIZE * 4), %rsi
|
||||
- ja L(loop_4x_vec)
|
||||
-
|
||||
-L(last_4x_vec_or_less):
|
||||
- /* Less than 4 * VEC and aligned to VEC_SIZE. */
|
||||
- addl $(VEC_SIZE * 2), %esi
|
||||
- jle L(last_2x_vec)
|
||||
+ jb L(last_4x_vec_or_less_cmpeq)
|
||||
+# endif
|
||||
+ /* Save some code size by microfusing VPMINU with the load. Since
|
||||
+ the matches in ymm2/ymm4 can only be returned if there where no
|
||||
+ matches in ymm1/ymm3 respectively there is no issue with overlap.
|
||||
+ */
|
||||
+ vmovdqa 1(%rdi), %ymm1
|
||||
+ VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
|
||||
+ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
|
||||
+ VPMINU (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4
|
||||
+
|
||||
+ VPMINU %ymm2, %ymm4, %ymm5
|
||||
+ VPCMPEQ %ymm5, %ymm0, %ymm5
|
||||
+ vpmovmskb %ymm5, %ecx
|
||||
|
||||
- VPCMPEQ (%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x0)
|
||||
+ subq $-(VEC_SIZE * 4), %rdi
|
||||
+ testl %ecx, %ecx
|
||||
+ jz L(loop_4x_vec)
|
||||
|
||||
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x1)
|
||||
|
||||
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
+ VPCMPEQ %ymm1, %ymm0, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
+ subq %rdx, %rdi
|
||||
testl %eax, %eax
|
||||
+ jnz L(last_vec_return_x0)
|
||||
|
||||
- jnz L(first_vec_x2_check)
|
||||
- subl $VEC_SIZE, %esi
|
||||
- jle L(max)
|
||||
-
|
||||
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
+ VPCMPEQ %ymm2, %ymm0, %ymm2
|
||||
+ vpmovmskb %ymm2, %eax
|
||||
testl %eax, %eax
|
||||
-
|
||||
- jnz L(first_vec_x3_check)
|
||||
- movq %r8, %rax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
+ jnz L(last_vec_return_x1)
|
||||
+
|
||||
+ /* Combine last 2 VEC. */
|
||||
+ VPCMPEQ %ymm3, %ymm0, %ymm3
|
||||
+ vpmovmskb %ymm3, %eax
|
||||
+ /* rcx has combined result from all 4 VEC. It will only be used if
|
||||
+ the first 3 other VEC all did not contain a match. */
|
||||
+ salq $32, %rcx
|
||||
+ orq %rcx, %rax
|
||||
+ tzcntq %rax, %rax
|
||||
+ subq $(VEC_SIZE * 2 - 1), %rdi
|
||||
+ addq %rdi, %rax
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
shrq $2, %rax
|
||||
-# endif
|
||||
+# endif
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
+
|
||||
+# ifdef USE_AS_STRNLEN
|
||||
.p2align 4
|
||||
-L(last_2x_vec):
|
||||
- addl $(VEC_SIZE * 2), %esi
|
||||
- VPCMPEQ (%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
- testl %eax, %eax
|
||||
+L(last_4x_vec_or_less_load):
|
||||
+ /* Depending on entry adjust rdi / prepare first VEC in ymm1. */
|
||||
+ subq $-(VEC_SIZE * 4), %rdi
|
||||
+L(last_4x_vec_or_less_cmpeq):
|
||||
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
|
||||
+L(last_4x_vec_or_less):
|
||||
|
||||
- jnz L(first_vec_x0_check)
|
||||
- subl $VEC_SIZE, %esi
|
||||
- jle L(max)
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
+ /* If remaining length > VEC_SIZE * 2. This works if esi is off by
|
||||
+ VEC_SIZE * 4. */
|
||||
+ testl $(VEC_SIZE * 2), %esi
|
||||
+ jnz L(last_4x_vec)
|
||||
|
||||
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
+ /* length may have been negative or positive by an offset of
|
||||
+ VEC_SIZE * 4 depending on where this was called from. This fixes
|
||||
+ that. */
|
||||
+ andl $(VEC_SIZE * 4 - 1), %esi
|
||||
testl %eax, %eax
|
||||
- jnz L(first_vec_x1_check)
|
||||
- movq %r8, %rax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- shrq $2, %rax
|
||||
-# endif
|
||||
- VZEROUPPER_RETURN
|
||||
+ jnz L(last_vec_x1_check)
|
||||
|
||||
- .p2align 4
|
||||
-L(first_vec_x0_check):
|
||||
+ subl $VEC_SIZE, %esi
|
||||
+ jb L(max)
|
||||
+
|
||||
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
tzcntl %eax, %eax
|
||||
/* Check the end of data. */
|
||||
- cmpq %rax, %rsi
|
||||
- jbe L(max)
|
||||
+ cmpl %eax, %esi
|
||||
+ jb L(max)
|
||||
+ subq %rdx, %rdi
|
||||
+ addl $(VEC_SIZE + 1), %eax
|
||||
addq %rdi, %rax
|
||||
- subq %rdx, %rax
|
||||
# ifdef USE_AS_WCSLEN
|
||||
shrq $2, %rax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
+# endif
|
||||
|
||||
.p2align 4
|
||||
-L(first_vec_x1_check):
|
||||
+L(last_vec_return_x0):
|
||||
tzcntl %eax, %eax
|
||||
- /* Check the end of data. */
|
||||
- cmpq %rax, %rsi
|
||||
- jbe L(max)
|
||||
- addq $VEC_SIZE, %rax
|
||||
+ subq $(VEC_SIZE * 4 - 1), %rdi
|
||||
addq %rdi, %rax
|
||||
- subq %rdx, %rax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
shrq $2, %rax
|
||||
-# endif
|
||||
+# endif
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
-L(first_vec_x2_check):
|
||||
+L(last_vec_return_x1):
|
||||
tzcntl %eax, %eax
|
||||
- /* Check the end of data. */
|
||||
- cmpq %rax, %rsi
|
||||
- jbe L(max)
|
||||
- addq $(VEC_SIZE * 2), %rax
|
||||
+ subq $(VEC_SIZE * 3 - 1), %rdi
|
||||
addq %rdi, %rax
|
||||
- subq %rdx, %rax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
shrq $2, %rax
|
||||
-# endif
|
||||
+# endif
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
+# ifdef USE_AS_STRNLEN
|
||||
.p2align 4
|
||||
-L(first_vec_x3_check):
|
||||
+L(last_vec_x1_check):
|
||||
+
|
||||
tzcntl %eax, %eax
|
||||
/* Check the end of data. */
|
||||
- cmpq %rax, %rsi
|
||||
- jbe L(max)
|
||||
- addq $(VEC_SIZE * 3), %rax
|
||||
+ cmpl %eax, %esi
|
||||
+ jb L(max)
|
||||
+ subq %rdx, %rdi
|
||||
+ incl %eax
|
||||
addq %rdi, %rax
|
||||
- subq %rdx, %rax
|
||||
# ifdef USE_AS_WCSLEN
|
||||
shrq $2, %rax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
- .p2align 4
|
||||
L(max):
|
||||
movq %r8, %rax
|
||||
+ VZEROUPPER_RETURN
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(last_4x_vec):
|
||||
+ /* Test first 2x VEC normally. */
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(last_vec_x1)
|
||||
+
|
||||
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(last_vec_x2)
|
||||
+
|
||||
+ /* Normalize length. */
|
||||
+ andl $(VEC_SIZE * 4 - 1), %esi
|
||||
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(last_vec_x3)
|
||||
+
|
||||
+ subl $(VEC_SIZE * 3), %esi
|
||||
+ jb L(max)
|
||||
+
|
||||
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
+ tzcntl %eax, %eax
|
||||
+ /* Check the end of data. */
|
||||
+ cmpl %eax, %esi
|
||||
+ jb L(max)
|
||||
+ subq %rdx, %rdi
|
||||
+ addl $(VEC_SIZE * 3 + 1), %eax
|
||||
+ addq %rdi, %rax
|
||||
# ifdef USE_AS_WCSLEN
|
||||
shrq $2, %rax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
- .p2align 4
|
||||
-L(zero):
|
||||
- xorl %eax, %eax
|
||||
- ret
|
||||
-# endif
|
||||
|
||||
.p2align 4
|
||||
-L(first_vec_x0):
|
||||
+L(last_vec_x1):
|
||||
+ /* essentially duplicates of first_vec_x1 but use 64 bit
|
||||
+ instructions. */
|
||||
tzcntl %eax, %eax
|
||||
+ subq %rdx, %rdi
|
||||
+ incl %eax
|
||||
addq %rdi, %rax
|
||||
- subq %rdx, %rax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
shrq $2, %rax
|
||||
-# endif
|
||||
+# endif
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
-L(first_vec_x1):
|
||||
+L(last_vec_x2):
|
||||
+ /* essentially duplicates of first_vec_x1 but use 64 bit
|
||||
+ instructions. */
|
||||
tzcntl %eax, %eax
|
||||
- addq $VEC_SIZE, %rax
|
||||
+ subq %rdx, %rdi
|
||||
+ addl $(VEC_SIZE + 1), %eax
|
||||
addq %rdi, %rax
|
||||
- subq %rdx, %rax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
shrq $2, %rax
|
||||
-# endif
|
||||
+# endif
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
-L(first_vec_x2):
|
||||
+L(last_vec_x3):
|
||||
tzcntl %eax, %eax
|
||||
- addq $(VEC_SIZE * 2), %rax
|
||||
+ subl $(VEC_SIZE * 2), %esi
|
||||
+ /* Check the end of data. */
|
||||
+ cmpl %eax, %esi
|
||||
+ jb L(max_end)
|
||||
+ subq %rdx, %rdi
|
||||
+ addl $(VEC_SIZE * 2 + 1), %eax
|
||||
addq %rdi, %rax
|
||||
- subq %rdx, %rax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
shrq $2, %rax
|
||||
-# endif
|
||||
+# endif
|
||||
+ VZEROUPPER_RETURN
|
||||
+L(max_end):
|
||||
+ movq %r8, %rax
|
||||
VZEROUPPER_RETURN
|
||||
+# endif
|
||||
|
||||
+ /* Cold case for crossing page with first load. */
|
||||
.p2align 4
|
||||
-L(4x_vec_end):
|
||||
- VPCMPEQ %ymm1, %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x0)
|
||||
- VPCMPEQ %ymm2, %ymm0, %ymm2
|
||||
- vpmovmskb %ymm2, %eax
|
||||
+L(cross_page_boundary):
|
||||
+ /* Align data to VEC_SIZE - 1. */
|
||||
+ orq $(VEC_SIZE - 1), %rdi
|
||||
+ VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
+ /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
|
||||
+ so no need to manually mod rdx. */
|
||||
+ sarxl %edx, %eax, %eax
|
||||
+# ifdef USE_AS_STRNLEN
|
||||
testl %eax, %eax
|
||||
- jnz L(first_vec_x1)
|
||||
- VPCMPEQ %ymm3, %ymm0, %ymm3
|
||||
- vpmovmskb %ymm3, %eax
|
||||
+ jnz L(cross_page_less_vec)
|
||||
+ leaq 1(%rdi), %rcx
|
||||
+ subq %rdx, %rcx
|
||||
+ /* Check length. */
|
||||
+ cmpq %rsi, %rcx
|
||||
+ jb L(cross_page_continue)
|
||||
+ movq %r8, %rax
|
||||
+# else
|
||||
testl %eax, %eax
|
||||
- jnz L(first_vec_x2)
|
||||
- VPCMPEQ %ymm4, %ymm0, %ymm4
|
||||
- vpmovmskb %ymm4, %eax
|
||||
-L(first_vec_x3):
|
||||
+ jz L(cross_page_continue)
|
||||
tzcntl %eax, %eax
|
||||
- addq $(VEC_SIZE * 3), %rax
|
||||
- addq %rdi, %rax
|
||||
- subq %rdx, %rax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- shrq $2, %rax
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ shrl $2, %eax
|
||||
+# endif
|
||||
# endif
|
||||
+L(return_vzeroupper):
|
||||
+ ZERO_UPPER_VEC_REGISTERS_RETURN
|
||||
+
|
||||
+# ifdef USE_AS_STRNLEN
|
||||
+ .p2align 4
|
||||
+L(cross_page_less_vec):
|
||||
+ tzcntl %eax, %eax
|
||||
+ cmpq %rax, %rsi
|
||||
+ cmovb %esi, %eax
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ shrl $2, %eax
|
||||
+# endif
|
||||
VZEROUPPER_RETURN
|
||||
+# endif
|
||||
|
||||
END (STRLEN)
|
||||
#endif
|
||||
--
|
||||
GitLab
|
||||
|
|
@ -1,701 +0,0 @@
|
|||
From 2a76821c3081d2c0231ecd2618f52662cb48fccd Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Mon, 3 May 2021 03:03:19 -0400
|
||||
Subject: [PATCH] x86: Optimize memchr-evex.S
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
No bug. This commit optimizes memchr-evex.S. The optimizations include
|
||||
replacing some branches with cmovcc, avoiding some branches entirely
|
||||
in the less_4x_vec case, making the page cross logic less strict,
|
||||
saving some ALU in the alignment process, and most importantly
|
||||
increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and
|
||||
test-wmemchr are all passing.
|
||||
|
||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/memchr-evex.S | 547 +++++++++++++++----------
|
||||
1 file changed, 322 insertions(+), 225 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
|
||||
index 6dd5d67b..81d5cd64 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
|
||||
@@ -26,14 +26,28 @@
|
||||
|
||||
# ifdef USE_AS_WMEMCHR
|
||||
# define VPBROADCAST vpbroadcastd
|
||||
-# define VPCMP vpcmpd
|
||||
-# define SHIFT_REG r8d
|
||||
+# define VPMINU vpminud
|
||||
+# define VPCMP vpcmpd
|
||||
+# define VPCMPEQ vpcmpeqd
|
||||
+# define CHAR_SIZE 4
|
||||
# else
|
||||
# define VPBROADCAST vpbroadcastb
|
||||
-# define VPCMP vpcmpb
|
||||
-# define SHIFT_REG ecx
|
||||
+# define VPMINU vpminub
|
||||
+# define VPCMP vpcmpb
|
||||
+# define VPCMPEQ vpcmpeqb
|
||||
+# define CHAR_SIZE 1
|
||||
# endif
|
||||
|
||||
+# ifdef USE_AS_RAWMEMCHR
|
||||
+# define RAW_PTR_REG rcx
|
||||
+# define ALGN_PTR_REG rdi
|
||||
+# else
|
||||
+# define RAW_PTR_REG rdi
|
||||
+# define ALGN_PTR_REG rcx
|
||||
+# endif
|
||||
+
|
||||
+# define XMMZERO xmm23
|
||||
+# define YMMZERO ymm23
|
||||
# define XMMMATCH xmm16
|
||||
# define YMMMATCH ymm16
|
||||
# define YMM1 ymm17
|
||||
@@ -44,6 +58,8 @@
|
||||
# define YMM6 ymm22
|
||||
|
||||
# define VEC_SIZE 32
|
||||
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
|
||||
+# define PAGE_SIZE 4096
|
||||
|
||||
.section .text.evex,"ax",@progbits
|
||||
ENTRY (MEMCHR)
|
||||
@@ -51,11 +67,7 @@ ENTRY (MEMCHR)
|
||||
/* Check for zero length. */
|
||||
test %RDX_LP, %RDX_LP
|
||||
jz L(zero)
|
||||
-# endif
|
||||
- movl %edi, %ecx
|
||||
-# ifdef USE_AS_WMEMCHR
|
||||
- shl $2, %RDX_LP
|
||||
-# else
|
||||
+
|
||||
# ifdef __ILP32__
|
||||
/* Clear the upper 32 bits. */
|
||||
movl %edx, %edx
|
||||
@@ -64,318 +76,403 @@ ENTRY (MEMCHR)
|
||||
/* Broadcast CHAR to YMMMATCH. */
|
||||
VPBROADCAST %esi, %YMMMATCH
|
||||
/* Check if we may cross page boundary with one vector load. */
|
||||
- andl $(2 * VEC_SIZE - 1), %ecx
|
||||
- cmpl $VEC_SIZE, %ecx
|
||||
- ja L(cros_page_boundary)
|
||||
+ movl %edi, %eax
|
||||
+ andl $(PAGE_SIZE - 1), %eax
|
||||
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
||||
+ ja L(cross_page_boundary)
|
||||
|
||||
/* Check the first VEC_SIZE bytes. */
|
||||
- VPCMP $0, (%rdi), %YMMMATCH, %k1
|
||||
- kmovd %k1, %eax
|
||||
- testl %eax, %eax
|
||||
-
|
||||
+ VPCMP $0, (%rdi), %YMMMATCH, %k0
|
||||
+ kmovd %k0, %eax
|
||||
# ifndef USE_AS_RAWMEMCHR
|
||||
- jnz L(first_vec_x0_check)
|
||||
- /* Adjust length and check the end of data. */
|
||||
- subq $VEC_SIZE, %rdx
|
||||
- jbe L(zero)
|
||||
+ /* If length < CHAR_PER_VEC handle special. */
|
||||
+ cmpq $CHAR_PER_VEC, %rdx
|
||||
+ jbe L(first_vec_x0)
|
||||
+# endif
|
||||
+ testl %eax, %eax
|
||||
+ jz L(aligned_more)
|
||||
+ tzcntl %eax, %eax
|
||||
+# ifdef USE_AS_WMEMCHR
|
||||
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
|
||||
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
|
||||
# else
|
||||
- jnz L(first_vec_x0)
|
||||
+ addq %rdi, %rax
|
||||
# endif
|
||||
-
|
||||
- /* Align data for aligned loads in the loop. */
|
||||
- addq $VEC_SIZE, %rdi
|
||||
- andl $(VEC_SIZE - 1), %ecx
|
||||
- andq $-VEC_SIZE, %rdi
|
||||
+ ret
|
||||
|
||||
# ifndef USE_AS_RAWMEMCHR
|
||||
- /* Adjust length. */
|
||||
- addq %rcx, %rdx
|
||||
-
|
||||
- subq $(VEC_SIZE * 4), %rdx
|
||||
- jbe L(last_4x_vec_or_less)
|
||||
-# endif
|
||||
- jmp L(more_4x_vec)
|
||||
+L(zero):
|
||||
+ xorl %eax, %eax
|
||||
+ ret
|
||||
|
||||
+ .p2align 5
|
||||
+L(first_vec_x0):
|
||||
+ /* Check if first match was before length. */
|
||||
+ tzcntl %eax, %eax
|
||||
+ xorl %ecx, %ecx
|
||||
+ cmpl %eax, %edx
|
||||
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
|
||||
+ cmovle %rcx, %rax
|
||||
+ ret
|
||||
+# else
|
||||
+ /* NB: first_vec_x0 is 17 bytes which will leave
|
||||
+ cross_page_boundary (which is relatively cold) close enough
|
||||
+ to ideal alignment. So only realign L(cross_page_boundary) if
|
||||
+ rawmemchr. */
|
||||
.p2align 4
|
||||
-L(cros_page_boundary):
|
||||
- andl $(VEC_SIZE - 1), %ecx
|
||||
+# endif
|
||||
+L(cross_page_boundary):
|
||||
+ /* Save pointer before aligning as its original value is
|
||||
+ necessary for computer return address if byte is found or
|
||||
+ adjusting length if it is not and this is memchr. */
|
||||
+ movq %rdi, %rcx
|
||||
+ /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi
|
||||
+ for rawmemchr. */
|
||||
+ andq $-VEC_SIZE, %ALGN_PTR_REG
|
||||
+ VPCMP $0, (%ALGN_PTR_REG), %YMMMATCH, %k0
|
||||
+ kmovd %k0, %r8d
|
||||
# ifdef USE_AS_WMEMCHR
|
||||
- /* NB: Divide shift count by 4 since each bit in K1 represent 4
|
||||
+ /* NB: Divide shift count by 4 since each bit in K0 represent 4
|
||||
bytes. */
|
||||
- movl %ecx, %SHIFT_REG
|
||||
- sarl $2, %SHIFT_REG
|
||||
+ sarl $2, %eax
|
||||
+# endif
|
||||
+# ifndef USE_AS_RAWMEMCHR
|
||||
+ movl $(PAGE_SIZE / CHAR_SIZE), %esi
|
||||
+ subl %eax, %esi
|
||||
# endif
|
||||
- andq $-VEC_SIZE, %rdi
|
||||
- VPCMP $0, (%rdi), %YMMMATCH, %k1
|
||||
- kmovd %k1, %eax
|
||||
- /* Remove the leading bytes. */
|
||||
- sarxl %SHIFT_REG, %eax, %eax
|
||||
- testl %eax, %eax
|
||||
- jz L(aligned_more)
|
||||
- tzcntl %eax, %eax
|
||||
# ifdef USE_AS_WMEMCHR
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- sall $2, %eax
|
||||
+ andl $(CHAR_PER_VEC - 1), %eax
|
||||
# endif
|
||||
+ /* Remove the leading bytes. */
|
||||
+ sarxl %eax, %r8d, %eax
|
||||
# ifndef USE_AS_RAWMEMCHR
|
||||
/* Check the end of data. */
|
||||
- cmpq %rax, %rdx
|
||||
- jbe L(zero)
|
||||
+ cmpq %rsi, %rdx
|
||||
+ jbe L(first_vec_x0)
|
||||
+# endif
|
||||
+ testl %eax, %eax
|
||||
+ jz L(cross_page_continue)
|
||||
+ tzcntl %eax, %eax
|
||||
+# ifdef USE_AS_WMEMCHR
|
||||
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
|
||||
+ leaq (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
|
||||
+# else
|
||||
+ addq %RAW_PTR_REG, %rax
|
||||
# endif
|
||||
- addq %rdi, %rax
|
||||
- addq %rcx, %rax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
-L(aligned_more):
|
||||
-# ifndef USE_AS_RAWMEMCHR
|
||||
- /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
|
||||
- instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
|
||||
- overflow. */
|
||||
- negq %rcx
|
||||
- addq $VEC_SIZE, %rcx
|
||||
+L(first_vec_x1):
|
||||
+ tzcntl %eax, %eax
|
||||
+ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
|
||||
+ ret
|
||||
|
||||
- /* Check the end of data. */
|
||||
- subq %rcx, %rdx
|
||||
- jbe L(zero)
|
||||
-# endif
|
||||
+ .p2align 4
|
||||
+L(first_vec_x2):
|
||||
+ tzcntl %eax, %eax
|
||||
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
+ ret
|
||||
|
||||
- addq $VEC_SIZE, %rdi
|
||||
+ .p2align 4
|
||||
+L(first_vec_x3):
|
||||
+ tzcntl %eax, %eax
|
||||
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
+ ret
|
||||
|
||||
-# ifndef USE_AS_RAWMEMCHR
|
||||
- subq $(VEC_SIZE * 4), %rdx
|
||||
- jbe L(last_4x_vec_or_less)
|
||||
-# endif
|
||||
+ .p2align 4
|
||||
+L(first_vec_x4):
|
||||
+ tzcntl %eax, %eax
|
||||
+ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
+ ret
|
||||
|
||||
-L(more_4x_vec):
|
||||
+ .p2align 5
|
||||
+L(aligned_more):
|
||||
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
|
||||
since data is only aligned to VEC_SIZE. */
|
||||
- VPCMP $0, (%rdi), %YMMMATCH, %k1
|
||||
- kmovd %k1, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x0)
|
||||
|
||||
- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
|
||||
- kmovd %k1, %eax
|
||||
+# ifndef USE_AS_RAWMEMCHR
|
||||
+ /* Align data to VEC_SIZE. */
|
||||
+L(cross_page_continue):
|
||||
+ xorl %ecx, %ecx
|
||||
+ subl %edi, %ecx
|
||||
+ andq $-VEC_SIZE, %rdi
|
||||
+ /* esi is for adjusting length to see if near the end. */
|
||||
+ leal (VEC_SIZE * 5)(%rdi, %rcx), %esi
|
||||
+# ifdef USE_AS_WMEMCHR
|
||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
||||
+ sarl $2, %esi
|
||||
+# endif
|
||||
+# else
|
||||
+ andq $-VEC_SIZE, %rdi
|
||||
+L(cross_page_continue):
|
||||
+# endif
|
||||
+ /* Load first VEC regardless. */
|
||||
+ VPCMP $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+# ifndef USE_AS_RAWMEMCHR
|
||||
+ /* Adjust length. If near end handle specially. */
|
||||
+ subq %rsi, %rdx
|
||||
+ jbe L(last_4x_vec_or_less)
|
||||
+# endif
|
||||
testl %eax, %eax
|
||||
jnz L(first_vec_x1)
|
||||
|
||||
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
|
||||
- kmovd %k1, %eax
|
||||
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
|
||||
+ kmovd %k0, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(first_vec_x2)
|
||||
|
||||
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
|
||||
- kmovd %k1, %eax
|
||||
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
|
||||
+ kmovd %k0, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(first_vec_x3)
|
||||
|
||||
- addq $(VEC_SIZE * 4), %rdi
|
||||
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(first_vec_x4)
|
||||
+
|
||||
|
||||
# ifndef USE_AS_RAWMEMCHR
|
||||
- subq $(VEC_SIZE * 4), %rdx
|
||||
- jbe L(last_4x_vec_or_less)
|
||||
-# endif
|
||||
+ /* Check if at last CHAR_PER_VEC * 4 length. */
|
||||
+ subq $(CHAR_PER_VEC * 4), %rdx
|
||||
+ jbe L(last_4x_vec_or_less_cmpeq)
|
||||
+ addq $VEC_SIZE, %rdi
|
||||
|
||||
- /* Align data to 4 * VEC_SIZE. */
|
||||
- movq %rdi, %rcx
|
||||
- andl $(4 * VEC_SIZE - 1), %ecx
|
||||
+ /* Align data to VEC_SIZE * 4 for the loop and readjust length.
|
||||
+ */
|
||||
+# ifdef USE_AS_WMEMCHR
|
||||
+ movl %edi, %ecx
|
||||
andq $-(4 * VEC_SIZE), %rdi
|
||||
-
|
||||
-# ifndef USE_AS_RAWMEMCHR
|
||||
- /* Adjust length. */
|
||||
+ andl $(VEC_SIZE * 4 - 1), %ecx
|
||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
||||
+ sarl $2, %ecx
|
||||
addq %rcx, %rdx
|
||||
+# else
|
||||
+ addq %rdi, %rdx
|
||||
+ andq $-(4 * VEC_SIZE), %rdi
|
||||
+ subq %rdi, %rdx
|
||||
+# endif
|
||||
+# else
|
||||
+ addq $VEC_SIZE, %rdi
|
||||
+ andq $-(4 * VEC_SIZE), %rdi
|
||||
# endif
|
||||
|
||||
+ vpxorq %XMMZERO, %XMMZERO, %XMMZERO
|
||||
+
|
||||
+ /* Compare 4 * VEC at a time forward. */
|
||||
.p2align 4
|
||||
L(loop_4x_vec):
|
||||
- /* Compare 4 * VEC at a time forward. */
|
||||
- VPCMP $0, (%rdi), %YMMMATCH, %k1
|
||||
- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k2
|
||||
- kord %k1, %k2, %k5
|
||||
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
|
||||
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
|
||||
-
|
||||
- kord %k3, %k4, %k6
|
||||
- kortestd %k5, %k6
|
||||
- jnz L(4x_vec_end)
|
||||
-
|
||||
- addq $(VEC_SIZE * 4), %rdi
|
||||
-
|
||||
+ /* It would be possible to save some instructions using 4x VPCMP
|
||||
+ but bottleneck on port 5 makes it not woth it. */
|
||||
+ VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
|
||||
+ /* xor will set bytes match esi to zero. */
|
||||
+ vpxorq (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
|
||||
+ vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
|
||||
+ VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
|
||||
+ /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */
|
||||
+ VPMINU %YMM2, %YMM3, %YMM3 {%k1} {z}
|
||||
+ VPCMP $0, %YMM3, %YMMZERO, %k2
|
||||
# ifdef USE_AS_RAWMEMCHR
|
||||
- jmp L(loop_4x_vec)
|
||||
+ subq $-(VEC_SIZE * 4), %rdi
|
||||
+ kortestd %k2, %k3
|
||||
+ jz L(loop_4x_vec)
|
||||
# else
|
||||
- subq $(VEC_SIZE * 4), %rdx
|
||||
+ kortestd %k2, %k3
|
||||
+ jnz L(loop_4x_vec_end)
|
||||
+
|
||||
+ subq $-(VEC_SIZE * 4), %rdi
|
||||
+
|
||||
+ subq $(CHAR_PER_VEC * 4), %rdx
|
||||
ja L(loop_4x_vec)
|
||||
|
||||
+ /* Fall through into less than 4 remaining vectors of length case.
|
||||
+ */
|
||||
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+ addq $(VEC_SIZE * 3), %rdi
|
||||
+ .p2align 4
|
||||
L(last_4x_vec_or_less):
|
||||
- /* Less than 4 * VEC and aligned to VEC_SIZE. */
|
||||
- addl $(VEC_SIZE * 2), %edx
|
||||
- jle L(last_2x_vec)
|
||||
-
|
||||
- VPCMP $0, (%rdi), %YMMMATCH, %k1
|
||||
- kmovd %k1, %eax
|
||||
+ /* Check if first VEC contained match. */
|
||||
testl %eax, %eax
|
||||
- jnz L(first_vec_x0)
|
||||
+ jnz L(first_vec_x1_check)
|
||||
|
||||
- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
|
||||
- kmovd %k1, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x1)
|
||||
+ /* If remaining length > CHAR_PER_VEC * 2. */
|
||||
+ addl $(CHAR_PER_VEC * 2), %edx
|
||||
+ jg L(last_4x_vec)
|
||||
|
||||
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
|
||||
- kmovd %k1, %eax
|
||||
- testl %eax, %eax
|
||||
+L(last_2x_vec):
|
||||
+ /* If remaining length < CHAR_PER_VEC. */
|
||||
+ addl $CHAR_PER_VEC, %edx
|
||||
+ jle L(zero_end)
|
||||
|
||||
- jnz L(first_vec_x2_check)
|
||||
- subl $VEC_SIZE, %edx
|
||||
- jle L(zero)
|
||||
+ /* Check VEC2 and compare any match with remaining length. */
|
||||
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+ tzcntl %eax, %eax
|
||||
+ cmpl %eax, %edx
|
||||
+ jbe L(set_zero_end)
|
||||
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
+L(zero_end):
|
||||
+ ret
|
||||
|
||||
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
|
||||
- kmovd %k1, %eax
|
||||
- testl %eax, %eax
|
||||
|
||||
- jnz L(first_vec_x3_check)
|
||||
+ .p2align 4
|
||||
+L(first_vec_x1_check):
|
||||
+ tzcntl %eax, %eax
|
||||
+ /* Adjust length. */
|
||||
+ subl $-(CHAR_PER_VEC * 4), %edx
|
||||
+ /* Check if match within remaining length. */
|
||||
+ cmpl %eax, %edx
|
||||
+ jbe L(set_zero_end)
|
||||
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
|
||||
+ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
|
||||
+ ret
|
||||
+L(set_zero_end):
|
||||
xorl %eax, %eax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
-L(last_2x_vec):
|
||||
- addl $(VEC_SIZE * 2), %edx
|
||||
- VPCMP $0, (%rdi), %YMMMATCH, %k1
|
||||
+L(loop_4x_vec_end):
|
||||
+# endif
|
||||
+ /* rawmemchr will fall through into this if match was found in
|
||||
+ loop. */
|
||||
+
|
||||
+ /* k1 has not of matches with VEC1. */
|
||||
kmovd %k1, %eax
|
||||
- testl %eax, %eax
|
||||
+# ifdef USE_AS_WMEMCHR
|
||||
+ subl $((1 << CHAR_PER_VEC) - 1), %eax
|
||||
+# else
|
||||
+ incl %eax
|
||||
+# endif
|
||||
+ jnz L(last_vec_x1_return)
|
||||
|
||||
- jnz L(first_vec_x0_check)
|
||||
- subl $VEC_SIZE, %edx
|
||||
- jle L(zero)
|
||||
+ VPCMP $0, %YMM2, %YMMZERO, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(last_vec_x2_return)
|
||||
|
||||
- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
|
||||
- kmovd %k1, %eax
|
||||
+ kmovd %k2, %eax
|
||||
testl %eax, %eax
|
||||
- jnz L(first_vec_x1_check)
|
||||
- xorl %eax, %eax
|
||||
- ret
|
||||
+ jnz L(last_vec_x3_return)
|
||||
|
||||
- .p2align 4
|
||||
-L(first_vec_x0_check):
|
||||
+ kmovd %k3, %eax
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WMEMCHR
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- sall $2, %eax
|
||||
+# ifdef USE_AS_RAWMEMCHR
|
||||
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
+# else
|
||||
+ leaq (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
# endif
|
||||
- /* Check the end of data. */
|
||||
- cmpq %rax, %rdx
|
||||
- jbe L(zero)
|
||||
- addq %rdi, %rax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
-L(first_vec_x1_check):
|
||||
+L(last_vec_x1_return):
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WMEMCHR
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- sall $2, %eax
|
||||
-# endif
|
||||
- /* Check the end of data. */
|
||||
- cmpq %rax, %rdx
|
||||
- jbe L(zero)
|
||||
- addq $VEC_SIZE, %rax
|
||||
+# ifdef USE_AS_RAWMEMCHR
|
||||
+# ifdef USE_AS_WMEMCHR
|
||||
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
|
||||
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
|
||||
+# else
|
||||
addq %rdi, %rax
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(first_vec_x2_check):
|
||||
- tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WMEMCHR
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- sall $2, %eax
|
||||
+# endif
|
||||
+# else
|
||||
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
|
||||
+ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
# endif
|
||||
- /* Check the end of data. */
|
||||
- cmpq %rax, %rdx
|
||||
- jbe L(zero)
|
||||
- addq $(VEC_SIZE * 2), %rax
|
||||
- addq %rdi, %rax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
-L(first_vec_x3_check):
|
||||
+L(last_vec_x2_return):
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WMEMCHR
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- sall $2, %eax
|
||||
+# ifdef USE_AS_RAWMEMCHR
|
||||
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
|
||||
+ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
|
||||
+# else
|
||||
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
|
||||
+ leaq (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
# endif
|
||||
- /* Check the end of data. */
|
||||
- cmpq %rax, %rdx
|
||||
- jbe L(zero)
|
||||
- addq $(VEC_SIZE * 3), %rax
|
||||
- addq %rdi, %rax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
-L(zero):
|
||||
- xorl %eax, %eax
|
||||
- ret
|
||||
-# endif
|
||||
-
|
||||
- .p2align 4
|
||||
-L(first_vec_x0):
|
||||
+L(last_vec_x3_return):
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WMEMCHR
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- leaq (%rdi, %rax, 4), %rax
|
||||
+# ifdef USE_AS_RAWMEMCHR
|
||||
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
|
||||
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
# else
|
||||
- addq %rdi, %rax
|
||||
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
|
||||
+ leaq (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
# endif
|
||||
ret
|
||||
|
||||
+
|
||||
+# ifndef USE_AS_RAWMEMCHR
|
||||
+L(last_4x_vec_or_less_cmpeq):
|
||||
+ VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+ subq $-(VEC_SIZE * 4), %rdi
|
||||
+ /* Check first VEC regardless. */
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(first_vec_x1_check)
|
||||
+
|
||||
+ /* If remaining length <= CHAR_PER_VEC * 2. */
|
||||
+ addl $(CHAR_PER_VEC * 2), %edx
|
||||
+ jle L(last_2x_vec)
|
||||
+
|
||||
.p2align 4
|
||||
-L(first_vec_x1):
|
||||
+L(last_4x_vec):
|
||||
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(last_vec_x2)
|
||||
+
|
||||
+
|
||||
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+ /* Create mask for possible matches within remaining length. */
|
||||
+# ifdef USE_AS_WMEMCHR
|
||||
+ movl $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
|
||||
+ bzhil %edx, %ecx, %ecx
|
||||
+# else
|
||||
+ movq $-1, %rcx
|
||||
+ bzhiq %rdx, %rcx, %rcx
|
||||
+# endif
|
||||
+ /* Test matches in data against length match. */
|
||||
+ andl %ecx, %eax
|
||||
+ jnz L(last_vec_x3)
|
||||
+
|
||||
+ /* if remaining length <= CHAR_PER_VEC * 3 (Note this is after
|
||||
+ remaining length was found to be > CHAR_PER_VEC * 2. */
|
||||
+ subl $CHAR_PER_VEC, %edx
|
||||
+ jbe L(zero_end2)
|
||||
+
|
||||
+
|
||||
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+ /* Shift remaining length mask for last VEC. */
|
||||
+# ifdef USE_AS_WMEMCHR
|
||||
+ shrl $CHAR_PER_VEC, %ecx
|
||||
+# else
|
||||
+ shrq $CHAR_PER_VEC, %rcx
|
||||
+# endif
|
||||
+ andl %ecx, %eax
|
||||
+ jz L(zero_end2)
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WMEMCHR
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- leaq VEC_SIZE(%rdi, %rax, 4), %rax
|
||||
-# else
|
||||
- addq $VEC_SIZE, %rax
|
||||
- addq %rdi, %rax
|
||||
-# endif
|
||||
+ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
+L(zero_end2):
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
-L(first_vec_x2):
|
||||
+L(last_vec_x2):
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WMEMCHR
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax
|
||||
-# else
|
||||
- addq $(VEC_SIZE * 2), %rax
|
||||
- addq %rdi, %rax
|
||||
-# endif
|
||||
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
-L(4x_vec_end):
|
||||
- kmovd %k1, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x0)
|
||||
- kmovd %k2, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x1)
|
||||
- kmovd %k3, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x2)
|
||||
- kmovd %k4, %eax
|
||||
- testl %eax, %eax
|
||||
-L(first_vec_x3):
|
||||
+L(last_vec_x3):
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WMEMCHR
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax
|
||||
-# else
|
||||
- addq $(VEC_SIZE * 3), %rax
|
||||
- addq %rdi, %rax
|
||||
-# endif
|
||||
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
ret
|
||||
+# endif
|
||||
|
||||
END (MEMCHR)
|
||||
#endif
|
||||
--
|
||||
GitLab
|
||||
|
|
@ -1,30 +0,0 @@
|
|||
From 6ea916adfa0ab9af6e7dc6adcf6f977dfe017835 Mon Sep 17 00:00:00 2001
|
||||
From: Alice Xu <alice.d.xu@gmail.com>
|
||||
Date: Fri, 7 May 2021 19:03:21 -0700
|
||||
Subject: [PATCH] x86-64: Fix an unknown vector operation in memchr-evex.S
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
An unknown vector operation occurred in commit 2a76821c308. Fixed it
|
||||
by using "ymm{k1}{z}" but not "ymm {k1} {z}".
|
||||
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/memchr-evex.S | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
|
||||
index 81d5cd64..f3fdad4f 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
|
||||
@@ -271,7 +271,7 @@ L(loop_4x_vec):
|
||||
vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
|
||||
VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
|
||||
/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */
|
||||
- VPMINU %YMM2, %YMM3, %YMM3 {%k1} {z}
|
||||
+ VPMINU %YMM2, %YMM3, %YMM3{%k1}{z}
|
||||
VPCMP $0, %YMM3, %YMMZERO, %k2
|
||||
# ifdef USE_AS_RAWMEMCHR
|
||||
subq $-(VEC_SIZE * 4), %rdi
|
||||
--
|
||||
GitLab
|
||||
|
|
@ -1,566 +0,0 @@
|
|||
From a0db678071c60b6c47c468d231dd0b3694ba7a98 Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Tue, 22 Jun 2021 20:42:10 -0700
|
||||
Subject: [PATCH] x86-64: Move strlen.S to multiarch/strlen-vec.S
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Since strlen.S contains SSE2 version of strlen/strnlen and SSE4.1
|
||||
version of wcslen/wcsnlen, move strlen.S to multiarch/strlen-vec.S
|
||||
and include multiarch/strlen-vec.S from SSE2 and SSE4.1 variants.
|
||||
This also removes the unused symbols, __GI___strlen_sse2 and
|
||||
__GI___wcsnlen_sse4_1.
|
||||
---
|
||||
sysdeps/x86_64/multiarch/strlen-sse2.S | 2 +-
|
||||
sysdeps/x86_64/multiarch/strlen-vec.S | 257 ++++++++++++++++++++++
|
||||
sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S | 2 +-
|
||||
sysdeps/x86_64/strlen.S | 243 +-------------------
|
||||
4 files changed, 262 insertions(+), 242 deletions(-)
|
||||
create mode 100644 sysdeps/x86_64/multiarch/strlen-vec.S
|
||||
|
||||
Conflicts:
|
||||
sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
|
||||
(Copyright dates, URL)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/strlen-sse2.S b/sysdeps/x86_64/multiarch/strlen-sse2.S
|
||||
index 7bc57b8d..449c8a7f 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strlen-sse2.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strlen-sse2.S
|
||||
@@ -20,4 +20,4 @@
|
||||
# define strlen __strlen_sse2
|
||||
#endif
|
||||
|
||||
-#include "../strlen.S"
|
||||
+#include "strlen-vec.S"
|
||||
diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
|
||||
new file mode 100644
|
||||
index 00000000..8f660bb9
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/multiarch/strlen-vec.S
|
||||
@@ -0,0 +1,257 @@
|
||||
+/* SSE2 version of strlen and SSE4.1 version of wcslen.
|
||||
+ Copyright (C) 2012-2021 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdep.h>
|
||||
+
|
||||
+#ifdef AS_WCSLEN
|
||||
+# define PMINU pminud
|
||||
+# define PCMPEQ pcmpeqd
|
||||
+# define SHIFT_RETURN shrq $2, %rax
|
||||
+#else
|
||||
+# define PMINU pminub
|
||||
+# define PCMPEQ pcmpeqb
|
||||
+# define SHIFT_RETURN
|
||||
+#endif
|
||||
+
|
||||
+/* Long lived register in strlen(s), strnlen(s, n) are:
|
||||
+
|
||||
+ %xmm3 - zero
|
||||
+ %rdi - s
|
||||
+ %r10 (s+n) & (~(64-1))
|
||||
+ %r11 s+n
|
||||
+*/
|
||||
+
|
||||
+
|
||||
+.text
|
||||
+ENTRY(strlen)
|
||||
+
|
||||
+/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */
|
||||
+#define FIND_ZERO \
|
||||
+ PCMPEQ (%rax), %xmm0; \
|
||||
+ PCMPEQ 16(%rax), %xmm1; \
|
||||
+ PCMPEQ 32(%rax), %xmm2; \
|
||||
+ PCMPEQ 48(%rax), %xmm3; \
|
||||
+ pmovmskb %xmm0, %esi; \
|
||||
+ pmovmskb %xmm1, %edx; \
|
||||
+ pmovmskb %xmm2, %r8d; \
|
||||
+ pmovmskb %xmm3, %ecx; \
|
||||
+ salq $16, %rdx; \
|
||||
+ salq $16, %rcx; \
|
||||
+ orq %rsi, %rdx; \
|
||||
+ orq %r8, %rcx; \
|
||||
+ salq $32, %rcx; \
|
||||
+ orq %rcx, %rdx;
|
||||
+
|
||||
+#ifdef AS_STRNLEN
|
||||
+/* Do not read anything when n==0. */
|
||||
+ test %RSI_LP, %RSI_LP
|
||||
+ jne L(n_nonzero)
|
||||
+ xor %rax, %rax
|
||||
+ ret
|
||||
+L(n_nonzero):
|
||||
+# ifdef AS_WCSLEN
|
||||
+ shl $2, %RSI_LP
|
||||
+# endif
|
||||
+
|
||||
+/* Initialize long lived registers. */
|
||||
+
|
||||
+ add %RDI_LP, %RSI_LP
|
||||
+ mov %RSI_LP, %R10_LP
|
||||
+ and $-64, %R10_LP
|
||||
+ mov %RSI_LP, %R11_LP
|
||||
+#endif
|
||||
+
|
||||
+ pxor %xmm0, %xmm0
|
||||
+ pxor %xmm1, %xmm1
|
||||
+ pxor %xmm2, %xmm2
|
||||
+ pxor %xmm3, %xmm3
|
||||
+ movq %rdi, %rax
|
||||
+ movq %rdi, %rcx
|
||||
+ andq $4095, %rcx
|
||||
+/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */
|
||||
+ cmpq $4047, %rcx
|
||||
+/* We cannot unify this branching as it would be ~6 cycles slower. */
|
||||
+ ja L(cross_page)
|
||||
+
|
||||
+#ifdef AS_STRNLEN
|
||||
+/* Test if end is among first 64 bytes. */
|
||||
+# define STRNLEN_PROLOG \
|
||||
+ mov %r11, %rsi; \
|
||||
+ subq %rax, %rsi; \
|
||||
+ andq $-64, %rax; \
|
||||
+ testq $-64, %rsi; \
|
||||
+ je L(strnlen_ret)
|
||||
+#else
|
||||
+# define STRNLEN_PROLOG andq $-64, %rax;
|
||||
+#endif
|
||||
+
|
||||
+/* Ignore bits in mask that come before start of string. */
|
||||
+#define PROLOG(lab) \
|
||||
+ movq %rdi, %rcx; \
|
||||
+ xorq %rax, %rcx; \
|
||||
+ STRNLEN_PROLOG; \
|
||||
+ sarq %cl, %rdx; \
|
||||
+ test %rdx, %rdx; \
|
||||
+ je L(lab); \
|
||||
+ bsfq %rdx, %rax; \
|
||||
+ SHIFT_RETURN; \
|
||||
+ ret
|
||||
+
|
||||
+#ifdef AS_STRNLEN
|
||||
+ andq $-16, %rax
|
||||
+ FIND_ZERO
|
||||
+#else
|
||||
+ /* Test first 16 bytes unaligned. */
|
||||
+ movdqu (%rax), %xmm4
|
||||
+ PCMPEQ %xmm0, %xmm4
|
||||
+ pmovmskb %xmm4, %edx
|
||||
+ test %edx, %edx
|
||||
+ je L(next48_bytes)
|
||||
+ bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */
|
||||
+ SHIFT_RETURN
|
||||
+ ret
|
||||
+
|
||||
+L(next48_bytes):
|
||||
+/* Same as FIND_ZERO except we do not check first 16 bytes. */
|
||||
+ andq $-16, %rax
|
||||
+ PCMPEQ 16(%rax), %xmm1
|
||||
+ PCMPEQ 32(%rax), %xmm2
|
||||
+ PCMPEQ 48(%rax), %xmm3
|
||||
+ pmovmskb %xmm1, %edx
|
||||
+ pmovmskb %xmm2, %r8d
|
||||
+ pmovmskb %xmm3, %ecx
|
||||
+ salq $16, %rdx
|
||||
+ salq $16, %rcx
|
||||
+ orq %r8, %rcx
|
||||
+ salq $32, %rcx
|
||||
+ orq %rcx, %rdx
|
||||
+#endif
|
||||
+
|
||||
+ /* When no zero byte is found xmm1-3 are zero so we do not have to
|
||||
+ zero them. */
|
||||
+ PROLOG(loop)
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(cross_page):
|
||||
+ andq $-64, %rax
|
||||
+ FIND_ZERO
|
||||
+ PROLOG(loop_init)
|
||||
+
|
||||
+#ifdef AS_STRNLEN
|
||||
+/* We must do this check to correctly handle strnlen (s, -1). */
|
||||
+L(strnlen_ret):
|
||||
+ bts %rsi, %rdx
|
||||
+ sarq %cl, %rdx
|
||||
+ test %rdx, %rdx
|
||||
+ je L(loop_init)
|
||||
+ bsfq %rdx, %rax
|
||||
+ SHIFT_RETURN
|
||||
+ ret
|
||||
+#endif
|
||||
+ .p2align 4
|
||||
+L(loop_init):
|
||||
+ pxor %xmm1, %xmm1
|
||||
+ pxor %xmm2, %xmm2
|
||||
+ pxor %xmm3, %xmm3
|
||||
+#ifdef AS_STRNLEN
|
||||
+ .p2align 4
|
||||
+L(loop):
|
||||
+
|
||||
+ addq $64, %rax
|
||||
+ cmpq %rax, %r10
|
||||
+ je L(exit_end)
|
||||
+
|
||||
+ movdqa (%rax), %xmm0
|
||||
+ PMINU 16(%rax), %xmm0
|
||||
+ PMINU 32(%rax), %xmm0
|
||||
+ PMINU 48(%rax), %xmm0
|
||||
+ PCMPEQ %xmm3, %xmm0
|
||||
+ pmovmskb %xmm0, %edx
|
||||
+ testl %edx, %edx
|
||||
+ jne L(exit)
|
||||
+ jmp L(loop)
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(exit_end):
|
||||
+ cmp %rax, %r11
|
||||
+ je L(first) /* Do not read when end is at page boundary. */
|
||||
+ pxor %xmm0, %xmm0
|
||||
+ FIND_ZERO
|
||||
+
|
||||
+L(first):
|
||||
+ bts %r11, %rdx
|
||||
+ bsfq %rdx, %rdx
|
||||
+ addq %rdx, %rax
|
||||
+ subq %rdi, %rax
|
||||
+ SHIFT_RETURN
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(exit):
|
||||
+ pxor %xmm0, %xmm0
|
||||
+ FIND_ZERO
|
||||
+
|
||||
+ bsfq %rdx, %rdx
|
||||
+ addq %rdx, %rax
|
||||
+ subq %rdi, %rax
|
||||
+ SHIFT_RETURN
|
||||
+ ret
|
||||
+
|
||||
+#else
|
||||
+
|
||||
+ /* Main loop. Unrolled twice to improve L2 cache performance on core2. */
|
||||
+ .p2align 4
|
||||
+L(loop):
|
||||
+
|
||||
+ movdqa 64(%rax), %xmm0
|
||||
+ PMINU 80(%rax), %xmm0
|
||||
+ PMINU 96(%rax), %xmm0
|
||||
+ PMINU 112(%rax), %xmm0
|
||||
+ PCMPEQ %xmm3, %xmm0
|
||||
+ pmovmskb %xmm0, %edx
|
||||
+ testl %edx, %edx
|
||||
+ jne L(exit64)
|
||||
+
|
||||
+ subq $-128, %rax
|
||||
+
|
||||
+ movdqa (%rax), %xmm0
|
||||
+ PMINU 16(%rax), %xmm0
|
||||
+ PMINU 32(%rax), %xmm0
|
||||
+ PMINU 48(%rax), %xmm0
|
||||
+ PCMPEQ %xmm3, %xmm0
|
||||
+ pmovmskb %xmm0, %edx
|
||||
+ testl %edx, %edx
|
||||
+ jne L(exit0)
|
||||
+ jmp L(loop)
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(exit64):
|
||||
+ addq $64, %rax
|
||||
+L(exit0):
|
||||
+ pxor %xmm0, %xmm0
|
||||
+ FIND_ZERO
|
||||
+
|
||||
+ bsfq %rdx, %rdx
|
||||
+ addq %rdx, %rax
|
||||
+ subq %rdi, %rax
|
||||
+ SHIFT_RETURN
|
||||
+ ret
|
||||
+
|
||||
+#endif
|
||||
+
|
||||
+END(strlen)
|
||||
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
|
||||
index a8cab0cb..5fa51fe0 100644
|
||||
--- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
|
||||
+++ b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
|
||||
@@ -2,4 +2,4 @@
|
||||
#define AS_STRNLEN
|
||||
#define strlen __wcsnlen_sse4_1
|
||||
|
||||
-#include "../strlen.S"
|
||||
+#include "strlen-vec.S"
|
||||
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
|
||||
index f845f3d4..ad047d84 100644
|
||||
--- a/sysdeps/x86_64/strlen.S
|
||||
+++ b/sysdeps/x86_64/strlen.S
|
||||
@@ -1,5 +1,5 @@
|
||||
-/* SSE2 version of strlen/wcslen.
|
||||
- Copyright (C) 2012-2018 Free Software Foundation, Inc.
|
||||
+/* SSE2 version of strlen.
|
||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
@@ -16,243 +16,6 @@
|
||||
License along with the GNU C Library; if not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
-#include <sysdep.h>
|
||||
+#include "multiarch/strlen-vec.S"
|
||||
|
||||
-#ifdef AS_WCSLEN
|
||||
-# define PMINU pminud
|
||||
-# define PCMPEQ pcmpeqd
|
||||
-# define SHIFT_RETURN shrq $2, %rax
|
||||
-#else
|
||||
-# define PMINU pminub
|
||||
-# define PCMPEQ pcmpeqb
|
||||
-# define SHIFT_RETURN
|
||||
-#endif
|
||||
-
|
||||
-/* Long lived register in strlen(s), strnlen(s, n) are:
|
||||
-
|
||||
- %xmm3 - zero
|
||||
- %rdi - s
|
||||
- %r10 (s+n) & (~(64-1))
|
||||
- %r11 s+n
|
||||
-*/
|
||||
-
|
||||
-
|
||||
-.text
|
||||
-ENTRY(strlen)
|
||||
-
|
||||
-/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */
|
||||
-#define FIND_ZERO \
|
||||
- PCMPEQ (%rax), %xmm0; \
|
||||
- PCMPEQ 16(%rax), %xmm1; \
|
||||
- PCMPEQ 32(%rax), %xmm2; \
|
||||
- PCMPEQ 48(%rax), %xmm3; \
|
||||
- pmovmskb %xmm0, %esi; \
|
||||
- pmovmskb %xmm1, %edx; \
|
||||
- pmovmskb %xmm2, %r8d; \
|
||||
- pmovmskb %xmm3, %ecx; \
|
||||
- salq $16, %rdx; \
|
||||
- salq $16, %rcx; \
|
||||
- orq %rsi, %rdx; \
|
||||
- orq %r8, %rcx; \
|
||||
- salq $32, %rcx; \
|
||||
- orq %rcx, %rdx;
|
||||
-
|
||||
-#ifdef AS_STRNLEN
|
||||
-/* Do not read anything when n==0. */
|
||||
- test %RSI_LP, %RSI_LP
|
||||
- jne L(n_nonzero)
|
||||
- xor %rax, %rax
|
||||
- ret
|
||||
-L(n_nonzero):
|
||||
-# ifdef AS_WCSLEN
|
||||
- shl $2, %RSI_LP
|
||||
-# endif
|
||||
-
|
||||
-/* Initialize long lived registers. */
|
||||
-
|
||||
- add %RDI_LP, %RSI_LP
|
||||
- mov %RSI_LP, %R10_LP
|
||||
- and $-64, %R10_LP
|
||||
- mov %RSI_LP, %R11_LP
|
||||
-#endif
|
||||
-
|
||||
- pxor %xmm0, %xmm0
|
||||
- pxor %xmm1, %xmm1
|
||||
- pxor %xmm2, %xmm2
|
||||
- pxor %xmm3, %xmm3
|
||||
- movq %rdi, %rax
|
||||
- movq %rdi, %rcx
|
||||
- andq $4095, %rcx
|
||||
-/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */
|
||||
- cmpq $4047, %rcx
|
||||
-/* We cannot unify this branching as it would be ~6 cycles slower. */
|
||||
- ja L(cross_page)
|
||||
-
|
||||
-#ifdef AS_STRNLEN
|
||||
-/* Test if end is among first 64 bytes. */
|
||||
-# define STRNLEN_PROLOG \
|
||||
- mov %r11, %rsi; \
|
||||
- subq %rax, %rsi; \
|
||||
- andq $-64, %rax; \
|
||||
- testq $-64, %rsi; \
|
||||
- je L(strnlen_ret)
|
||||
-#else
|
||||
-# define STRNLEN_PROLOG andq $-64, %rax;
|
||||
-#endif
|
||||
-
|
||||
-/* Ignore bits in mask that come before start of string. */
|
||||
-#define PROLOG(lab) \
|
||||
- movq %rdi, %rcx; \
|
||||
- xorq %rax, %rcx; \
|
||||
- STRNLEN_PROLOG; \
|
||||
- sarq %cl, %rdx; \
|
||||
- test %rdx, %rdx; \
|
||||
- je L(lab); \
|
||||
- bsfq %rdx, %rax; \
|
||||
- SHIFT_RETURN; \
|
||||
- ret
|
||||
-
|
||||
-#ifdef AS_STRNLEN
|
||||
- andq $-16, %rax
|
||||
- FIND_ZERO
|
||||
-#else
|
||||
- /* Test first 16 bytes unaligned. */
|
||||
- movdqu (%rax), %xmm4
|
||||
- PCMPEQ %xmm0, %xmm4
|
||||
- pmovmskb %xmm4, %edx
|
||||
- test %edx, %edx
|
||||
- je L(next48_bytes)
|
||||
- bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */
|
||||
- SHIFT_RETURN
|
||||
- ret
|
||||
-
|
||||
-L(next48_bytes):
|
||||
-/* Same as FIND_ZERO except we do not check first 16 bytes. */
|
||||
- andq $-16, %rax
|
||||
- PCMPEQ 16(%rax), %xmm1
|
||||
- PCMPEQ 32(%rax), %xmm2
|
||||
- PCMPEQ 48(%rax), %xmm3
|
||||
- pmovmskb %xmm1, %edx
|
||||
- pmovmskb %xmm2, %r8d
|
||||
- pmovmskb %xmm3, %ecx
|
||||
- salq $16, %rdx
|
||||
- salq $16, %rcx
|
||||
- orq %r8, %rcx
|
||||
- salq $32, %rcx
|
||||
- orq %rcx, %rdx
|
||||
-#endif
|
||||
-
|
||||
- /* When no zero byte is found xmm1-3 are zero so we do not have to
|
||||
- zero them. */
|
||||
- PROLOG(loop)
|
||||
-
|
||||
- .p2align 4
|
||||
-L(cross_page):
|
||||
- andq $-64, %rax
|
||||
- FIND_ZERO
|
||||
- PROLOG(loop_init)
|
||||
-
|
||||
-#ifdef AS_STRNLEN
|
||||
-/* We must do this check to correctly handle strnlen (s, -1). */
|
||||
-L(strnlen_ret):
|
||||
- bts %rsi, %rdx
|
||||
- sarq %cl, %rdx
|
||||
- test %rdx, %rdx
|
||||
- je L(loop_init)
|
||||
- bsfq %rdx, %rax
|
||||
- SHIFT_RETURN
|
||||
- ret
|
||||
-#endif
|
||||
- .p2align 4
|
||||
-L(loop_init):
|
||||
- pxor %xmm1, %xmm1
|
||||
- pxor %xmm2, %xmm2
|
||||
- pxor %xmm3, %xmm3
|
||||
-#ifdef AS_STRNLEN
|
||||
- .p2align 4
|
||||
-L(loop):
|
||||
-
|
||||
- addq $64, %rax
|
||||
- cmpq %rax, %r10
|
||||
- je L(exit_end)
|
||||
-
|
||||
- movdqa (%rax), %xmm0
|
||||
- PMINU 16(%rax), %xmm0
|
||||
- PMINU 32(%rax), %xmm0
|
||||
- PMINU 48(%rax), %xmm0
|
||||
- PCMPEQ %xmm3, %xmm0
|
||||
- pmovmskb %xmm0, %edx
|
||||
- testl %edx, %edx
|
||||
- jne L(exit)
|
||||
- jmp L(loop)
|
||||
-
|
||||
- .p2align 4
|
||||
-L(exit_end):
|
||||
- cmp %rax, %r11
|
||||
- je L(first) /* Do not read when end is at page boundary. */
|
||||
- pxor %xmm0, %xmm0
|
||||
- FIND_ZERO
|
||||
-
|
||||
-L(first):
|
||||
- bts %r11, %rdx
|
||||
- bsfq %rdx, %rdx
|
||||
- addq %rdx, %rax
|
||||
- subq %rdi, %rax
|
||||
- SHIFT_RETURN
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(exit):
|
||||
- pxor %xmm0, %xmm0
|
||||
- FIND_ZERO
|
||||
-
|
||||
- bsfq %rdx, %rdx
|
||||
- addq %rdx, %rax
|
||||
- subq %rdi, %rax
|
||||
- SHIFT_RETURN
|
||||
- ret
|
||||
-
|
||||
-#else
|
||||
-
|
||||
- /* Main loop. Unrolled twice to improve L2 cache performance on core2. */
|
||||
- .p2align 4
|
||||
-L(loop):
|
||||
-
|
||||
- movdqa 64(%rax), %xmm0
|
||||
- PMINU 80(%rax), %xmm0
|
||||
- PMINU 96(%rax), %xmm0
|
||||
- PMINU 112(%rax), %xmm0
|
||||
- PCMPEQ %xmm3, %xmm0
|
||||
- pmovmskb %xmm0, %edx
|
||||
- testl %edx, %edx
|
||||
- jne L(exit64)
|
||||
-
|
||||
- subq $-128, %rax
|
||||
-
|
||||
- movdqa (%rax), %xmm0
|
||||
- PMINU 16(%rax), %xmm0
|
||||
- PMINU 32(%rax), %xmm0
|
||||
- PMINU 48(%rax), %xmm0
|
||||
- PCMPEQ %xmm3, %xmm0
|
||||
- pmovmskb %xmm0, %edx
|
||||
- testl %edx, %edx
|
||||
- jne L(exit0)
|
||||
- jmp L(loop)
|
||||
-
|
||||
- .p2align 4
|
||||
-L(exit64):
|
||||
- addq $64, %rax
|
||||
-L(exit0):
|
||||
- pxor %xmm0, %xmm0
|
||||
- FIND_ZERO
|
||||
-
|
||||
- bsfq %rdx, %rdx
|
||||
- addq %rdx, %rax
|
||||
- subq %rdi, %rax
|
||||
- SHIFT_RETURN
|
||||
- ret
|
||||
-
|
||||
-#endif
|
||||
-
|
||||
-END(strlen)
|
||||
libc_hidden_builtin_def (strlen)
|
||||
--
|
||||
GitLab
|
||||
|
|
@ -1,181 +0,0 @@
|
|||
From 6f573a27b6c8b4236445810a44660612323f5a73 Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Wed, 23 Jun 2021 01:19:34 -0400
|
||||
Subject: [PATCH] x86-64: Add wcslen optimize for sse4.1
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
No bug. This comment adds the ifunc / build infrastructure
|
||||
necessary for wcslen to prefer the sse4.1 implementation
|
||||
in strlen-vec.S. test-wcslen.c is passing.
|
||||
|
||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/Makefile | 4 +-
|
||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 3 ++
|
||||
sysdeps/x86_64/multiarch/ifunc-wcslen.h | 52 ++++++++++++++++++++++
|
||||
sysdeps/x86_64/multiarch/wcslen-sse4_1.S | 4 ++
|
||||
sysdeps/x86_64/multiarch/wcslen.c | 2 +-
|
||||
sysdeps/x86_64/multiarch/wcsnlen.c | 34 +-------------
|
||||
6 files changed, 63 insertions(+), 36 deletions(-)
|
||||
create mode 100644 sysdeps/x86_64/multiarch/ifunc-wcslen.h
|
||||
create mode 100644 sysdeps/x86_64/multiarch/wcslen-sse4_1.S
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
|
||||
index 491c7698..65fde4eb 100644
|
||||
--- a/sysdeps/x86_64/multiarch/Makefile
|
||||
+++ b/sysdeps/x86_64/multiarch/Makefile
|
||||
@@ -93,8 +93,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
|
||||
wcscpy-ssse3 wcscpy-c \
|
||||
wcschr-sse2 wcschr-avx2 \
|
||||
wcsrchr-sse2 wcsrchr-avx2 \
|
||||
- wcsnlen-sse4_1 wcsnlen-c \
|
||||
- wcslen-sse2 wcslen-avx2 wcsnlen-avx2 \
|
||||
+ wcslen-sse2 wcslen-sse4_1 wcslen-avx2 \
|
||||
+ wcsnlen-c wcsnlen-sse4_1 wcsnlen-avx2 \
|
||||
wcschr-avx2-rtm \
|
||||
wcscmp-avx2-rtm \
|
||||
wcslen-avx2-rtm \
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
index f1a6460a..580913ca 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
@@ -657,6 +657,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
&& CPU_FEATURE_USABLE (AVX512BW)
|
||||
&& CPU_FEATURE_USABLE (BMI2)),
|
||||
__wcslen_evex)
|
||||
+ IFUNC_IMPL_ADD (array, i, wcsnlen,
|
||||
+ CPU_FEATURE_USABLE (SSE4_1),
|
||||
+ __wcsnlen_sse4_1)
|
||||
IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2))
|
||||
|
||||
/* Support sysdeps/x86_64/multiarch/wcsnlen.c. */
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-wcslen.h b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
|
||||
new file mode 100644
|
||||
index 00000000..39e33473
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
|
||||
@@ -0,0 +1,52 @@
|
||||
+/* Common definition for ifunc selections for wcslen and wcsnlen
|
||||
+ All versions must be listed in ifunc-impl-list.c.
|
||||
+ Copyright (C) 2017-2021 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <init-arch.h>
|
||||
+
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
|
||||
+
|
||||
+static inline void *
|
||||
+IFUNC_SELECTOR (void)
|
||||
+{
|
||||
+ const struct cpu_features* cpu_features = __get_cpu_features ();
|
||||
+
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
||||
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
|
||||
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
||||
+ {
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
|
||||
+ return OPTIMIZE (evex);
|
||||
+
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
|
||||
+ return OPTIMIZE (avx2_rtm);
|
||||
+
|
||||
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
||||
+ return OPTIMIZE (avx2);
|
||||
+ }
|
||||
+
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
|
||||
+ return OPTIMIZE (sse4_1);
|
||||
+
|
||||
+ return OPTIMIZE (sse2);
|
||||
+}
|
||||
diff --git a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
|
||||
new file mode 100644
|
||||
index 00000000..7e62621a
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
|
||||
@@ -0,0 +1,4 @@
|
||||
+#define AS_WCSLEN
|
||||
+#define strlen __wcslen_sse4_1
|
||||
+
|
||||
+#include "strlen-vec.S"
|
||||
diff --git a/sysdeps/x86_64/multiarch/wcslen.c b/sysdeps/x86_64/multiarch/wcslen.c
|
||||
index 6d06e47c..3b04b75b 100644
|
||||
--- a/sysdeps/x86_64/multiarch/wcslen.c
|
||||
+++ b/sysdeps/x86_64/multiarch/wcslen.c
|
||||
@@ -24,7 +24,7 @@
|
||||
# undef __wcslen
|
||||
|
||||
# define SYMBOL_NAME wcslen
|
||||
-# include "ifunc-avx2.h"
|
||||
+# include "ifunc-wcslen.h"
|
||||
|
||||
libc_ifunc_redirected (__redirect_wcslen, __wcslen, IFUNC_SELECTOR ());
|
||||
weak_alias (__wcslen, wcslen);
|
||||
diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c
|
||||
index 20b731ae..06736410 100644
|
||||
--- a/sysdeps/x86_64/multiarch/wcsnlen.c
|
||||
+++ b/sysdeps/x86_64/multiarch/wcsnlen.c
|
||||
@@ -24,39 +24,7 @@
|
||||
# undef __wcsnlen
|
||||
|
||||
# define SYMBOL_NAME wcsnlen
|
||||
-# include <init-arch.h>
|
||||
-
|
||||
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
|
||||
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
|
||||
-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
|
||||
-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
|
||||
-extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
|
||||
-
|
||||
-static inline void *
|
||||
-IFUNC_SELECTOR (void)
|
||||
-{
|
||||
- const struct cpu_features* cpu_features = __get_cpu_features ();
|
||||
-
|
||||
- if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
||||
- && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
||||
- {
|
||||
- if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
||||
- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
|
||||
- && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
|
||||
- return OPTIMIZE (evex);
|
||||
-
|
||||
- if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
|
||||
- return OPTIMIZE (avx2_rtm);
|
||||
-
|
||||
- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
||||
- return OPTIMIZE (avx2);
|
||||
- }
|
||||
-
|
||||
- if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
|
||||
- return OPTIMIZE (sse4_1);
|
||||
-
|
||||
- return OPTIMIZE (sse2);
|
||||
-}
|
||||
+# include "ifunc-wcslen.h"
|
||||
|
||||
libc_ifunc_redirected (__redirect_wcsnlen, __wcsnlen, IFUNC_SELECTOR ());
|
||||
weak_alias (__wcsnlen, wcsnlen);
|
||||
--
|
||||
GitLab
|
||||
|
|
@ -1,396 +0,0 @@
|
|||
From 231c56760c1e2ded21ad96bbb860b1f08c556c7a Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Mon, 21 Jan 2019 11:27:25 -0800
|
||||
Subject: [PATCH] x86-64 memcpy: Properly handle the length parameter [BZ#
|
||||
24097]
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
On x32, the size_t parameter may be passed in the lower 32 bits of a
|
||||
64-bit register with the non-zero upper 32 bits. The string/memory
|
||||
functions written in assembly can only use the lower 32 bits of a
|
||||
64-bit register as length or must clear the upper 32 bits before using
|
||||
the full 64-bit register for length.
|
||||
|
||||
This pach fixes memcpy for x32. Tested on x86-64 and x32. On x86-64,
|
||||
libc.so is the same with and withou the fix.
|
||||
|
||||
[BZ# 24097]
|
||||
CVE-2019-6488
|
||||
* sysdeps/x86_64/multiarch/memcpy-ssse3-back.S: Use RDX_LP for
|
||||
length. Clear the upper 32 bits of RDX register.
|
||||
* sysdeps/x86_64/multiarch/memcpy-ssse3.S: Likewise.
|
||||
* sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S:
|
||||
Likewise.
|
||||
* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:
|
||||
Likewise.
|
||||
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcpy.
|
||||
tst-size_t-wmemchr.
|
||||
* sysdeps/x86_64/x32/tst-size_t-memcpy.c: New file.
|
||||
---
|
||||
sysdeps/x86_64/multiarch/memcpy-ssse3-back.S | 17 ++++--
|
||||
sysdeps/x86_64/multiarch/memcpy-ssse3.S | 17 ++++--
|
||||
.../multiarch/memmove-avx512-no-vzeroupper.S | 16 +++--
|
||||
.../multiarch/memmove-vec-unaligned-erms.S | 54 +++++++++--------
|
||||
sysdeps/x86_64/x32/Makefile | 2 +-
|
||||
sysdeps/x86_64/x32/tst-size_t-memcpy.c | 58 +++++++++++++++++++
|
||||
6 files changed, 122 insertions(+), 42 deletions(-)
|
||||
create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcpy.c
|
||||
|
||||
Conflicts:
|
||||
ChangeLog
|
||||
(removed)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
|
||||
index 3cd11233..568eebd3 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
|
||||
@@ -45,28 +45,33 @@
|
||||
.section .text.ssse3,"ax",@progbits
|
||||
#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
|
||||
ENTRY (MEMPCPY_CHK)
|
||||
- cmpq %rdx, %rcx
|
||||
+ cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (MEMPCPY_CHK)
|
||||
|
||||
ENTRY (MEMPCPY)
|
||||
- movq %rdi, %rax
|
||||
- addq %rdx, %rax
|
||||
+ mov %RDI_LP, %RAX_LP
|
||||
+ add %RDX_LP, %RAX_LP
|
||||
jmp L(start)
|
||||
END (MEMPCPY)
|
||||
#endif
|
||||
|
||||
#if !defined USE_AS_BCOPY
|
||||
ENTRY (MEMCPY_CHK)
|
||||
- cmpq %rdx, %rcx
|
||||
+ cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (MEMCPY_CHK)
|
||||
#endif
|
||||
|
||||
ENTRY (MEMCPY)
|
||||
- mov %rdi, %rax
|
||||
+ mov %RDI_LP, %RAX_LP
|
||||
#ifdef USE_AS_MEMPCPY
|
||||
- add %rdx, %rax
|
||||
+ add %RDX_LP, %RAX_LP
|
||||
+#endif
|
||||
+
|
||||
+#ifdef __ILP32__
|
||||
+ /* Clear the upper 32 bits. */
|
||||
+ mov %edx, %edx
|
||||
#endif
|
||||
|
||||
#ifdef USE_AS_MEMMOVE
|
||||
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
|
||||
index 0240bfa3..0bd5ee99 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
|
||||
@@ -45,28 +45,33 @@
|
||||
.section .text.ssse3,"ax",@progbits
|
||||
#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
|
||||
ENTRY (MEMPCPY_CHK)
|
||||
- cmpq %rdx, %rcx
|
||||
+ cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (MEMPCPY_CHK)
|
||||
|
||||
ENTRY (MEMPCPY)
|
||||
- movq %rdi, %rax
|
||||
- addq %rdx, %rax
|
||||
+ mov %RDI_LP, %RAX_LP
|
||||
+ add %RDX_LP, %RAX_LP
|
||||
jmp L(start)
|
||||
END (MEMPCPY)
|
||||
#endif
|
||||
|
||||
#if !defined USE_AS_BCOPY
|
||||
ENTRY (MEMCPY_CHK)
|
||||
- cmpq %rdx, %rcx
|
||||
+ cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (MEMCPY_CHK)
|
||||
#endif
|
||||
|
||||
ENTRY (MEMCPY)
|
||||
- mov %rdi, %rax
|
||||
+ mov %RDI_LP, %RAX_LP
|
||||
#ifdef USE_AS_MEMPCPY
|
||||
- add %rdx, %rax
|
||||
+ add %RDX_LP, %RAX_LP
|
||||
+#endif
|
||||
+
|
||||
+#ifdef __ILP32__
|
||||
+ /* Clear the upper 32 bits. */
|
||||
+ mov %edx, %edx
|
||||
#endif
|
||||
|
||||
#ifdef USE_AS_MEMMOVE
|
||||
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
|
||||
index effc3ac2..6ca2bbc9 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
|
||||
@@ -24,27 +24,31 @@
|
||||
|
||||
.section .text.avx512,"ax",@progbits
|
||||
ENTRY (__mempcpy_chk_avx512_no_vzeroupper)
|
||||
- cmpq %rdx, %rcx
|
||||
+ cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (__mempcpy_chk_avx512_no_vzeroupper)
|
||||
|
||||
ENTRY (__mempcpy_avx512_no_vzeroupper)
|
||||
- movq %rdi, %rax
|
||||
- addq %rdx, %rax
|
||||
+ mov %RDI_LP, %RAX_LP
|
||||
+ add %RDX_LP, %RAX_LP
|
||||
jmp L(start)
|
||||
END (__mempcpy_avx512_no_vzeroupper)
|
||||
|
||||
ENTRY (__memmove_chk_avx512_no_vzeroupper)
|
||||
- cmpq %rdx, %rcx
|
||||
+ cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (__memmove_chk_avx512_no_vzeroupper)
|
||||
|
||||
ENTRY (__memmove_avx512_no_vzeroupper)
|
||||
- mov %rdi, %rax
|
||||
+ mov %RDI_LP, %RAX_LP
|
||||
# ifdef USE_AS_MEMPCPY
|
||||
- add %rdx, %rax
|
||||
+ add %RDX_LP, %RAX_LP
|
||||
# endif
|
||||
L(start):
|
||||
+# ifdef __ILP32__
|
||||
+ /* Clear the upper 32 bits. */
|
||||
+ mov %edx, %edx
|
||||
+# endif
|
||||
lea (%rsi, %rdx), %rcx
|
||||
lea (%rdi, %rdx), %r9
|
||||
cmp $512, %rdx
|
||||
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
||||
index c952576c..274aa1c7 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
||||
@@ -95,20 +95,20 @@
|
||||
.section SECTION(.text),"ax",@progbits
|
||||
#if defined SHARED && IS_IN (libc)
|
||||
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
|
||||
- cmpq %rdx, %rcx
|
||||
+ cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
|
||||
#endif
|
||||
|
||||
ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
|
||||
- movq %rdi, %rax
|
||||
- addq %rdx, %rax
|
||||
+ mov %RDI_LP, %RAX_LP
|
||||
+ add %RDX_LP, %RAX_LP
|
||||
jmp L(start)
|
||||
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
|
||||
|
||||
#if defined SHARED && IS_IN (libc)
|
||||
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
|
||||
- cmpq %rdx, %rcx
|
||||
+ cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
|
||||
#endif
|
||||
@@ -116,9 +116,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
|
||||
ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
|
||||
movq %rdi, %rax
|
||||
L(start):
|
||||
- cmpq $VEC_SIZE, %rdx
|
||||
+# ifdef __ILP32__
|
||||
+ /* Clear the upper 32 bits. */
|
||||
+ movl %edx, %edx
|
||||
+# endif
|
||||
+ cmp $VEC_SIZE, %RDX_LP
|
||||
jb L(less_vec)
|
||||
- cmpq $(VEC_SIZE * 2), %rdx
|
||||
+ cmp $(VEC_SIZE * 2), %RDX_LP
|
||||
ja L(more_2x_vec)
|
||||
#if !defined USE_MULTIARCH || !IS_IN (libc)
|
||||
L(last_2x_vec):
|
||||
@@ -138,38 +142,38 @@ END (MEMMOVE_SYMBOL (__memmove, unaligned))
|
||||
|
||||
# if VEC_SIZE == 16
|
||||
ENTRY (__mempcpy_chk_erms)
|
||||
- cmpq %rdx, %rcx
|
||||
+ cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (__mempcpy_chk_erms)
|
||||
|
||||
/* Only used to measure performance of REP MOVSB. */
|
||||
ENTRY (__mempcpy_erms)
|
||||
- movq %rdi, %rax
|
||||
+ mov %RDI_LP, %RAX_LP
|
||||
/* Skip zero length. */
|
||||
- testq %rdx, %rdx
|
||||
+ test %RDX_LP, %RDX_LP
|
||||
jz 2f
|
||||
- addq %rdx, %rax
|
||||
+ add %RDX_LP, %RAX_LP
|
||||
jmp L(start_movsb)
|
||||
END (__mempcpy_erms)
|
||||
|
||||
ENTRY (__memmove_chk_erms)
|
||||
- cmpq %rdx, %rcx
|
||||
+ cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (__memmove_chk_erms)
|
||||
|
||||
ENTRY (__memmove_erms)
|
||||
movq %rdi, %rax
|
||||
/* Skip zero length. */
|
||||
- testq %rdx, %rdx
|
||||
+ test %RDX_LP, %RDX_LP
|
||||
jz 2f
|
||||
L(start_movsb):
|
||||
- movq %rdx, %rcx
|
||||
- cmpq %rsi, %rdi
|
||||
+ mov %RDX_LP, %RCX_LP
|
||||
+ cmp %RSI_LP, %RDI_LP
|
||||
jb 1f
|
||||
/* Source == destination is less common. */
|
||||
je 2f
|
||||
- leaq (%rsi,%rcx), %rdx
|
||||
- cmpq %rdx, %rdi
|
||||
+ lea (%rsi,%rcx), %RDX_LP
|
||||
+ cmp %RDX_LP, %RDI_LP
|
||||
jb L(movsb_backward)
|
||||
1:
|
||||
rep movsb
|
||||
@@ -189,20 +193,20 @@ strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
|
||||
|
||||
# ifdef SHARED
|
||||
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
|
||||
- cmpq %rdx, %rcx
|
||||
+ cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
|
||||
# endif
|
||||
|
||||
ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
|
||||
- movq %rdi, %rax
|
||||
- addq %rdx, %rax
|
||||
+ mov %RDI_LP, %RAX_LP
|
||||
+ add %RDX_LP, %RAX_LP
|
||||
jmp L(start_erms)
|
||||
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
|
||||
|
||||
# ifdef SHARED
|
||||
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
|
||||
- cmpq %rdx, %rcx
|
||||
+ cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
|
||||
# endif
|
||||
@@ -210,9 +214,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
|
||||
ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
|
||||
movq %rdi, %rax
|
||||
L(start_erms):
|
||||
- cmpq $VEC_SIZE, %rdx
|
||||
+# ifdef __ILP32__
|
||||
+ /* Clear the upper 32 bits. */
|
||||
+ movl %edx, %edx
|
||||
+# endif
|
||||
+ cmp $VEC_SIZE, %RDX_LP
|
||||
jb L(less_vec)
|
||||
- cmpq $(VEC_SIZE * 2), %rdx
|
||||
+ cmp $(VEC_SIZE * 2), %RDX_LP
|
||||
ja L(movsb_more_2x_vec)
|
||||
L(last_2x_vec):
|
||||
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
|
||||
@@ -236,7 +244,7 @@ L(movsb):
|
||||
/* Avoid slow backward REP MOVSB. */
|
||||
jb L(more_8x_vec_backward)
|
||||
1:
|
||||
- movq %rdx, %rcx
|
||||
+ mov %RDX_LP, %RCX_LP
|
||||
rep movsb
|
||||
L(nop):
|
||||
ret
|
||||
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
|
||||
index ddec7f04..2fe1e5ac 100644
|
||||
--- a/sysdeps/x86_64/x32/Makefile
|
||||
+++ b/sysdeps/x86_64/x32/Makefile
|
||||
@@ -6,7 +6,7 @@ CFLAGS-s_llround.c += -fno-builtin-lround
|
||||
endif
|
||||
|
||||
ifeq ($(subdir),string)
|
||||
-tests += tst-size_t-memchr tst-size_t-memcmp
|
||||
+tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy
|
||||
endif
|
||||
|
||||
ifeq ($(subdir),wcsmbs)
|
||||
diff --git a/sysdeps/x86_64/x32/tst-size_t-memcpy.c b/sysdeps/x86_64/x32/tst-size_t-memcpy.c
|
||||
new file mode 100644
|
||||
index 00000000..66b71e17
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/x32/tst-size_t-memcpy.c
|
||||
@@ -0,0 +1,58 @@
|
||||
+/* Test memcpy with size_t in the lower 32 bits of 64-bit register.
|
||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <http://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#define TEST_NAME "memcpy"
|
||||
+#include "test-size_t.h"
|
||||
+
|
||||
+IMPL (memcpy, 1)
|
||||
+
|
||||
+typedef void *(*proto_t) (void *, const void *, size_t);
|
||||
+
|
||||
+static void *
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+do_memcpy (parameter_t a, parameter_t b)
|
||||
+{
|
||||
+ return CALL (&b, a.p, b.p, a.len);
|
||||
+}
|
||||
+
|
||||
+static int
|
||||
+test_main (void)
|
||||
+{
|
||||
+ test_init ();
|
||||
+
|
||||
+ parameter_t dest = { { page_size }, buf1 };
|
||||
+ parameter_t src = { { 0 }, buf2 };
|
||||
+
|
||||
+ int ret = 0;
|
||||
+ FOR_EACH_IMPL (impl, 0)
|
||||
+ {
|
||||
+ src.fn = impl->fn;
|
||||
+ do_memcpy (dest, src);
|
||||
+ int res = memcmp (dest.p, src.p, dest.len);
|
||||
+ if (res)
|
||||
+ {
|
||||
+ error (0, 0, "Wrong result in function %s: %i != 0",
|
||||
+ impl->name, res);
|
||||
+ ret = 1;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
|
||||
+}
|
||||
+
|
||||
+#include <support/test-driver.c>
|
||||
--
|
||||
GitLab
|
||||
|
|
@ -1,497 +0,0 @@
|
|||
From a775a7a3eb1e85b54af0b4ee5ff4dcf66772a1fb Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Wed, 23 Jun 2021 01:56:29 -0400
|
||||
Subject: [PATCH] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 [BZ
|
||||
#27974]
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
This commit fixes the bug mentioned in the previous commit.
|
||||
|
||||
The previous implementations of wmemchr in these files relied
|
||||
on maxlen * sizeof(wchar_t) which was not guranteed by the standard.
|
||||
|
||||
The new overflow tests added in the previous commit now
|
||||
pass (As well as all the other tests).
|
||||
|
||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/strlen-avx2.S | 130 ++++++++++++++++++-------
|
||||
sysdeps/x86_64/multiarch/strlen-vec.S | 15 ++-
|
||||
2 files changed, 107 insertions(+), 38 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
|
||||
index be8a5db5..37688966 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
|
||||
@@ -44,21 +44,21 @@
|
||||
|
||||
# define VEC_SIZE 32
|
||||
# define PAGE_SIZE 4096
|
||||
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
|
||||
|
||||
.section SECTION(.text),"ax",@progbits
|
||||
ENTRY (STRLEN)
|
||||
# ifdef USE_AS_STRNLEN
|
||||
/* Check zero length. */
|
||||
+# ifdef __ILP32__
|
||||
+ /* Clear upper bits. */
|
||||
+ and %RSI_LP, %RSI_LP
|
||||
+# else
|
||||
test %RSI_LP, %RSI_LP
|
||||
+# endif
|
||||
jz L(zero)
|
||||
/* Store max len in R8_LP before adjusting if using WCSLEN. */
|
||||
mov %RSI_LP, %R8_LP
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- shl $2, %RSI_LP
|
||||
-# elif defined __ILP32__
|
||||
- /* Clear the upper 32 bits. */
|
||||
- movl %esi, %esi
|
||||
-# endif
|
||||
# endif
|
||||
movl %edi, %eax
|
||||
movq %rdi, %rdx
|
||||
@@ -72,10 +72,10 @@ ENTRY (STRLEN)
|
||||
|
||||
/* Check the first VEC_SIZE bytes. */
|
||||
VPCMPEQ (%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
# ifdef USE_AS_STRNLEN
|
||||
/* If length < VEC_SIZE handle special. */
|
||||
- cmpq $VEC_SIZE, %rsi
|
||||
+ cmpq $CHAR_PER_VEC, %rsi
|
||||
jbe L(first_vec_x0)
|
||||
# endif
|
||||
/* If empty continue to aligned_more. Otherwise return bit
|
||||
@@ -84,6 +84,7 @@ ENTRY (STRLEN)
|
||||
jz L(aligned_more)
|
||||
tzcntl %eax, %eax
|
||||
# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
||||
shrl $2, %eax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
@@ -97,9 +98,14 @@ L(zero):
|
||||
L(first_vec_x0):
|
||||
/* Set bit for max len so that tzcnt will return min of max len
|
||||
and position of first match. */
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Multiply length by 4 to get byte count. */
|
||||
+ sall $2, %esi
|
||||
+# endif
|
||||
btsq %rsi, %rax
|
||||
tzcntl %eax, %eax
|
||||
# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
||||
shrl $2, %eax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
@@ -113,14 +119,19 @@ L(first_vec_x1):
|
||||
# ifdef USE_AS_STRNLEN
|
||||
/* Use ecx which was computed earlier to compute correct value.
|
||||
*/
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ leal -(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
|
||||
+# else
|
||||
subl $(VEC_SIZE * 4 + 1), %ecx
|
||||
addl %ecx, %eax
|
||||
+# endif
|
||||
# else
|
||||
subl %edx, %edi
|
||||
incl %edi
|
||||
addl %edi, %eax
|
||||
# endif
|
||||
# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
||||
shrl $2, %eax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
@@ -133,14 +144,19 @@ L(first_vec_x2):
|
||||
# ifdef USE_AS_STRNLEN
|
||||
/* Use ecx which was computed earlier to compute correct value.
|
||||
*/
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ leal -(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
|
||||
+# else
|
||||
subl $(VEC_SIZE * 3 + 1), %ecx
|
||||
addl %ecx, %eax
|
||||
+# endif
|
||||
# else
|
||||
subl %edx, %edi
|
||||
addl $(VEC_SIZE + 1), %edi
|
||||
addl %edi, %eax
|
||||
# endif
|
||||
# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
||||
shrl $2, %eax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
@@ -153,14 +169,19 @@ L(first_vec_x3):
|
||||
# ifdef USE_AS_STRNLEN
|
||||
/* Use ecx which was computed earlier to compute correct value.
|
||||
*/
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ leal -(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
|
||||
+# else
|
||||
subl $(VEC_SIZE * 2 + 1), %ecx
|
||||
addl %ecx, %eax
|
||||
+# endif
|
||||
# else
|
||||
subl %edx, %edi
|
||||
addl $(VEC_SIZE * 2 + 1), %edi
|
||||
addl %edi, %eax
|
||||
# endif
|
||||
# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
||||
shrl $2, %eax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
@@ -173,14 +194,19 @@ L(first_vec_x4):
|
||||
# ifdef USE_AS_STRNLEN
|
||||
/* Use ecx which was computed earlier to compute correct value.
|
||||
*/
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ leal -(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
|
||||
+# else
|
||||
subl $(VEC_SIZE + 1), %ecx
|
||||
addl %ecx, %eax
|
||||
+# endif
|
||||
# else
|
||||
subl %edx, %edi
|
||||
addl $(VEC_SIZE * 3 + 1), %edi
|
||||
addl %edi, %eax
|
||||
# endif
|
||||
# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
||||
shrl $2, %eax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
@@ -195,10 +221,14 @@ L(cross_page_continue):
|
||||
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
|
||||
since data is only aligned to VEC_SIZE. */
|
||||
# ifdef USE_AS_STRNLEN
|
||||
- /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
|
||||
- it simplies the logic in last_4x_vec_or_less. */
|
||||
+ /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
|
||||
+ because it simplies the logic in last_4x_vec_or_less. */
|
||||
leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
|
||||
subq %rdx, %rcx
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
||||
+ sarl $2, %ecx
|
||||
+# endif
|
||||
# endif
|
||||
/* Load first VEC regardless. */
|
||||
VPCMPEQ 1(%rdi), %ymm0, %ymm1
|
||||
@@ -207,34 +237,38 @@ L(cross_page_continue):
|
||||
subq %rcx, %rsi
|
||||
jb L(last_4x_vec_or_less)
|
||||
# endif
|
||||
- vpmovmskb %ymm1, %eax
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(first_vec_x1)
|
||||
|
||||
VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(first_vec_x2)
|
||||
|
||||
VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(first_vec_x3)
|
||||
|
||||
VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(first_vec_x4)
|
||||
|
||||
/* Align data to VEC_SIZE * 4 - 1. */
|
||||
# ifdef USE_AS_STRNLEN
|
||||
/* Before adjusting length check if at last VEC_SIZE * 4. */
|
||||
- cmpq $(VEC_SIZE * 4 - 1), %rsi
|
||||
+ cmpq $(CHAR_PER_VEC * 4 - 1), %rsi
|
||||
jbe L(last_4x_vec_or_less_load)
|
||||
incq %rdi
|
||||
movl %edi, %ecx
|
||||
orq $(VEC_SIZE * 4 - 1), %rdi
|
||||
andl $(VEC_SIZE * 4 - 1), %ecx
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
||||
+ sarl $2, %ecx
|
||||
+# endif
|
||||
/* Readjust length. */
|
||||
addq %rcx, %rsi
|
||||
# else
|
||||
@@ -246,13 +280,13 @@ L(cross_page_continue):
|
||||
L(loop_4x_vec):
|
||||
# ifdef USE_AS_STRNLEN
|
||||
/* Break if at end of length. */
|
||||
- subq $(VEC_SIZE * 4), %rsi
|
||||
+ subq $(CHAR_PER_VEC * 4), %rsi
|
||||
jb L(last_4x_vec_or_less_cmpeq)
|
||||
# endif
|
||||
- /* Save some code size by microfusing VPMINU with the load. Since
|
||||
- the matches in ymm2/ymm4 can only be returned if there where no
|
||||
- matches in ymm1/ymm3 respectively there is no issue with overlap.
|
||||
- */
|
||||
+ /* Save some code size by microfusing VPMINU with the load.
|
||||
+ Since the matches in ymm2/ymm4 can only be returned if there
|
||||
+ where no matches in ymm1/ymm3 respectively there is no issue
|
||||
+ with overlap. */
|
||||
vmovdqa 1(%rdi), %ymm1
|
||||
VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
|
||||
vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
|
||||
@@ -260,7 +294,7 @@ L(loop_4x_vec):
|
||||
|
||||
VPMINU %ymm2, %ymm4, %ymm5
|
||||
VPCMPEQ %ymm5, %ymm0, %ymm5
|
||||
- vpmovmskb %ymm5, %ecx
|
||||
+ vpmovmskb %ymm5, %ecx
|
||||
|
||||
subq $-(VEC_SIZE * 4), %rdi
|
||||
testl %ecx, %ecx
|
||||
@@ -268,27 +302,28 @@ L(loop_4x_vec):
|
||||
|
||||
|
||||
VPCMPEQ %ymm1, %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
subq %rdx, %rdi
|
||||
testl %eax, %eax
|
||||
jnz L(last_vec_return_x0)
|
||||
|
||||
VPCMPEQ %ymm2, %ymm0, %ymm2
|
||||
- vpmovmskb %ymm2, %eax
|
||||
+ vpmovmskb %ymm2, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(last_vec_return_x1)
|
||||
|
||||
/* Combine last 2 VEC. */
|
||||
VPCMPEQ %ymm3, %ymm0, %ymm3
|
||||
- vpmovmskb %ymm3, %eax
|
||||
- /* rcx has combined result from all 4 VEC. It will only be used if
|
||||
- the first 3 other VEC all did not contain a match. */
|
||||
+ vpmovmskb %ymm3, %eax
|
||||
+ /* rcx has combined result from all 4 VEC. It will only be used
|
||||
+ if the first 3 other VEC all did not contain a match. */
|
||||
salq $32, %rcx
|
||||
orq %rcx, %rax
|
||||
tzcntq %rax, %rax
|
||||
subq $(VEC_SIZE * 2 - 1), %rdi
|
||||
addq %rdi, %rax
|
||||
# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
||||
shrq $2, %rax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
@@ -297,15 +332,19 @@ L(loop_4x_vec):
|
||||
# ifdef USE_AS_STRNLEN
|
||||
.p2align 4
|
||||
L(last_4x_vec_or_less_load):
|
||||
- /* Depending on entry adjust rdi / prepare first VEC in ymm1. */
|
||||
+ /* Depending on entry adjust rdi / prepare first VEC in ymm1.
|
||||
+ */
|
||||
subq $-(VEC_SIZE * 4), %rdi
|
||||
L(last_4x_vec_or_less_cmpeq):
|
||||
VPCMPEQ 1(%rdi), %ymm0, %ymm1
|
||||
L(last_4x_vec_or_less):
|
||||
-
|
||||
- vpmovmskb %ymm1, %eax
|
||||
- /* If remaining length > VEC_SIZE * 2. This works if esi is off by
|
||||
- VEC_SIZE * 4. */
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Multiply length by 4 to get byte count. */
|
||||
+ sall $2, %esi
|
||||
+# endif
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
+ /* If remaining length > VEC_SIZE * 2. This works if esi is off
|
||||
+ by VEC_SIZE * 4. */
|
||||
testl $(VEC_SIZE * 2), %esi
|
||||
jnz L(last_4x_vec)
|
||||
|
||||
@@ -320,7 +359,7 @@ L(last_4x_vec_or_less):
|
||||
jb L(max)
|
||||
|
||||
VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
tzcntl %eax, %eax
|
||||
/* Check the end of data. */
|
||||
cmpl %eax, %esi
|
||||
@@ -329,6 +368,7 @@ L(last_4x_vec_or_less):
|
||||
addl $(VEC_SIZE + 1), %eax
|
||||
addq %rdi, %rax
|
||||
# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
||||
shrq $2, %rax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
@@ -340,6 +380,7 @@ L(last_vec_return_x0):
|
||||
subq $(VEC_SIZE * 4 - 1), %rdi
|
||||
addq %rdi, %rax
|
||||
# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
||||
shrq $2, %rax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
@@ -350,6 +391,7 @@ L(last_vec_return_x1):
|
||||
subq $(VEC_SIZE * 3 - 1), %rdi
|
||||
addq %rdi, %rax
|
||||
# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
||||
shrq $2, %rax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
@@ -366,6 +408,7 @@ L(last_vec_x1_check):
|
||||
incl %eax
|
||||
addq %rdi, %rax
|
||||
# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
||||
shrq $2, %rax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
@@ -381,14 +424,14 @@ L(last_4x_vec):
|
||||
jnz L(last_vec_x1)
|
||||
|
||||
VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(last_vec_x2)
|
||||
|
||||
/* Normalize length. */
|
||||
andl $(VEC_SIZE * 4 - 1), %esi
|
||||
VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(last_vec_x3)
|
||||
|
||||
@@ -396,7 +439,7 @@ L(last_4x_vec):
|
||||
jb L(max)
|
||||
|
||||
VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
tzcntl %eax, %eax
|
||||
/* Check the end of data. */
|
||||
cmpl %eax, %esi
|
||||
@@ -405,6 +448,7 @@ L(last_4x_vec):
|
||||
addl $(VEC_SIZE * 3 + 1), %eax
|
||||
addq %rdi, %rax
|
||||
# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
||||
shrq $2, %rax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
@@ -419,6 +463,7 @@ L(last_vec_x1):
|
||||
incl %eax
|
||||
addq %rdi, %rax
|
||||
# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
||||
shrq $2, %rax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
@@ -432,6 +477,7 @@ L(last_vec_x2):
|
||||
addl $(VEC_SIZE + 1), %eax
|
||||
addq %rdi, %rax
|
||||
# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
||||
shrq $2, %rax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
@@ -447,6 +493,7 @@ L(last_vec_x3):
|
||||
addl $(VEC_SIZE * 2 + 1), %eax
|
||||
addq %rdi, %rax
|
||||
# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
||||
shrq $2, %rax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
@@ -455,13 +502,13 @@ L(max_end):
|
||||
VZEROUPPER_RETURN
|
||||
# endif
|
||||
|
||||
- /* Cold case for crossing page with first load. */
|
||||
+ /* Cold case for crossing page with first load. */
|
||||
.p2align 4
|
||||
L(cross_page_boundary):
|
||||
/* Align data to VEC_SIZE - 1. */
|
||||
orq $(VEC_SIZE - 1), %rdi
|
||||
VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
|
||||
so no need to manually mod rdx. */
|
||||
sarxl %edx, %eax, %eax
|
||||
@@ -470,6 +517,10 @@ L(cross_page_boundary):
|
||||
jnz L(cross_page_less_vec)
|
||||
leaq 1(%rdi), %rcx
|
||||
subq %rdx, %rcx
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
||||
+ shrl $2, %ecx
|
||||
+# endif
|
||||
/* Check length. */
|
||||
cmpq %rsi, %rcx
|
||||
jb L(cross_page_continue)
|
||||
@@ -479,6 +530,7 @@ L(cross_page_boundary):
|
||||
jz L(cross_page_continue)
|
||||
tzcntl %eax, %eax
|
||||
# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide length by 4 to get wchar_t count. */
|
||||
shrl $2, %eax
|
||||
# endif
|
||||
# endif
|
||||
@@ -489,6 +541,10 @@ L(return_vzeroupper):
|
||||
.p2align 4
|
||||
L(cross_page_less_vec):
|
||||
tzcntl %eax, %eax
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Multiply length by 4 to get byte count. */
|
||||
+ sall $2, %esi
|
||||
+# endif
|
||||
cmpq %rax, %rsi
|
||||
cmovb %esi, %eax
|
||||
# ifdef USE_AS_WCSLEN
|
||||
diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
|
||||
index 8f660bb9..439e486a 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strlen-vec.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strlen-vec.S
|
||||
@@ -65,12 +65,25 @@ ENTRY(strlen)
|
||||
ret
|
||||
L(n_nonzero):
|
||||
# ifdef AS_WCSLEN
|
||||
- shl $2, %RSI_LP
|
||||
+/* Check for overflow from maxlen * sizeof(wchar_t). If it would
|
||||
+ overflow the only way this program doesn't have undefined behavior
|
||||
+ is if there is a null terminator in valid memory so wcslen will
|
||||
+ suffice. */
|
||||
+ mov %RSI_LP, %R10_LP
|
||||
+ sar $62, %R10_LP
|
||||
+ test %R10_LP, %R10_LP
|
||||
+ jnz __wcslen_sse4_1
|
||||
+ sal $2, %RSI_LP
|
||||
# endif
|
||||
|
||||
+
|
||||
/* Initialize long lived registers. */
|
||||
|
||||
add %RDI_LP, %RSI_LP
|
||||
+# ifdef AS_WCSLEN
|
||||
+/* Check for overflow again from s + maxlen * sizeof(wchar_t). */
|
||||
+ jbe __wcslen_sse4_1
|
||||
+# endif
|
||||
mov %RSI_LP, %R10_LP
|
||||
and $-64, %R10_LP
|
||||
mov %RSI_LP, %R11_LP
|
||||
--
|
||||
GitLab
|
||||
|
|
@ -1,745 +0,0 @@
|
|||
From 4ba65586847751372520a36757c17f114588794e Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Mon, 19 Apr 2021 19:36:06 -0400
|
||||
Subject: [PATCH] x86: Optimize strlen-evex.S
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
No bug. This commit optimizes strlen-evex.S. The
|
||||
optimizations are mostly small things but they add up to roughly
|
||||
10-30% performance improvement for strlen. The results for strnlen are
|
||||
bit more ambiguous. test-strlen, test-strnlen, test-wcslen, and
|
||||
test-wcsnlen are all passing.
|
||||
|
||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/strlen-evex.S | 581 ++++++++++++++-----------
|
||||
1 file changed, 317 insertions(+), 264 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
|
||||
index 05838190..4bf6874b 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strlen-evex.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strlen-evex.S
|
||||
@@ -29,11 +29,13 @@
|
||||
# ifdef USE_AS_WCSLEN
|
||||
# define VPCMP vpcmpd
|
||||
# define VPMINU vpminud
|
||||
-# define SHIFT_REG r9d
|
||||
+# define SHIFT_REG ecx
|
||||
+# define CHAR_SIZE 4
|
||||
# else
|
||||
# define VPCMP vpcmpb
|
||||
# define VPMINU vpminub
|
||||
-# define SHIFT_REG ecx
|
||||
+# define SHIFT_REG edx
|
||||
+# define CHAR_SIZE 1
|
||||
# endif
|
||||
|
||||
# define XMMZERO xmm16
|
||||
@@ -46,132 +48,165 @@
|
||||
# define YMM6 ymm22
|
||||
|
||||
# define VEC_SIZE 32
|
||||
+# define PAGE_SIZE 4096
|
||||
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
|
||||
|
||||
.section .text.evex,"ax",@progbits
|
||||
ENTRY (STRLEN)
|
||||
# ifdef USE_AS_STRNLEN
|
||||
- /* Check for zero length. */
|
||||
+ /* Check zero length. */
|
||||
test %RSI_LP, %RSI_LP
|
||||
jz L(zero)
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- shl $2, %RSI_LP
|
||||
-# elif defined __ILP32__
|
||||
+# ifdef __ILP32__
|
||||
/* Clear the upper 32 bits. */
|
||||
movl %esi, %esi
|
||||
# endif
|
||||
mov %RSI_LP, %R8_LP
|
||||
# endif
|
||||
- movl %edi, %ecx
|
||||
- movq %rdi, %rdx
|
||||
+ movl %edi, %eax
|
||||
vpxorq %XMMZERO, %XMMZERO, %XMMZERO
|
||||
-
|
||||
+ /* Clear high bits from edi. Only keeping bits relevant to page
|
||||
+ cross check. */
|
||||
+ andl $(PAGE_SIZE - 1), %eax
|
||||
/* Check if we may cross page boundary with one vector load. */
|
||||
- andl $(2 * VEC_SIZE - 1), %ecx
|
||||
- cmpl $VEC_SIZE, %ecx
|
||||
- ja L(cros_page_boundary)
|
||||
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
||||
+ ja L(cross_page_boundary)
|
||||
|
||||
/* Check the first VEC_SIZE bytes. Each bit in K0 represents a
|
||||
null byte. */
|
||||
VPCMP $0, (%rdi), %YMMZERO, %k0
|
||||
kmovd %k0, %eax
|
||||
- testl %eax, %eax
|
||||
-
|
||||
# ifdef USE_AS_STRNLEN
|
||||
- jnz L(first_vec_x0_check)
|
||||
- /* Adjust length and check the end of data. */
|
||||
- subq $VEC_SIZE, %rsi
|
||||
- jbe L(max)
|
||||
-# else
|
||||
- jnz L(first_vec_x0)
|
||||
+ /* If length < CHAR_PER_VEC handle special. */
|
||||
+ cmpq $CHAR_PER_VEC, %rsi
|
||||
+ jbe L(first_vec_x0)
|
||||
# endif
|
||||
-
|
||||
- /* Align data for aligned loads in the loop. */
|
||||
- addq $VEC_SIZE, %rdi
|
||||
- andl $(VEC_SIZE - 1), %ecx
|
||||
- andq $-VEC_SIZE, %rdi
|
||||
-
|
||||
+ testl %eax, %eax
|
||||
+ jz L(aligned_more)
|
||||
+ tzcntl %eax, %eax
|
||||
+ ret
|
||||
# ifdef USE_AS_STRNLEN
|
||||
- /* Adjust length. */
|
||||
- addq %rcx, %rsi
|
||||
+L(zero):
|
||||
+ xorl %eax, %eax
|
||||
+ ret
|
||||
|
||||
- subq $(VEC_SIZE * 4), %rsi
|
||||
- jbe L(last_4x_vec_or_less)
|
||||
+ .p2align 4
|
||||
+L(first_vec_x0):
|
||||
+ /* Set bit for max len so that tzcnt will return min of max len
|
||||
+ and position of first match. */
|
||||
+ btsq %rsi, %rax
|
||||
+ tzcntl %eax, %eax
|
||||
+ ret
|
||||
# endif
|
||||
- jmp L(more_4x_vec)
|
||||
|
||||
.p2align 4
|
||||
-L(cros_page_boundary):
|
||||
- andl $(VEC_SIZE - 1), %ecx
|
||||
- andq $-VEC_SIZE, %rdi
|
||||
-
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- /* NB: Divide shift count by 4 since each bit in K0 represent 4
|
||||
- bytes. */
|
||||
- movl %ecx, %SHIFT_REG
|
||||
- sarl $2, %SHIFT_REG
|
||||
+L(first_vec_x1):
|
||||
+ tzcntl %eax, %eax
|
||||
+ /* Safe to use 32 bit instructions as these are only called for
|
||||
+ size = [1, 159]. */
|
||||
+# ifdef USE_AS_STRNLEN
|
||||
+ /* Use ecx which was computed earlier to compute correct value.
|
||||
+ */
|
||||
+ leal -(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax
|
||||
+# else
|
||||
+ subl %edx, %edi
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
||||
+ sarl $2, %edi
|
||||
+# endif
|
||||
+ leal CHAR_PER_VEC(%rdi, %rax), %eax
|
||||
# endif
|
||||
- VPCMP $0, (%rdi), %YMMZERO, %k0
|
||||
- kmovd %k0, %eax
|
||||
+ ret
|
||||
|
||||
- /* Remove the leading bytes. */
|
||||
- sarxl %SHIFT_REG, %eax, %eax
|
||||
- testl %eax, %eax
|
||||
- jz L(aligned_more)
|
||||
+ .p2align 4
|
||||
+L(first_vec_x2):
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- sall $2, %eax
|
||||
-# endif
|
||||
+ /* Safe to use 32 bit instructions as these are only called for
|
||||
+ size = [1, 159]. */
|
||||
# ifdef USE_AS_STRNLEN
|
||||
- /* Check the end of data. */
|
||||
- cmpq %rax, %rsi
|
||||
- jbe L(max)
|
||||
-# endif
|
||||
- addq %rdi, %rax
|
||||
- addq %rcx, %rax
|
||||
- subq %rdx, %rax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- shrq $2, %rax
|
||||
+ /* Use ecx which was computed earlier to compute correct value.
|
||||
+ */
|
||||
+ leal -(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax
|
||||
+# else
|
||||
+ subl %edx, %edi
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
||||
+ sarl $2, %edi
|
||||
+# endif
|
||||
+ leal (CHAR_PER_VEC * 2)(%rdi, %rax), %eax
|
||||
# endif
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
-L(aligned_more):
|
||||
+L(first_vec_x3):
|
||||
+ tzcntl %eax, %eax
|
||||
+ /* Safe to use 32 bit instructions as these are only called for
|
||||
+ size = [1, 159]. */
|
||||
# ifdef USE_AS_STRNLEN
|
||||
- /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE"
|
||||
- with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
|
||||
- to void possible addition overflow. */
|
||||
- negq %rcx
|
||||
- addq $VEC_SIZE, %rcx
|
||||
-
|
||||
- /* Check the end of data. */
|
||||
- subq %rcx, %rsi
|
||||
- jbe L(max)
|
||||
+ /* Use ecx which was computed earlier to compute correct value.
|
||||
+ */
|
||||
+ leal -(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax
|
||||
+# else
|
||||
+ subl %edx, %edi
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
||||
+ sarl $2, %edi
|
||||
+# endif
|
||||
+ leal (CHAR_PER_VEC * 3)(%rdi, %rax), %eax
|
||||
# endif
|
||||
+ ret
|
||||
|
||||
- addq $VEC_SIZE, %rdi
|
||||
-
|
||||
+ .p2align 4
|
||||
+L(first_vec_x4):
|
||||
+ tzcntl %eax, %eax
|
||||
+ /* Safe to use 32 bit instructions as these are only called for
|
||||
+ size = [1, 159]. */
|
||||
# ifdef USE_AS_STRNLEN
|
||||
- subq $(VEC_SIZE * 4), %rsi
|
||||
- jbe L(last_4x_vec_or_less)
|
||||
+ /* Use ecx which was computed earlier to compute correct value.
|
||||
+ */
|
||||
+ leal -(CHAR_PER_VEC + 1)(%rcx, %rax), %eax
|
||||
+# else
|
||||
+ subl %edx, %edi
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
||||
+ sarl $2, %edi
|
||||
+# endif
|
||||
+ leal (CHAR_PER_VEC * 4)(%rdi, %rax), %eax
|
||||
# endif
|
||||
+ ret
|
||||
|
||||
-L(more_4x_vec):
|
||||
+ .p2align 5
|
||||
+L(aligned_more):
|
||||
+ movq %rdi, %rdx
|
||||
+ /* Align data to VEC_SIZE. */
|
||||
+ andq $-(VEC_SIZE), %rdi
|
||||
+L(cross_page_continue):
|
||||
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
|
||||
since data is only aligned to VEC_SIZE. */
|
||||
- VPCMP $0, (%rdi), %YMMZERO, %k0
|
||||
- kmovd %k0, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x0)
|
||||
-
|
||||
+# ifdef USE_AS_STRNLEN
|
||||
+ /* + CHAR_SIZE because it simplies the logic in
|
||||
+ last_4x_vec_or_less. */
|
||||
+ leaq (VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx
|
||||
+ subq %rdx, %rcx
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
||||
+ sarl $2, %ecx
|
||||
+# endif
|
||||
+# endif
|
||||
+ /* Load first VEC regardless. */
|
||||
VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0
|
||||
+# ifdef USE_AS_STRNLEN
|
||||
+ /* Adjust length. If near end handle specially. */
|
||||
+ subq %rcx, %rsi
|
||||
+ jb L(last_4x_vec_or_less)
|
||||
+# endif
|
||||
kmovd %k0, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(first_vec_x1)
|
||||
|
||||
VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
|
||||
kmovd %k0, %eax
|
||||
- testl %eax, %eax
|
||||
+ test %eax, %eax
|
||||
jnz L(first_vec_x2)
|
||||
|
||||
VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
|
||||
@@ -179,258 +214,276 @@ L(more_4x_vec):
|
||||
testl %eax, %eax
|
||||
jnz L(first_vec_x3)
|
||||
|
||||
- addq $(VEC_SIZE * 4), %rdi
|
||||
-
|
||||
-# ifdef USE_AS_STRNLEN
|
||||
- subq $(VEC_SIZE * 4), %rsi
|
||||
- jbe L(last_4x_vec_or_less)
|
||||
-# endif
|
||||
-
|
||||
- /* Align data to 4 * VEC_SIZE. */
|
||||
- movq %rdi, %rcx
|
||||
- andl $(4 * VEC_SIZE - 1), %ecx
|
||||
- andq $-(4 * VEC_SIZE), %rdi
|
||||
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(first_vec_x4)
|
||||
|
||||
+ addq $VEC_SIZE, %rdi
|
||||
# ifdef USE_AS_STRNLEN
|
||||
- /* Adjust length. */
|
||||
+ /* Check if at last VEC_SIZE * 4 length. */
|
||||
+ cmpq $(CHAR_PER_VEC * 4 - 1), %rsi
|
||||
+ jbe L(last_4x_vec_or_less_load)
|
||||
+ movl %edi, %ecx
|
||||
+ andl $(VEC_SIZE * 4 - 1), %ecx
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
||||
+ sarl $2, %ecx
|
||||
+# endif
|
||||
+ /* Readjust length. */
|
||||
addq %rcx, %rsi
|
||||
# endif
|
||||
+ /* Align data to VEC_SIZE * 4. */
|
||||
+ andq $-(VEC_SIZE * 4), %rdi
|
||||
|
||||
+ /* Compare 4 * VEC at a time forward. */
|
||||
.p2align 4
|
||||
L(loop_4x_vec):
|
||||
- /* Compare 4 * VEC at a time forward. */
|
||||
- VMOVA (%rdi), %YMM1
|
||||
- VMOVA VEC_SIZE(%rdi), %YMM2
|
||||
- VMOVA (VEC_SIZE * 2)(%rdi), %YMM3
|
||||
- VMOVA (VEC_SIZE * 3)(%rdi), %YMM4
|
||||
-
|
||||
- VPMINU %YMM1, %YMM2, %YMM5
|
||||
- VPMINU %YMM3, %YMM4, %YMM6
|
||||
+ /* Load first VEC regardless. */
|
||||
+ VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
|
||||
+# ifdef USE_AS_STRNLEN
|
||||
+ /* Break if at end of length. */
|
||||
+ subq $(CHAR_PER_VEC * 4), %rsi
|
||||
+ jb L(last_4x_vec_or_less_cmpeq)
|
||||
+# endif
|
||||
+ /* Save some code size by microfusing VPMINU with the load. Since
|
||||
+ the matches in ymm2/ymm4 can only be returned if there where no
|
||||
+ matches in ymm1/ymm3 respectively there is no issue with overlap.
|
||||
+ */
|
||||
+ VPMINU (VEC_SIZE * 5)(%rdi), %YMM1, %YMM2
|
||||
+ VMOVA (VEC_SIZE * 6)(%rdi), %YMM3
|
||||
+ VPMINU (VEC_SIZE * 7)(%rdi), %YMM3, %YMM4
|
||||
+
|
||||
+ VPCMP $0, %YMM2, %YMMZERO, %k0
|
||||
+ VPCMP $0, %YMM4, %YMMZERO, %k1
|
||||
+ subq $-(VEC_SIZE * 4), %rdi
|
||||
+ kortestd %k0, %k1
|
||||
+ jz L(loop_4x_vec)
|
||||
+
|
||||
+ /* Check if end was in first half. */
|
||||
+ kmovd %k0, %eax
|
||||
+ subq %rdx, %rdi
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ shrq $2, %rdi
|
||||
+# endif
|
||||
+ testl %eax, %eax
|
||||
+ jz L(second_vec_return)
|
||||
|
||||
- VPMINU %YMM5, %YMM6, %YMM5
|
||||
- VPCMP $0, %YMM5, %YMMZERO, %k0
|
||||
- ktestd %k0, %k0
|
||||
- jnz L(4x_vec_end)
|
||||
+ VPCMP $0, %YMM1, %YMMZERO, %k2
|
||||
+ kmovd %k2, %edx
|
||||
+ /* Combine VEC1 matches (edx) with VEC2 matches (eax). */
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ sall $CHAR_PER_VEC, %eax
|
||||
+ orl %edx, %eax
|
||||
+ tzcntl %eax, %eax
|
||||
+# else
|
||||
+ salq $CHAR_PER_VEC, %rax
|
||||
+ orq %rdx, %rax
|
||||
+ tzcntq %rax, %rax
|
||||
+# endif
|
||||
+ addq %rdi, %rax
|
||||
+ ret
|
||||
|
||||
- addq $(VEC_SIZE * 4), %rdi
|
||||
|
||||
-# ifndef USE_AS_STRNLEN
|
||||
- jmp L(loop_4x_vec)
|
||||
-# else
|
||||
- subq $(VEC_SIZE * 4), %rsi
|
||||
- ja L(loop_4x_vec)
|
||||
+# ifdef USE_AS_STRNLEN
|
||||
|
||||
+L(last_4x_vec_or_less_load):
|
||||
+ /* Depending on entry adjust rdi / prepare first VEC in YMM1. */
|
||||
+ VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
|
||||
+L(last_4x_vec_or_less_cmpeq):
|
||||
+ VPCMP $0, %YMM1, %YMMZERO, %k0
|
||||
+ addq $(VEC_SIZE * 3), %rdi
|
||||
L(last_4x_vec_or_less):
|
||||
- /* Less than 4 * VEC and aligned to VEC_SIZE. */
|
||||
- addl $(VEC_SIZE * 2), %esi
|
||||
- jle L(last_2x_vec)
|
||||
-
|
||||
- VPCMP $0, (%rdi), %YMMZERO, %k0
|
||||
kmovd %k0, %eax
|
||||
+ /* If remaining length > VEC_SIZE * 2. This works if esi is off by
|
||||
+ VEC_SIZE * 4. */
|
||||
+ testl $(CHAR_PER_VEC * 2), %esi
|
||||
+ jnz L(last_4x_vec)
|
||||
+
|
||||
+ /* length may have been negative or positive by an offset of
|
||||
+ CHAR_PER_VEC * 4 depending on where this was called from. This
|
||||
+ fixes that. */
|
||||
+ andl $(CHAR_PER_VEC * 4 - 1), %esi
|
||||
testl %eax, %eax
|
||||
- jnz L(first_vec_x0)
|
||||
+ jnz L(last_vec_x1_check)
|
||||
|
||||
- VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0
|
||||
- kmovd %k0, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x1)
|
||||
+ /* Check the end of data. */
|
||||
+ subl $CHAR_PER_VEC, %esi
|
||||
+ jb L(max)
|
||||
|
||||
VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
|
||||
kmovd %k0, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x2_check)
|
||||
- subl $VEC_SIZE, %esi
|
||||
- jle L(max)
|
||||
+ tzcntl %eax, %eax
|
||||
+ /* Check the end of data. */
|
||||
+ cmpl %eax, %esi
|
||||
+ jb L(max)
|
||||
|
||||
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
|
||||
- kmovd %k0, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x3_check)
|
||||
+ subq %rdx, %rdi
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
||||
+ sarq $2, %rdi
|
||||
+# endif
|
||||
+ leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
|
||||
+ ret
|
||||
+L(max):
|
||||
movq %r8, %rax
|
||||
+ ret
|
||||
+# endif
|
||||
+
|
||||
+ /* Placed here in strnlen so that the jcc L(last_4x_vec_or_less)
|
||||
+ in the 4x VEC loop can use 2 byte encoding. */
|
||||
+ .p2align 4
|
||||
+L(second_vec_return):
|
||||
+ VPCMP $0, %YMM3, %YMMZERO, %k0
|
||||
+ /* Combine YMM3 matches (k0) with YMM4 matches (k1). */
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ kunpckbw %k0, %k1, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+ tzcntl %eax, %eax
|
||||
+# else
|
||||
+ kunpckdq %k0, %k1, %k0
|
||||
+ kmovq %k0, %rax
|
||||
+ tzcntq %rax, %rax
|
||||
+# endif
|
||||
+ leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
|
||||
+ ret
|
||||
+
|
||||
+
|
||||
+# ifdef USE_AS_STRNLEN
|
||||
+L(last_vec_x1_check):
|
||||
+ tzcntl %eax, %eax
|
||||
+ /* Check the end of data. */
|
||||
+ cmpl %eax, %esi
|
||||
+ jb L(max)
|
||||
+ subq %rdx, %rdi
|
||||
# ifdef USE_AS_WCSLEN
|
||||
- shrq $2, %rax
|
||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
||||
+ sarq $2, %rdi
|
||||
# endif
|
||||
+ leaq (CHAR_PER_VEC)(%rdi, %rax), %rax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
-L(last_2x_vec):
|
||||
- addl $(VEC_SIZE * 2), %esi
|
||||
+L(last_4x_vec):
|
||||
+ /* Test first 2x VEC normally. */
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(last_vec_x1)
|
||||
|
||||
- VPCMP $0, (%rdi), %YMMZERO, %k0
|
||||
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
|
||||
kmovd %k0, %eax
|
||||
testl %eax, %eax
|
||||
- jnz L(first_vec_x0_check)
|
||||
- subl $VEC_SIZE, %esi
|
||||
- jle L(max)
|
||||
+ jnz L(last_vec_x2)
|
||||
|
||||
- VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0
|
||||
+ /* Normalize length. */
|
||||
+ andl $(CHAR_PER_VEC * 4 - 1), %esi
|
||||
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
|
||||
kmovd %k0, %eax
|
||||
testl %eax, %eax
|
||||
- jnz L(first_vec_x1_check)
|
||||
- movq %r8, %rax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- shrq $2, %rax
|
||||
-# endif
|
||||
- ret
|
||||
+ jnz L(last_vec_x3)
|
||||
|
||||
- .p2align 4
|
||||
-L(first_vec_x0_check):
|
||||
+ /* Check the end of data. */
|
||||
+ subl $(CHAR_PER_VEC * 3), %esi
|
||||
+ jb L(max)
|
||||
+
|
||||
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
|
||||
+ kmovd %k0, %eax
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- sall $2, %eax
|
||||
-# endif
|
||||
/* Check the end of data. */
|
||||
- cmpq %rax, %rsi
|
||||
- jbe L(max)
|
||||
- addq %rdi, %rax
|
||||
- subq %rdx, %rax
|
||||
+ cmpl %eax, %esi
|
||||
+ jb L(max_end)
|
||||
+
|
||||
+ subq %rdx, %rdi
|
||||
# ifdef USE_AS_WCSLEN
|
||||
- shrq $2, %rax
|
||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
||||
+ sarq $2, %rdi
|
||||
# endif
|
||||
+ leaq (CHAR_PER_VEC * 4)(%rdi, %rax), %rax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
-L(first_vec_x1_check):
|
||||
+L(last_vec_x1):
|
||||
tzcntl %eax, %eax
|
||||
+ subq %rdx, %rdi
|
||||
# ifdef USE_AS_WCSLEN
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- sall $2, %eax
|
||||
-# endif
|
||||
- /* Check the end of data. */
|
||||
- cmpq %rax, %rsi
|
||||
- jbe L(max)
|
||||
- addq $VEC_SIZE, %rax
|
||||
- addq %rdi, %rax
|
||||
- subq %rdx, %rax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- shrq $2, %rax
|
||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
||||
+ sarq $2, %rdi
|
||||
# endif
|
||||
+ leaq (CHAR_PER_VEC)(%rdi, %rax), %rax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
-L(first_vec_x2_check):
|
||||
+L(last_vec_x2):
|
||||
tzcntl %eax, %eax
|
||||
+ subq %rdx, %rdi
|
||||
# ifdef USE_AS_WCSLEN
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- sall $2, %eax
|
||||
-# endif
|
||||
- /* Check the end of data. */
|
||||
- cmpq %rax, %rsi
|
||||
- jbe L(max)
|
||||
- addq $(VEC_SIZE * 2), %rax
|
||||
- addq %rdi, %rax
|
||||
- subq %rdx, %rax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- shrq $2, %rax
|
||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
||||
+ sarq $2, %rdi
|
||||
# endif
|
||||
+ leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
-L(first_vec_x3_check):
|
||||
+L(last_vec_x3):
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- sall $2, %eax
|
||||
-# endif
|
||||
+ subl $(CHAR_PER_VEC * 2), %esi
|
||||
/* Check the end of data. */
|
||||
- cmpq %rax, %rsi
|
||||
- jbe L(max)
|
||||
- addq $(VEC_SIZE * 3), %rax
|
||||
- addq %rdi, %rax
|
||||
- subq %rdx, %rax
|
||||
+ cmpl %eax, %esi
|
||||
+ jb L(max_end)
|
||||
+ subq %rdx, %rdi
|
||||
# ifdef USE_AS_WCSLEN
|
||||
- shrq $2, %rax
|
||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
||||
+ sarq $2, %rdi
|
||||
# endif
|
||||
+ leaq (CHAR_PER_VEC * 3)(%rdi, %rax), %rax
|
||||
ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(max):
|
||||
+L(max_end):
|
||||
movq %r8, %rax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- shrq $2, %rax
|
||||
-# endif
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(zero):
|
||||
- xorl %eax, %eax
|
||||
ret
|
||||
# endif
|
||||
|
||||
+ /* Cold case for crossing page with first load. */
|
||||
.p2align 4
|
||||
-L(first_vec_x0):
|
||||
- tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- sall $2, %eax
|
||||
-# endif
|
||||
- addq %rdi, %rax
|
||||
- subq %rdx, %rax
|
||||
+L(cross_page_boundary):
|
||||
+ movq %rdi, %rdx
|
||||
+ /* Align data to VEC_SIZE. */
|
||||
+ andq $-VEC_SIZE, %rdi
|
||||
+ VPCMP $0, (%rdi), %YMMZERO, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+ /* Remove the leading bytes. */
|
||||
# ifdef USE_AS_WCSLEN
|
||||
- shrq $2, %rax
|
||||
+ /* NB: Divide shift count by 4 since each bit in K0 represent 4
|
||||
+ bytes. */
|
||||
+ movl %edx, %ecx
|
||||
+ shrl $2, %ecx
|
||||
+ andl $(CHAR_PER_VEC - 1), %ecx
|
||||
# endif
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(first_vec_x1):
|
||||
+ /* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise. */
|
||||
+ sarxl %SHIFT_REG, %eax, %eax
|
||||
+ testl %eax, %eax
|
||||
+# ifndef USE_AS_STRNLEN
|
||||
+ jz L(cross_page_continue)
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- sall $2, %eax
|
||||
-# endif
|
||||
- addq $VEC_SIZE, %rax
|
||||
- addq %rdi, %rax
|
||||
- subq %rdx, %rax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- shrq $2, %rax
|
||||
-# endif
|
||||
ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(first_vec_x2):
|
||||
- tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- sall $2, %eax
|
||||
-# endif
|
||||
- addq $(VEC_SIZE * 2), %rax
|
||||
- addq %rdi, %rax
|
||||
- subq %rdx, %rax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- shrq $2, %rax
|
||||
-# endif
|
||||
+# else
|
||||
+ jnz L(cross_page_less_vec)
|
||||
+# ifndef USE_AS_WCSLEN
|
||||
+ movl %edx, %ecx
|
||||
+ andl $(CHAR_PER_VEC - 1), %ecx
|
||||
+# endif
|
||||
+ movl $CHAR_PER_VEC, %eax
|
||||
+ subl %ecx, %eax
|
||||
+ /* Check the end of data. */
|
||||
+ cmpq %rax, %rsi
|
||||
+ ja L(cross_page_continue)
|
||||
+ movl %esi, %eax
|
||||
ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(4x_vec_end):
|
||||
- VPCMP $0, %YMM1, %YMMZERO, %k0
|
||||
- kmovd %k0, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x0)
|
||||
- VPCMP $0, %YMM2, %YMMZERO, %k1
|
||||
- kmovd %k1, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x1)
|
||||
- VPCMP $0, %YMM3, %YMMZERO, %k2
|
||||
- kmovd %k2, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x2)
|
||||
- VPCMP $0, %YMM4, %YMMZERO, %k3
|
||||
- kmovd %k3, %eax
|
||||
-L(first_vec_x3):
|
||||
+L(cross_page_less_vec):
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- sall $2, %eax
|
||||
-# endif
|
||||
- addq $(VEC_SIZE * 3), %rax
|
||||
- addq %rdi, %rax
|
||||
- subq %rdx, %rax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- shrq $2, %rax
|
||||
-# endif
|
||||
+ /* Select min of length and position of first null. */
|
||||
+ cmpq %rax, %rsi
|
||||
+ cmovb %esi, %eax
|
||||
ret
|
||||
+# endif
|
||||
|
||||
END (STRLEN)
|
||||
#endif
|
||||
--
|
||||
GitLab
|
||||
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Reference in a new issue