Compare commits

...

No commits in common. "glibc-2.28-251.0.1.an8.2" and "a23.1" have entirely different histories.

1060 changed files with 15194 additions and 268687 deletions

@@ -1,36 +1,40 @@
commit bd77dd7e73e3530203be1c52c8a29d08270cb25d
Author: Florian Weimer <fweimer@redhat.com>
Date: Wed Sep 13 14:10:56 2023 +0200
From 4ea972b7edd7e36610e8cde18bf7a8149d7bac4f Mon Sep 17 00:00:00 2001
From: Florian Weimer <fweimer@redhat.com>
Date: Wed, 13 Sep 2023 14:10:56 +0200
Subject: [PATCH] CVE-2023-4527: Stack read overflow with large TCP responses
in no-aaaa mode
Without passing alt_dns_packet_buffer, __res_context_search can only
store 2048 bytes (what fits into dns_packet_buffer). However,
the function returns the total packet size, and the subsequent
DNS parsing code in _nss_dns_gethostbyname4_r reads beyond the end
of the stack-allocated buffer.
Fixes commit f282cdbe7f436c75864e5640a4 ("resolv: Implement no-aaaa
stub resolver option") and bug 30842.
Conflicts:
resolv/nss_dns/dns-host.c
(missing dns_packet_buffer cleanup downstream)
(cherry picked from commit bd77dd7e73e3530203be1c52c8a29d08270cb25d)
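In outline, the bug looks like this (a minimal editorial sketch with a hypothetical get_response helper, not the actual glibc code):

#include <stdio.h>

/* Stand-in for __res_context_search: it can fill at most `size' bytes,
   but it returns the TOTAL response size, which may be much larger.  */
static int
get_response (unsigned char *buf, int size)
{
  (void) buf;
  (void) size;
  return 65000;   /* pretend a large TCP response arrived */
}

int
main (void)
{
  unsigned char buf[2048];   /* plays the role of dns_packet_buffer */
  int n = get_response (buf, sizeof buf);
  /* Buggy pattern: treating `n' as the number of valid bytes in `buf'
     lets the parser read far past the 2048-byte stack buffer.  The fix
     passes alt_dns_packet_buffer so oversized responses are stored in
     full before parsing.  */
  printf ("parser would read %d bytes from a %zu-byte buffer\n",
          n, sizeof buf);
  return 0;
}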
---
resolv/Makefile | 2 +
resolv/nss_dns/dns-host.c | 2 +-
resolv/tst-resolv-noaaaa-vc.c | 129 ++++++++++++++++++++++++++++++++++
4 files changed, 139 insertions(+), 1 deletion(-)
create mode 100644 resolv/tst-resolv-noaaaa-vc.c
diff --git a/resolv/Makefile b/resolv/Makefile
index ab8ad49b5318ad41..4f4eaf060443c128 100644
index f8a92c6cff..28cedf49ee 100644
--- a/resolv/Makefile
+++ b/resolv/Makefile
@@ -58,6 +58,7 @@ tests += \
tst-resolv-edns \
@@ -101,6 +101,7 @@ tests += \
tst-resolv-invalid-cname \
tst-resolv-network \
tst-resolv-noaaaa \
+ tst-resolv-noaaaa-vc \
tst-resolv-nondecimal \
tst-resolv-res_init-multi \
tst-resolv-search \
@@ -202,6 +203,7 @@ $(objpfx)tst-resolv-res_init-multi: $(objpfx)libresolv.so \
$(objpfx)tst-resolv-res_init-thread: $(libdl) $(objpfx)libresolv.so \
@@ -291,6 +292,7 @@ $(objpfx)tst-resolv-res_init-thread: $(objpfx)libresolv.so \
$(objpfx)tst-resolv-invalid-cname: $(objpfx)libresolv.so \
$(shared-thread-library)
$(objpfx)tst-resolv-noaaaa: $(objpfx)libresolv.so $(shared-thread-library)
+$(objpfx)tst-resolv-noaaaa-vc: $(objpfx)libresolv.so $(shared-thread-library)
@@ -38,21 +42,21 @@ index ab8ad49b5318ad41..4f4eaf060443c128 100644
$(objpfx)tst-resolv-qtypes: $(objpfx)libresolv.so $(shared-thread-library)
$(objpfx)tst-resolv-rotate: $(objpfx)libresolv.so $(shared-thread-library)
diff --git a/resolv/nss_dns/dns-host.c b/resolv/nss_dns/dns-host.c
index ff0a0b6f7f1f4703..f678c7d7caa3a026 100644
index 9fa81f23c8..227734da5c 100644
--- a/resolv/nss_dns/dns-host.c
+++ b/resolv/nss_dns/dns-host.c
@@ -392,7 +392,7 @@ _nss_dns_gethostbyname4_r (const char *name, struct gaih_addrtuple **pat,
else
@@ -427,7 +427,7 @@ _nss_dns_gethostbyname4_r (const char *name, struct gaih_addrtuple **pat,
{
n = __res_context_search (ctx, name, C_IN, T_A,
- host_buffer.buf->buf, 2048, NULL,
+ host_buffer.buf->buf, 2048, &host_buffer.ptr,
NULL, NULL, NULL, NULL);
dns_packet_buffer, sizeof (dns_packet_buffer),
- NULL, NULL, NULL, NULL, NULL);
+ &alt_dns_packet_buffer, NULL, NULL, NULL, NULL);
if (n >= 0)
status = gaih_getanswer_noaaaa (host_buffer.buf, n,
status = gaih_getanswer_noaaaa (alt_dns_packet_buffer, n,
&abuf, pat, errnop, herrnop, ttlp);
diff --git a/resolv/tst-resolv-noaaaa-vc.c b/resolv/tst-resolv-noaaaa-vc.c
new file mode 100644
index 0000000000000000..9f5aebd99f2d74a2
index 0000000000..9f5aebd99f
--- /dev/null
+++ b/resolv/tst-resolv-noaaaa-vc.c
@@ -0,0 +1,129 @@
@@ -185,3 +189,6 @@ index 0000000000000000..9f5aebd99f2d74a2
+}
+
+#include <support/test-driver.c>
--
2.39.3

@@ -1,4 +1,8 @@
Avoid UAF in getcanonname (CVE-2023-4806)
From a9728f798ec7f05454c95637ee6581afaa9b487d Mon Sep 17 00:00:00 2001
From: Siddhesh Poyarekar <siddhesh@sourceware.org>
Date: Fri, 15 Sep 2023 13:51:12 -0400
Subject: [PATCH] getaddrinfo: Fix use after free in getcanonname
(CVE-2023-4806)
When an NSS plugin only implements the _gethostbyname2_r and
_getcanonname_r callbacks, getaddrinfo could use memory that was freed
@@ -17,60 +21,39 @@ reference in res->at->name. This then gets dereferenced in the
getcanonname_r plugin call, resulting in the use after free.
Fix this by copying h_name over and freeing it at the end. This
resolves BZ #30843, which is assigned CVE-2023-4806. This is a minimal
RHEL-8-specific fix. Test case differences from upstream:
resolves BZ #30843, which is assigned CVE-2023-4806.
- The test module needs to explicitly link against libnss_files on
RHEL-8; upstream libnss_files is built into libc.so.
- Test module code was adapted to not use the upstream NSS module
convenience macros.
This change is adapted from the following commit from upstream:
commit 973fe93a5675c42798b2161c6f29c01b0e243994
Author: Siddhesh Poyarekar <siddhesh@sourceware.org>
Date: Fri Sep 15 13:51:12 2023 -0400
getaddrinfo: Fix use after free in getcanonname (CVE-2023-4806)
When an NSS plugin only implements the _gethostbyname2_r and
_getcanonname_r callbacks, getaddrinfo could use memory that was freed
during tmpbuf resizing, through h_name in a previous query response.
The backing store for res->at->name when doing a query with
gethostbyname3_r or gethostbyname2_r is tmpbuf, which is reallocated in
gethosts during the query. For AF_INET6 lookup with AI_ALL |
AI_V4MAPPED, gethosts gets called twice, once for a v6 lookup and second
for a v4 lookup. In this case, if the first call reallocates tmpbuf
enough number of times, resulting in a malloc, th->h_name (that
res->at->name refers to) ends up on a heap allocated storage in tmpbuf.
Now if the second call to gethosts also causes the plugin callback to
return NSS_STATUS_TRYAGAIN, tmpbuf will get freed, resulting in a UAF
reference in res->at->name. This then gets dereferenced in the
getcanonname_r plugin call, resulting in the use after free.
Fix this by copying h_name over and freeing it at the end. This
resolves BZ #30843, which is assigned CVE-2023-4806.
Signed-off-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
(cherry picked from commit 973fe93a5675c42798b2161c6f29c01b0e243994)
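The lifetime problem in miniature (an editorial sketch with hypothetical variables, not the glibc code):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main (void)
{
  /* tmpbuf plays the role of gethosts's scratch buffer; `name' plays
     res->at->name, which aliases storage inside tmpbuf.  */
  char *tmpbuf = strdup ("test.example.com");
  const char *name = tmpbuf;

  /* The fix: duplicate the string while its backing store is live.  */
  char *h_name = strdup (name);

  free (tmpbuf);   /* the second gethosts call may realloc/free tmpbuf */

  /* The buggy code dereferenced `name' here (use-after-free); the
     fixed code uses the stable copy and frees it at the end.  */
  puts (h_name);
  free (h_name);
  return 0;
}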
---
nss/Makefile | 15 ++++-
nss/nss_test_gai_hv2_canonname.c | 56 +++++++++++++++++
nss/tst-nss-gai-hv2-canonname.c | 63 +++++++++++++++++++
nss/tst-nss-gai-hv2-canonname.h | 1 +
.../postclean.req | 0
.../tst-nss-gai-hv2-canonname.script | 2 +
sysdeps/posix/getaddrinfo.c | 25 +++++---
7 files changed, 152 insertions(+), 10 deletions(-)
create mode 100644 nss/nss_test_gai_hv2_canonname.c
create mode 100644 nss/tst-nss-gai-hv2-canonname.c
create mode 100644 nss/tst-nss-gai-hv2-canonname.h
create mode 100644 nss/tst-nss-gai-hv2-canonname.root/postclean.req
create mode 100644 nss/tst-nss-gai-hv2-canonname.root/tst-nss-gai-hv2-canonname.script
diff --git a/nss/Makefile b/nss/Makefile
index cfb255c6e7a3a4de..5829a2539306ddb5 100644
index a978e3927a..f0af87e6f1 100644
--- a/nss/Makefile
+++ b/nss/Makefile
@@ -66,7 +66,8 @@ xtests = bug-erange
tests-container = \
tst-nss-db-endpwent \
tst-nss-db-endgrent \
- tst-nss-gai-actions
+ tst-nss-gai-actions \
+ tst-nss-gai-hv2-canonname
@@ -81,6 +81,7 @@ tests-container := \
tst-nss-test3 \
tst-reload1 \
tst-reload2 \
+ tst-nss-gai-hv2-canonname \
# tests-container
# Tests which need libdl
ifeq (yes,$(build-shared))
@@ -132,7 +133,8 @@ routines += $(libnss_files-routines)
static-only-routines += $(libnss_files-routines)
@@ -144,7 +145,8 @@ libnss_compat-inhibit-o = $(filter-out .os,$(object-suffixes))
ifeq ($(build-static-nss),yes)
tests-static += tst-nss-static
endif
-extra-test-objs += nss_test1.os nss_test2.os nss_test_errno.os
@@ -79,7 +62,7 @@ index cfb255c6e7a3a4de..5829a2539306ddb5 100644
include ../Rules
@@ -169,12 +171,17 @@ rtld-tests-LDFLAGS += -Wl,--dynamic-list=nss_test.ver
@@ -179,12 +181,16 @@ rtld-tests-LDFLAGS += -Wl,--dynamic-list=nss_test.ver
libof-nss_test1 = extramodules
libof-nss_test2 = extramodules
libof-nss_test_errno = extramodules
@@ -91,34 +74,38 @@ index cfb255c6e7a3a4de..5829a2539306ddb5 100644
$(objpfx)/libnss_test_errno.so: $(objpfx)nss_test_errno.os $(link-libc-deps)
$(build-module)
+$(objpfx)/libnss_test_gai_hv2_canonname.so: \
+ $(objpfx)nss_test_gai_hv2_canonname.os $(link-libc-deps) \
+ $(objpfx)/libnss_files.so
+ $(objpfx)nss_test_gai_hv2_canonname.os $(link-libc-deps)
+ $(build-module)
$(objpfx)nss_test2.os : nss_test1.c
ifdef libnss_test1.so-version
$(objpfx)/libnss_test1.so$(libnss_test1.so-version): $(objpfx)/libnss_test1.so
@@ -187,10 +194,14 @@ endif
# Use the nss_files suffix for these objects as well.
$(objpfx)/libnss_test1.so$(libnss_files.so-version): $(objpfx)/libnss_test1.so
@@ -194,10 +200,14 @@ $(objpfx)/libnss_test2.so$(libnss_files.so-version): $(objpfx)/libnss_test2.so
$(objpfx)/libnss_test_errno.so$(libnss_files.so-version): \
$(objpfx)/libnss_test_errno.so
$(make-link)
+$(objpfx)/libnss_test_gai_hv2_canonname.so$(libnss_files.so-version): \
+ $(objpfx)/libnss_test_gai_hv2_canonname.so
+ $(make-link)
$(patsubst %,$(objpfx)%.out,$(tests)) : \
$(objpfx)/libnss_test1.so$(libnss_test1.so-version) \
$(objpfx)/libnss_test2.so$(libnss_test2.so-version) \
$(patsubst %,$(objpfx)%.out,$(tests) $(tests-container)) : \
$(objpfx)/libnss_test1.so$(libnss_files.so-version) \
$(objpfx)/libnss_test2.so$(libnss_files.so-version) \
- $(objpfx)/libnss_test_errno.so$(libnss_files.so-version)
+ $(objpfx)/libnss_test_errno.so$(libnss_files.so-version) \
+ $(objpfx)/libnss_test_gai_hv2_canonname.so$(libnss_files.so-version)
ifeq (yes,$(have-thread-library))
$(objpfx)tst-cancel-getpwuid_r: $(shared-thread-library)
@@ -214,3 +224,4 @@ LDFLAGS-tst-nss-test3 = -Wl,--disable-new-dtags
LDFLAGS-tst-nss-test4 = -Wl,--disable-new-dtags
LDFLAGS-tst-nss-test5 = -Wl,--disable-new-dtags
LDFLAGS-tst-nss-test_errno = -Wl,--disable-new-dtags
+LDFLAGS-tst-nss-test_gai_hv2_canonname = -Wl,--disable-new-dtags
diff --git a/nss/nss_test_gai_hv2_canonname.c b/nss/nss_test_gai_hv2_canonname.c
new file mode 100644
index 0000000000000000..4195d7d24fdd5f6d
index 0000000000..4439c83c9f
--- /dev/null
+++ b/nss/nss_test_gai_hv2_canonname.c
@@ -0,0 +1,64 @@
@@ -0,0 +1,56 @@
+/* NSS service provider that only provides gethostbyname2_r.
+ Copyright The GNU Toolchain Authors.
+ This file is part of the GNU C Library.
@@ -137,7 +124,6 @@ index 0000000000000000..4195d7d24fdd5f6d
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <netdb.h>
+#include <nss.h>
+#include <stdlib.h>
+#include <string.h>
@@ -145,20 +131,13 @@ index 0000000000000000..4195d7d24fdd5f6d
+
+/* Catch misnamed and functions. */
+#pragma GCC diagnostic error "-Wmissing-prototypes"
+NSS_DECLARE_MODULE_FUNCTIONS (test_gai_hv2_canonname)
+
+extern enum nss_status _nss_files_gethostbyname2_r (const char *, int,
+ struct hostent *, char *,
+ size_t, int *, int *);
+
+enum nss_status
+_nss_test_gai_hv2_canonname_gethostbyname2_r (const char *, int, struct hostent
+ *, char *, size_t, int *, int *);
+
+enum nss_status
+_nss_test_gai_hv2_canonname_getcanonname_r (const char *, char *, size_t, char
+ **, int *, int *);
+
+enum nss_status
+_nss_test_gai_hv2_canonname_gethostbyname2_r (const char *name, int af,
+ struct hostent *result,
+ char *buffer, size_t buflen,
@@ -185,7 +164,7 @@ index 0000000000000000..4195d7d24fdd5f6d
+}
diff --git a/nss/tst-nss-gai-hv2-canonname.c b/nss/tst-nss-gai-hv2-canonname.c
new file mode 100644
index 0000000000000000..d5f10c07d6a90773
index 0000000000..d5f10c07d6
--- /dev/null
+++ b/nss/tst-nss-gai-hv2-canonname.c
@@ -0,0 +1,63 @@
@@ -254,61 +233,86 @@ index 0000000000000000..d5f10c07d6a90773
+#include <support/test-driver.c>
diff --git a/nss/tst-nss-gai-hv2-canonname.h b/nss/tst-nss-gai-hv2-canonname.h
new file mode 100644
index 0000000000000000..14f2a9cb0867dff9
index 0000000000..14f2a9cb08
--- /dev/null
+++ b/nss/tst-nss-gai-hv2-canonname.h
@@ -0,0 +1 @@
+#define QUERYNAME "test.example.com"
diff --git a/nss/tst-nss-gai-hv2-canonname.root/postclean.req b/nss/tst-nss-gai-hv2-canonname.root/postclean.req
new file mode 100644
index 0000000000000000..e69de29bb2d1d643
index 0000000000..e69de29bb2
diff --git a/nss/tst-nss-gai-hv2-canonname.root/tst-nss-gai-hv2-canonname.script b/nss/tst-nss-gai-hv2-canonname.root/tst-nss-gai-hv2-canonname.script
new file mode 100644
index 0000000000000000..31848b4a28524af6
index 0000000000..31848b4a28
--- /dev/null
+++ b/nss/tst-nss-gai-hv2-canonname.root/tst-nss-gai-hv2-canonname.script
@@ -0,0 +1,2 @@
+cp $B/nss/libnss_test_gai_hv2_canonname.so $L/libnss_test_gai_hv2_canonname.so.2
+su
diff --git a/sysdeps/posix/getaddrinfo.c b/sysdeps/posix/getaddrinfo.c
index 4fa963644af8b7d5..46046504a6858f2e 100644
index 5cda9bb072..7a43a3bf4c 100644
--- a/sysdeps/posix/getaddrinfo.c
+++ b/sysdeps/posix/getaddrinfo.c
@@ -233,7 +233,6 @@ convert_hostent_to_gaih_addrtuple (const struct addrinfo *req,
@@ -120,6 +120,7 @@ struct gaih_result
{
struct gaih_addrtuple *at;
char *canon;
+ char *h_name;
bool free_at;
bool got_ipv6;
};
@@ -165,6 +166,7 @@ gaih_result_reset (struct gaih_result *res)
if (res->free_at)
free (res->at);
free (res->canon);
+ free (res->h_name);
memset (res, 0, sizeof (*res));
}
@@ -203,9 +205,8 @@ gaih_inet_serv (const char *servicename, const struct gaih_typeproto *tp,
return 0;
}
-/* Convert struct hostent to a list of struct gaih_addrtuple objects. h_name
- is not copied, and the struct hostent object must not be deallocated
- prematurely. The new addresses are appended to the tuple array in RES. */
+/* Convert struct hostent to a list of struct gaih_addrtuple objects. The new
+ addresses are appended to the tuple array in RES. */
static bool
convert_hostent_to_gaih_addrtuple (const struct addrinfo *req, int family,
struct hostent *h, struct gaih_result *res)
@@ -238,6 +239,15 @@ convert_hostent_to_gaih_addrtuple (const struct addrinfo *req, int family,
res->at = array;
res->free_at = true;
+ /* Duplicate h_name because it may get reclaimed when the underlying storage
+ is freed. */
+ if (res->h_name == NULL)
+ {
+ res->h_name = __strdup (h->h_name);
+ if (res->h_name == NULL)
+ return false;
+ }
+
/* Update the next pointers on reallocation. */
for (size_t i = 0; i < old; i++)
array[i].next = array + i + 1;
@@ -262,7 +272,6 @@ convert_hostent_to_gaih_addrtuple (const struct addrinfo *req, int family,
}
array[i].next = array + i + 1;
}
- array[0].name = h->h_name;
array[count - 1].next = NULL;
*result = array;
@@ -287,6 +286,18 @@ convert_hostent_to_gaih_addrtuple (const struct addrinfo *req,
} \
*pat = addrmem; \
\
+ /* Store h_name so that it survives accidental deallocation when \
+ gethosts is called again and tmpbuf gets reallocated. */ \
+ if (h_name == NULL && th.h_name != NULL) \
+ { \
+ h_name = __strdup (th.h_name); \
+ if (h_name == NULL) \
+ { \
+ __resolv_context_put (res_ctx); \
+ result = -EAI_SYSTEM; \
+ goto free_and_return; \
+ } \
+ } \
if (localcanon != NULL && canon == NULL) \
{ \
canonbuf = __strdup (localcanon); \
@@ -323,15 +334,15 @@ typedef enum nss_status (*nss_getcanonname_r)
return true;
@@ -324,15 +333,15 @@ gethosts (nss_gethostbyname3_r fct, int family, const char *name,
memory allocation failure. The returned string is allocated on the
heap; the caller has to free it. */
static char *
-getcanonname (service_user *nip, struct gaih_addrtuple *at, const char *name)
+getcanonname (service_user *nip, const char *hname, const char *name)
-getcanonname (nss_action_list nip, struct gaih_addrtuple *at, const char *name)
+getcanonname (nss_action_list nip, const char *hname, const char *name)
{
nss_getcanonname_r cfct = __nss_lookup_function (nip, "getcanonname_r");
nss_getcanonname_r *cfct = __nss_lookup_function (nip, "getcanonname_r");
char *s = (char *) name;
if (cfct != NULL)
{
@ -320,28 +324,15 @@ index 4fa963644af8b7d5..46046504a6858f2e 100644
/* If the canonical name cannot be determined, use the passed
string. */
s = (char *) name;
@@ -349,6 +360,7 @@ gaih_inet (const char *name, const struct gaih_service *service,
struct gaih_addrtuple *at = NULL;
bool got_ipv6 = false;
const char *canon = NULL;
+ char *h_name = NULL;
const char *orig_name = name;
/* Reserve stack memory for the scratch buffer in the getaddrinfo
@@ -919,7 +931,7 @@ gaih_inet (const char *name, const struct gaih_service *service,
if ((req->ai_flags & AI_CANONNAME) != 0
&& canon == NULL)
{
- canonbuf = getcanonname (nip, at, name);
+ canonbuf = getcanonname (nip, h_name, name);
if (canonbuf == NULL)
{
__resolv_context_enable_inet6
@@ -1169,6 +1181,7 @@ gaih_inet (const char *name, const struct gaih_service *service,
free ((char *) name);
free (addrmem);
free (canonbuf);
+ free (h_name);
return result;
}
@@ -771,7 +780,7 @@ get_nss_addresses (const char *name, const struct addrinfo *req,
if ((req->ai_flags & AI_CANONNAME) != 0
&& res->canon == NULL)
{
- char *canonbuf = getcanonname (nip, res->at, name);
+ char *canonbuf = getcanonname (nip, res->h_name, name);
if (canonbuf == NULL)
{
__resolv_context_put (res_ctx);
--
2.39.3

0085-CVE-2023-5156.patch (new file)

@@ -0,0 +1,98 @@
From 856bac55f98dc840e7c27cfa82262b933385de90 Mon Sep 17 00:00:00 2001
From: Romain Geissler <romain.geissler@amadeus.com>
Date: Mon, 25 Sep 2023 01:21:51 +0100
Subject: [PATCH] Fix leak in getaddrinfo introduced by the fix for
CVE-2023-4806 [BZ #30843]
This patch fixes a very recently added leak in getaddrinfo.
This was assigned CVE-2023-5156.
Resolves: BZ #30884
Related: BZ #30842
Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
(cherry picked from commit ec6b95c3303c700eb89eebeda2d7264cc184a796)
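The regression test relies on glibc's mtrace facility. A minimal standalone version of that pattern (hypothetical log path; on glibc 2.34 and later the program must also be started with LD_PRELOAD=libc_malloc_debug.so for the hooks to take effect):

#include <mcheck.h>
#include <stdlib.h>

int
main (void)
{
  /* MALLOC_TRACE must name a writable log file before mtrace() runs. */
  setenv ("MALLOC_TRACE", "/tmp/gai.mtrace", 1);
  mtrace ();                     /* start logging allocations */
  void *leak = malloc (64);      /* deliberately never freed */
  (void) leak;
  muntrace ();                   /* stop logging */
  return 0;
}

The log is then post-processed with the mtrace perl script, e.g. `mtrace ./a.out /tmp/gai.mtrace', which is what the mtrace-tst-nss-gai-hv2-canonname.out rule in the Makefile hunk below automates.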
---
nss/Makefile | 20 ++++++++++++++++++++
nss/tst-nss-gai-hv2-canonname.c | 3 +++
sysdeps/posix/getaddrinfo.c | 4 +---
3 files changed, 24 insertions(+), 3 deletions(-)
diff --git a/nss/Makefile b/nss/Makefile
index f0af87e6f1..7a52c68791 100644
--- a/nss/Makefile
+++ b/nss/Makefile
@@ -148,6 +148,15 @@ endif
extra-test-objs += nss_test1.os nss_test2.os nss_test_errno.os \
nss_test_gai_hv2_canonname.os
+ifeq ($(run-built-tests),yes)
+ifneq (no,$(PERL))
+tests-special += $(objpfx)mtrace-tst-nss-gai-hv2-canonname.out
+endif
+endif
+
+generated += mtrace-tst-nss-gai-hv2-canonname.out \
+ tst-nss-gai-hv2-canonname.mtrace
+
include ../Rules
ifeq (yes,$(have-selinux))
@@ -216,6 +225,17 @@ endif
$(objpfx)tst-nss-files-alias-leak.out: $(objpfx)/libnss_files.so
$(objpfx)tst-nss-files-alias-truncated.out: $(objpfx)/libnss_files.so
+tst-nss-gai-hv2-canonname-ENV = \
+ MALLOC_TRACE=$(objpfx)tst-nss-gai-hv2-canonname.mtrace \
+ LD_PRELOAD=$(common-objpfx)/malloc/libc_malloc_debug.so
+$(objpfx)mtrace-tst-nss-gai-hv2-canonname.out: \
+ $(objpfx)tst-nss-gai-hv2-canonname.out
+ { test -r $(objpfx)tst-nss-gai-hv2-canonname.mtrace \
+ || ( echo "tst-nss-gai-hv2-canonname.mtrace does not exist"; exit 77; ) \
+ && $(common-objpfx)malloc/mtrace \
+ $(objpfx)tst-nss-gai-hv2-canonname.mtrace; } > $@; \
+ $(evaluate-test)
+
# Disable DT_RUNPATH on NSS tests so that the glibc internal NSS
# functions can load testing NSS modules via DT_RPATH.
LDFLAGS-tst-nss-test1 = -Wl,--disable-new-dtags
diff --git a/nss/tst-nss-gai-hv2-canonname.c b/nss/tst-nss-gai-hv2-canonname.c
index d5f10c07d6..7db53cf09d 100644
--- a/nss/tst-nss-gai-hv2-canonname.c
+++ b/nss/tst-nss-gai-hv2-canonname.c
@@ -21,6 +21,7 @@
#include <netdb.h>
#include <stdlib.h>
#include <string.h>
+#include <mcheck.h>
#include <support/check.h>
#include <support/xstdio.h>
#include "nss/tst-nss-gai-hv2-canonname.h"
@@ -41,6 +42,8 @@ static void do_prepare (int a, char **av)
static int
do_test (void)
{
+ mtrace ();
+
__nss_configure_lookup ("hosts", "test_gai_hv2_canonname");
struct addrinfo hints = {};
diff --git a/sysdeps/posix/getaddrinfo.c b/sysdeps/posix/getaddrinfo.c
index 7a43a3bf4c..f975dcd2bc 100644
--- a/sysdeps/posix/getaddrinfo.c
+++ b/sysdeps/posix/getaddrinfo.c
@@ -1196,9 +1196,7 @@ free_and_return:
if (malloc_name)
free ((char *) name);
free (addrmem);
- if (res.free_at)
- free (res.at);
- free (res.canon);
+ gaih_result_reset (&res);
return result;
}
--
2.39.3

0087-CVE-2023-6246.patch (new file)

@@ -0,0 +1,181 @@
From d1a83b6767f68b3cb5b4b4ea2617254acd040c82 Mon Sep 17 00:00:00 2001
From: Arjun Shankar <arjun@redhat.com>
Date: Mon, 15 Jan 2024 17:44:43 +0100
Subject: [PATCH] syslog: Fix heap buffer overflow in __vsyslog_internal
(CVE-2023-6246)
__vsyslog_internal did not handle a case where printing a SYSLOG_HEADER
containing a long program name failed to update the required buffer
size, leading to the allocation and overflow of a too-small buffer on
the heap. This commit fixes that. It also adds a new regression test
that uses glibc.malloc.check.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
Tested-by: Carlos O'Donell <carlos@redhat.com>
(cherry picked from commit 6bd0e4efcc78f3c0115e5ea9739a1642807450da)
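For reference, the C99 snprintf contract the fix depends on: the return value is the length the complete string would have required, not the number of bytes actually written, so l >= sizeof bufs signals truncation and yields the needed heap size. A small self-contained illustration:

#include <stdio.h>

int
main (void)
{
  char bufs[16];
  int l = snprintf (bufs, sizeof bufs, "%s: ", "averylongprogname");
  if (l >= 0 && (size_t) l >= sizeof bufs)
    /* Truncated: a heap buffer of l + 1 bytes would hold it all.  */
    printf ("need %d bytes, have %zu\n", l + 1, sizeof bufs);
  return 0;
}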
---
misc/Makefile | 8 ++-
misc/syslog.c | 50 +++++++++++++------
misc/tst-syslog-long-progname.c | 39 +++++++++++++++
.../postclean.req | 0
4 files changed, 82 insertions(+), 15 deletions(-)
create mode 100644 misc/tst-syslog-long-progname.c
create mode 100644 misc/tst-syslog-long-progname.root/postclean.req
diff --git a/misc/Makefile b/misc/Makefile
index ba8232a0e9..66e9ded8f9 100644
--- a/misc/Makefile
+++ b/misc/Makefile
@@ -115,7 +115,10 @@ tests-special += $(objpfx)tst-error1-mem.out \
$(objpfx)tst-allocate_once-mem.out
endif
-tests-container := tst-syslog
+tests-container := \
+ tst-syslog \
+ tst-syslog-long-progname \
+ # tests-container
CFLAGS-select.c += -fexceptions -fasynchronous-unwind-tables
CFLAGS-tsearch.c += $(uses-callbacks)
@@ -175,6 +178,9 @@ $(objpfx)tst-allocate_once-mem.out: $(objpfx)tst-allocate_once.out
$(common-objpfx)malloc/mtrace $(objpfx)tst-allocate_once.mtrace > $@; \
$(evaluate-test)
+tst-syslog-long-progname-ENV = GLIBC_TUNABLES=glibc.malloc.check=3 \
+ LD_PRELOAD=libc_malloc_debug.so.0
+
$(objpfx)tst-select: $(librt)
$(objpfx)tst-select-time64: $(librt)
$(objpfx)tst-pselect: $(librt)
diff --git a/misc/syslog.c b/misc/syslog.c
index f67d4b58a4..fe1daf988b 100644
--- a/misc/syslog.c
+++ b/misc/syslog.c
@@ -122,8 +122,9 @@ __vsyslog_internal (int pri, const char *fmt, va_list ap,
{
/* Try to use a static buffer as an optimization. */
char bufs[1024];
- char *buf = NULL;
- size_t bufsize = 0;
+ char *buf = bufs;
+ size_t bufsize;
+
int msgoff;
int saved_errno = errno;
@@ -175,29 +176,50 @@ __vsyslog_internal (int pri, const char *fmt, va_list ap,
#define SYSLOG_HEADER_WITHOUT_TS(__pri, __msgoff) \
"<%d>: %n", __pri, __msgoff
- int l;
+ int l, vl;
if (has_ts)
l = __snprintf (bufs, sizeof bufs,
SYSLOG_HEADER (pri, timestamp, &msgoff, pid));
else
l = __snprintf (bufs, sizeof bufs,
SYSLOG_HEADER_WITHOUT_TS (pri, &msgoff));
+
+ char *pos;
+ size_t len;
+
if (0 <= l && l < sizeof bufs)
{
- va_list apc;
- va_copy (apc, ap);
+ /* At this point, there is still a chance that we can print the
+ remaining part of the log into bufs and use that. */
+ pos = bufs + l;
+ len = sizeof (bufs) - l;
+ }
+ else
+ {
+ buf = NULL;
+ /* We already know that bufs is too small to use for this log message.
+ The next vsnprintf into bufs is used only to calculate the total
+ required buffer length. We will discard bufs contents and allocate
+ an appropriately sized buffer later instead. */
+ pos = bufs;
+ len = sizeof (bufs);
+ }
- /* Restore errno for %m format. */
- __set_errno (saved_errno);
+ {
+ va_list apc;
+ va_copy (apc, ap);
- int vl = __vsnprintf_internal (bufs + l, sizeof bufs - l, fmt, apc,
- mode_flags);
- if (0 <= vl && vl < sizeof bufs - l)
- buf = bufs;
- bufsize = l + vl;
+ /* Restore errno for %m format. */
+ __set_errno (saved_errno);
- va_end (apc);
- }
+ vl = __vsnprintf_internal (pos, len, fmt, apc, mode_flags);
+
+ if (!(0 <= vl && vl < len))
+ buf = NULL;
+
+ bufsize = l + vl;
+ va_end (apc);
+ }
if (buf == NULL)
{
diff --git a/misc/tst-syslog-long-progname.c b/misc/tst-syslog-long-progname.c
new file mode 100644
index 0000000000..88f37a8a00
--- /dev/null
+++ b/misc/tst-syslog-long-progname.c
@@ -0,0 +1,39 @@
+/* Test heap buffer overflow in syslog with long __progname (CVE-2023-6246)
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <syslog.h>
+#include <string.h>
+
+extern char * __progname;
+
+static int
+do_test (void)
+{
+ char long_progname[2048];
+
+ memset (long_progname, 'X', sizeof (long_progname) - 1);
+ long_progname[sizeof (long_progname) - 1] = '\0';
+
+ __progname = long_progname;
+
+ syslog (LOG_INFO, "Hello, World!");
+
+ return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/misc/tst-syslog-long-progname.root/postclean.req b/misc/tst-syslog-long-progname.root/postclean.req
new file mode 100644
index 0000000000..e69de29bb2
--
2.39.3

0088-CVE-2023-6779.patch (new file)

@@ -0,0 +1,106 @@
From 2bc9d7c002bdac38b5c2a3f11b78e309d7765b83 Mon Sep 17 00:00:00 2001
From: Arjun Shankar <arjun@redhat.com>
Date: Mon, 15 Jan 2024 17:44:44 +0100
Subject: [PATCH] syslog: Fix heap buffer overflow in __vsyslog_internal
(CVE-2023-6779)
__vsyslog_internal used the return value of snprintf/vsnprintf to
calculate buffer sizes for memory allocation. If these functions (for
any reason) failed and returned -1, the resulting buffer would be too
small to hold output. This commit fixes that.
All snprintf/vsnprintf calls are checked for negative return values and
the function silently returns upon encountering them.
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
(cherry picked from commit 7e5a0c286da33159d47d0122007aac016f3e02cd)
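A compact sketch of the hardened pattern (hypothetical helper; the real change is in the __vsyslog_internal hunks below):

#include <stdarg.h>
#include <stdio.h>

/* Compute the buffer size a format would need; negative printf-family
   returns (e.g. encoding errors) must not leak into the arithmetic.  */
static int
needed_size (const char *fmt, ...)
{
  va_list ap;
  va_start (ap, fmt);
  int l = vsnprintf (NULL, 0, fmt, ap);
  va_end (ap);
  return l < 0 ? -1 : l + 1;   /* -1 tells the caller to bail out */
}

int
main (void)
{
  int n = needed_size ("<%d>: %s", 13, "message");
  if (n < 0)
    return 1;                  /* silently give up, as the fix does */
  printf ("%d bytes needed\n", n);
  return 0;
}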
---
misc/syslog.c | 39 ++++++++++++++++++++++++++++-----------
1 file changed, 28 insertions(+), 11 deletions(-)
diff --git a/misc/syslog.c b/misc/syslog.c
index fe1daf988b..3108ae9134 100644
--- a/misc/syslog.c
+++ b/misc/syslog.c
@@ -183,11 +183,13 @@ __vsyslog_internal (int pri, const char *fmt, va_list ap,
else
l = __snprintf (bufs, sizeof bufs,
SYSLOG_HEADER_WITHOUT_TS (pri, &msgoff));
+ if (l < 0)
+ goto out;
char *pos;
size_t len;
- if (0 <= l && l < sizeof bufs)
+ if (l < sizeof bufs)
{
/* At this point, there is still a chance that we can print the
remaining part of the log into bufs and use that. */
@@ -213,12 +215,15 @@ __vsyslog_internal (int pri, const char *fmt, va_list ap,
__set_errno (saved_errno);
vl = __vsnprintf_internal (pos, len, fmt, apc, mode_flags);
+ va_end (apc);
+
+ if (vl < 0)
+ goto out;
- if (!(0 <= vl && vl < len))
+ if (vl >= len)
buf = NULL;
bufsize = l + vl;
- va_end (apc);
}
if (buf == NULL)
@@ -229,25 +234,37 @@ __vsyslog_internal (int pri, const char *fmt, va_list ap,
/* Tell the cancellation handler to free this buffer. */
clarg.buf = buf;
+ int cl;
if (has_ts)
- __snprintf (buf, l + 1,
- SYSLOG_HEADER (pri, timestamp, &msgoff, pid));
+ cl = __snprintf (buf, l + 1,
+ SYSLOG_HEADER (pri, timestamp, &msgoff, pid));
else
- __snprintf (buf, l + 1,
- SYSLOG_HEADER_WITHOUT_TS (pri, &msgoff));
+ cl = __snprintf (buf, l + 1,
+ SYSLOG_HEADER_WITHOUT_TS (pri, &msgoff));
+ if (cl != l)
+ goto out;
va_list apc;
va_copy (apc, ap);
- __vsnprintf_internal (buf + l, bufsize - l + 1, fmt, apc,
- mode_flags);
+ cl = __vsnprintf_internal (buf + l, bufsize - l + 1, fmt, apc,
+ mode_flags);
va_end (apc);
+
+ if (cl != vl)
+ goto out;
}
else
{
+ int bl;
/* Nothing much to do but emit an error message. */
- bufsize = __snprintf (bufs, sizeof bufs,
- "out of memory[%d]", __getpid ());
+ bl = __snprintf (bufs, sizeof bufs,
+ "out of memory[%d]", __getpid ());
+ if (bl < 0 || bl >= sizeof bufs)
+ goto out;
+
+ bufsize = bl;
buf = bufs;
+ msgoff = 0;
}
}
--
2.39.3

0089-CVE-2023-6780.patch (new file)

@@ -0,0 +1,41 @@
From b9b7d6a27aa0632f334352fa400771115b3c69b7 Mon Sep 17 00:00:00 2001
From: Arjun Shankar <arjun@redhat.com>
Date: Mon, 15 Jan 2024 17:44:45 +0100
Subject: [PATCH] syslog: Fix integer overflow in __vsyslog_internal
(CVE-2023-6780)
__vsyslog_internal calculated a buffer size by adding two integers, but
did not first check if the addition would overflow. This commit fixes
that.
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
Tested-by: Carlos O'Donell <carlos@redhat.com>
(cherry picked from commit ddf542da94caf97ff43cc2875c88749880b7259b)
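The shape of the added check, as a self-contained illustration: reject l + vl before the signed addition can overflow.

#include <limits.h>
#include <stdio.h>

int
main (void)
{
  int l = 10;              /* header length */
  int vl = INT_MAX - 5;    /* pathological body length */
  /* Signed overflow is undefined behaviour, so test against INT_MAX
     first instead of computing l + vl and inspecting the result.  */
  if (vl < 0 || vl >= INT_MAX - l)
    puts ("rejected: l + vl would overflow");
  else
    printf ("bufsize = %d\n", l + vl);
  return 0;
}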
---
misc/syslog.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/misc/syslog.c b/misc/syslog.c
index 3108ae9134..9336036666 100644
--- a/misc/syslog.c
+++ b/misc/syslog.c
@@ -41,6 +41,7 @@ static char sccsid[] = "@(#)syslog.c 8.4 (Berkeley) 3/18/94";
#include <sys/uio.h>
#include <sys/un.h>
#include <syslog.h>
+#include <limits.h>
static int LogType = SOCK_DGRAM; /* type of socket connection */
static int LogFile = -1; /* fd for log */
@@ -217,7 +218,7 @@ __vsyslog_internal (int pri, const char *fmt, va_list ap,
vl = __vsnprintf_internal (pos, len, fmt, apc, mode_flags);
va_end (apc);
- if (vl < 0)
+ if (vl < 0 || vl >= INT_MAX - l)
goto out;
if (vl >= len)
--
2.39.3

@@ -1,34 +1,42 @@
Author: Charles Fol <folcharles@gmail.com>
Date: Thu Mar 28 12:25:38 2024 -0300
From f9dc609e06b1136bb0408be9605ce7973a767ada Mon Sep 17 00:00:00 2001
From: Charles Fol <folcharles@gmail.com>
Date: Thu, 28 Mar 2024 12:25:38 -0300
Subject: [PATCH] iconv: ISO-2022-CN-EXT: fix out-of-bound writes when writing
escape sequence (CVE-2024-2961)
ISO-2022-CN-EXT uses escape sequences to indicate character set changes
(as specified by RFC 1922). While the SOdesignation has the expected
bounds checks, neither SS2designation nor SS3designation have them,
allowing a write overflow of 1, 2, or 3 bytes with fixed values:
'$+I', '$+J', '$+K', '$+L', '$+M', or '$*H'.
Checked on aarch64-linux-gnu.
Co-authored-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
Tested-by: Carlos O'Donell <carlos@redhat.com>
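The added guard, in simplified form (an editorial sketch with a hypothetical signature; the real change is the macro hunks below):

/* Emit a 4-byte designation sequence, e.g. ESC '$' '*' 'H' or
   ESC '$' '+' 'I'..'M', only when the output buffer has room for it.  */
static int
emit_designation (unsigned char **outptr, unsigned char *outend,
                  char mid, char set)
{
  if (*outptr + 4 > outend)
    return -1;              /* maps to __GCONV_FULL_OUTPUT upstream */
  (*outptr)[0] = 0x1b;      /* ESC */
  (*outptr)[1] = '$';
  (*outptr)[2] = mid;       /* '*' selects SS2, '+' selects SS3 */
  (*outptr)[3] = set;
  *outptr += 4;
  return 0;
}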
---
iconvdata/Makefile | 5 +-
iconvdata/iso-2022-cn-ext.c | 12 +++
iconvdata/tst-iconv-iso-2022-cn-ext.c | 128 ++++++++++++++++++++++++++
3 files changed, 144 insertions(+), 1 deletion(-)
create mode 100644 iconvdata/tst-iconv-iso-2022-cn-ext.c
diff --git a/iconvdata/Makefile b/iconvdata/Makefile
index 646e2ccd11478646..c959758a90ed954f 100644
index ea019ce5c0..7196a8744b 100644
--- a/iconvdata/Makefile
+++ b/iconvdata/Makefile
@@ -75,7 +75,7 @@ ifeq (yes,$(build-shared))
@@ -75,7 +75,8 @@ ifeq (yes,$(build-shared))
tests = bug-iconv1 bug-iconv2 tst-loading tst-e2big tst-iconv4 bug-iconv4 \
tst-iconv6 bug-iconv5 bug-iconv6 tst-iconv7 bug-iconv8 bug-iconv9 \
bug-iconv10 bug-iconv11 bug-iconv12 bug-iconv13 bug-iconv14 \
- bug-iconv15
+ bug-iconv15 tst-iconv-iso-2022-cn-ext
bug-iconv10 bug-iconv11 bug-iconv12 tst-iconv-big5-hkscs-to-2ucs4 \
- bug-iconv13 bug-iconv14 bug-iconv15
+ bug-iconv13 bug-iconv14 bug-iconv15 \
+ tst-iconv-iso-2022-cn-ext
ifeq ($(have-thread-library),yes)
tests += bug-iconv3
endif
@@ -325,6 +325,8 @@ $(objpfx)bug-iconv14.out: $(addprefix $(objpfx), $(gconv-modules)) \
@@ -330,6 +331,8 @@ $(objpfx)bug-iconv14.out: $(addprefix $(objpfx), $(gconv-modules)) \
$(addprefix $(objpfx),$(modules.so))
$(objpfx)bug-iconv15.out: $(addprefix $(objpfx), $(gconv-modules)) \
$(addprefix $(objpfx),$(modules.so))
@@ -38,10 +46,10 @@ index 646e2ccd11478646..c959758a90ed954f 100644
$(objpfx)iconv-test.out: run-iconv-test.sh \
$(addprefix $(objpfx), $(gconv-modules)) \
diff --git a/iconvdata/iso-2022-cn-ext.c b/iconvdata/iso-2022-cn-ext.c
index c21a7187b4d7808e..bd9493c12d95070b 100644
index b34c8a36f4..cce29b1969 100644
--- a/iconvdata/iso-2022-cn-ext.c
+++ b/iconvdata/iso-2022-cn-ext.c
@@ -575,6 +575,12 @@ DIAG_IGNORE_Os_NEEDS_COMMENT (5, "-Wmaybe-uninitialized");
@@ -574,6 +574,12 @@ DIAG_IGNORE_Os_NEEDS_COMMENT (5, "-Wmaybe-uninitialized");
{ \
const char *escseq; \
\
@@ -54,7 +62,7 @@ index c21a7187b4d7808e..bd9493c12d95070b 100644
assert (used == CNS11643_2_set); /* XXX */ \
escseq = "*H"; \
*outptr++ = ESC; \
@@ -588,6 +594,12 @@ DIAG_IGNORE_Os_NEEDS_COMMENT (5, "-Wmaybe-uninitialized");
@@ -587,6 +593,12 @@ DIAG_IGNORE_Os_NEEDS_COMMENT (5, "-Wmaybe-uninitialized");
{ \
const char *escseq; \
\
@@ -69,7 +77,7 @@ index c21a7187b4d7808e..bd9493c12d95070b 100644
*outptr++ = ESC; \
diff --git a/iconvdata/tst-iconv-iso-2022-cn-ext.c b/iconvdata/tst-iconv-iso-2022-cn-ext.c
new file mode 100644
index 0000000000000000..96a8765fd5369681
index 0000000000..96a8765fd5
--- /dev/null
+++ b/iconvdata/tst-iconv-iso-2022-cn-ext.c
@@ -0,0 +1,128 @@
@@ -155,7 +163,7 @@ index 0000000000000000..96a8765fd5369681
+
+ /* Same as before for SS2designation. */
+ {
+ char inbuf[] = " \xe3\xb4\xbd";
+ char inbuf[] = "ã´½ \xe3\xb4\xbd";
+
+ for (int i = 0; i < 14; i++)
+ {
@@ -175,7 +183,7 @@ index 0000000000000000..96a8765fd5369681
+
+ /* Same as before for SS3designation. */
+ {
+ char inbuf[] = " \xe5\x8a\x84";
+ char inbuf[] = "劄 \xe5\x8a\x84";
+
+ for (int i = 0; i < 14; i++)
+ {
@@ -201,3 +209,5 @@ index 0000000000000000..96a8765fd5369681
+}
+
+#include <support/test-driver.c>
--
2.39.3

@@ -1,38 +1,18 @@
This patch was developed under embargo and cannot reference an upstream
commit. To find the associated commit please review the upstream git
log for CVE-2023-4911 to identify the relevant commits.
From 27e06a423cf06845a0515ab767a109b31b34724a Mon Sep 17 00:00:00 2001
From: Chunmei Xu <xuchunmei@linux.alibaba.com>
Date: Tue, 5 Mar 2024 14:12:15 +0800
Subject: [PATCH 1/1] fix CVE-2023-4911
Author: Siddhesh Poyarekar <siddhesh@sourceware.org>
Date: Tue Sep 19 18:39:32 2023 -0400
tunables: Terminate if end of input is reached (CVE-2023-4911)
The string parsing routine may end up writing beyond bounds of tunestr
if the input tunable string is malformed, of the form name=name=val.
This gets processed twice, first as name=name=val and next as name=val,
resulting in tunestr being name=name=val:name=val, thus overflowing
tunestr.
Terminate the parsing loop at the first instance itself so that tunestr
does not overflow.
This also fixes up tst-env-setuid-tunables to actually handle failures
correct and add new tests to validate the fix for this CVE.
Signed-off-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
Conflicts:
NEWS
(Dropped)
elf/tst-env-setuid-tunables.c
(Trivial conflict at HAVE_TUNABLES)
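A toy model of the fixed loop (much simplified and hypothetical; it only mimics the control flow): stop at the first terminating NUL instead of re-queuing the tail, so a malformed name=name=val entry is consumed exactly once and tunestr can only shrink.

#include <stdio.h>
#include <string.h>

int
main (void)
{
  const char *p = "glibc.malloc.check=glibc.malloc.check=2";
  while (1)
    {
      size_t len = strcspn (p, "=:");
      printf ("token: %.*s\n", (int) len, p);
      if (p[len] == '\0')
        break;               /* end of input: terminate, never rescan */
      p += len + 1;
    }
  return 0;
}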
---
elf/dl-tunables.c | 16 ++++++++-------
elf/tst-env-setuid-tunables.c | 37 +++++++++++++++++++++++++++--------
2 files changed, 38 insertions(+), 15 deletions(-)
diff --git a/elf/dl-tunables.c b/elf/dl-tunables.c
index 3c84809d44381241..2c878e08ea197b29 100644
index 62b7332d..0edfade8 100644
--- a/elf/dl-tunables.c
+++ b/elf/dl-tunables.c
@@ -193,11 +193,7 @@ parse_tunables (char *tunestr, char *valstring)
@@ -180,11 +180,7 @@ parse_tunables (char *tunestr, char *valstring)
/* If we reach the end of the string before getting a valid name-value
pair, bail out. */
if (p[len] == '\0')
@ -45,7 +25,7 @@ index 3c84809d44381241..2c878e08ea197b29 100644
/* We did not find a valid name-value pair before encountering the
colon. */
@@ -257,9 +253,16 @@ parse_tunables (char *tunestr, char *valstring)
@@ -244,9 +240,15 @@ parse_tunables (char *tunestr, char *valstring)
}
}
@ -54,7 +34,6 @@ index 3c84809d44381241..2c878e08ea197b29 100644
+ /* We reached the end while processing the tunable string. */
+ if (p[len] == '\0')
+ break;
+
+ p += len + 1;
}
+
@ -62,13 +41,13 @@ index 3c84809d44381241..2c878e08ea197b29 100644
+ if (__libc_enable_secure)
+ tunestr[off] = '\0';
}
#endif
/* Enable the glibc.malloc.check tunable in SETUID/SETGID programs only when
diff --git a/elf/tst-env-setuid-tunables.c b/elf/tst-env-setuid-tunables.c
index 0b9b075c40598c6f..8b0861c4ad853040 100644
index 7dfb0e07..2364d162 100644
--- a/elf/tst-env-setuid-tunables.c
+++ b/elf/tst-env-setuid-tunables.c
@@ -52,6 +52,8 @@ const char *teststrings[] =
@@ -50,6 +50,8 @@ const char *teststrings[] =
"glibc.malloc.perturb=0x800:not_valid.malloc.check=2:glibc.malloc.mmap_threshold=4096",
"glibc.not_valid.check=2:glibc.malloc.mmap_threshold=4096",
"not_valid.malloc.check=2:glibc.malloc.mmap_threshold=4096",
@@ -77,7 +56,7 @@ index 0b9b075c40598c6f..8b0861c4ad853040 100644
"glibc.malloc.garbage=2:glibc.maoc.mmap_threshold=4096:glibc.malloc.check=2",
"glibc.malloc.check=4:glibc.malloc.garbage=2:glibc.maoc.mmap_threshold=4096",
":glibc.malloc.garbage=2:glibc.malloc.check=1",
@@ -70,6 +72,8 @@ const char *resultstrings[] =
@@ -68,6 +70,8 @@ const char *resultstrings[] =
"glibc.malloc.perturb=0x800:glibc.malloc.mmap_threshold=4096",
"glibc.malloc.mmap_threshold=4096",
"glibc.malloc.mmap_threshold=4096",
@@ -86,10 +65,10 @@ index 0b9b075c40598c6f..8b0861c4ad853040 100644
"",
"",
"",
@@ -84,11 +88,18 @@ test_child (int off)
@@ -81,11 +85,17 @@ test_child (int off)
{
const char *val = getenv ("GLIBC_TUNABLES");
#if HAVE_TUNABLES
+ printf (" [%d] GLIBC_TUNABLES is %s\n", off, val);
+ fflush (stdout);
if (val != NULL && strcmp (val, resultstrings[off]) == 0)
@ -98,27 +77,26 @@ index 0b9b075c40598c6f..8b0861c4ad853040 100644
if (val != NULL)
- printf ("[%d] Unexpected GLIBC_TUNABLES VALUE %s\n", off, val);
+ printf (" [%d] Unexpected GLIBC_TUNABLES VALUE %s, expected %s\n",
+ off, val, resultstrings[off]);
+ off, val, resultstrings[off]);
+ else
+ printf (" [%d] GLIBC_TUNABLES environment variable absent\n", off);
+
+ fflush (stdout);
return 1;
#else
@@ -117,21 +128,26 @@ do_test (int argc, char **argv)
}
@@ -106,31 +116,42 @@ do_test (int argc, char **argv)
if (ret != 0)
exit (1);
- exit (EXIT_SUCCESS);
+ /* Special return code to make sure that the child executed all the way
+ through. */
+ through. */
+ exit (42);
}
else
{
- int ret = 0;
-
/* Spawn tests. */
for (int i = 0; i < array_length (teststrings); i++)
{
@@ -130,28 +108,32 @@ index 0b9b075c40598c6f..8b0861c4ad853040 100644
+ fflush (stdout);
if (setenv ("GLIBC_TUNABLES", teststrings[i], 1) != 0)
- exit (1);
+ {
-
+ {
+ printf (" [%d] Failed to set GLIBC_TUNABLES: %m", i);
+ support_record_failure ();
+ continue;
+ }
+ continue;
+ }
+
int status = support_capture_subprogram_self_sgid (buf);
@@ -139,9 +155,14 @@ do_test (int argc, char **argv)
/* Bail out early if unsupported. */
if (WEXITSTATUS (status) == EXIT_UNSUPPORTED)
return EXIT_UNSUPPORTED;
- ret |= status;
+ if (WEXITSTATUS (status) != 42)
+ {
+ printf (" [%d] child failed with status %d\n", i,
+ WEXITSTATUS (status));
+ support_record_failure ();
+ }
+ {
+ printf (" [%d] child failed with status %d\n", i,
+ WEXITSTATUS (status));
+ support_record_failure ();
+ }
}
- return ret;
+ return 0;
}
}
--
2.41.0

(File diff suppressed because it is too large.)

@@ -0,0 +1,40 @@
From 2c8dfc45a8009e5110a9d2148b62d802e989fde7 Mon Sep 17 00:00:00 2001
From: ticat_fp <fanpeng@loongson.cn>
Date: Thu, 29 Feb 2024 15:58:31 +0800
Subject: [PATCH] Decrease value of arch_minimum_kernel with LoongArch
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
sysdeps/unix/sysv/linux/loongarch/configure | 2 +-
sysdeps/unix/sysv/linux/loongarch/configure.ac | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/sysdeps/unix/sysv/linux/loongarch/configure b/sysdeps/unix/sysv/linux/loongarch/configure
index 0d1159e9..851b2285 100644
--- a/sysdeps/unix/sysv/linux/loongarch/configure
+++ b/sysdeps/unix/sysv/linux/loongarch/configure
@@ -1,7 +1,7 @@
# This file is generated from configure.ac by Autoconf. DO NOT EDIT!
# Local configure fragment for sysdeps/unix/sysv/linux/loongarch.
-arch_minimum_kernel=5.19.0
+arch_minimum_kernel=4.19.0
libc_cv_loongarch_int_abi=no
diff --git a/sysdeps/unix/sysv/linux/loongarch/configure.ac b/sysdeps/unix/sysv/linux/loongarch/configure.ac
index 04e9150a..00815c2f 100644
--- a/sysdeps/unix/sysv/linux/loongarch/configure.ac
+++ b/sysdeps/unix/sysv/linux/loongarch/configure.ac
@@ -2,7 +2,7 @@ sinclude(./aclocal.m4)dnl Autoconf lossage
GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory.
# Local configure fragment for sysdeps/unix/sysv/linux/loongarch.
-arch_minimum_kernel=5.19.0
+arch_minimum_kernel=4.19.0
libc_cv_loongarch_int_abi=no
AC_EGREP_CPP(4 8 8, [__SIZEOF_INT__ __SIZEOF_LONG__ __SIZEOF_POINTER__
--
2.33.0

@ -1,34 +0,0 @@
From c5de7c407853b807e8d0c764e6325bb1311f39cd Mon Sep 17 00:00:00 2001
From: Xing Li <lixing@loongson.cn>
Date: Tue, 4 Jul 2023 15:10:03 +0800
Subject: [PATCH 2/2] Fix tst-cancel21.c to suit kernel struct sigcontext
change. * nptl/tst-cancel21.c
---
nptl/tst-cancel21.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/nptl/tst-cancel21.c b/nptl/tst-cancel21.c
index b10fdbc1..a3653f21 100644
--- a/nptl/tst-cancel21.c
+++ b/nptl/tst-cancel21.c
@@ -217,14 +217,14 @@ static int
do_test (void)
{
stack_t ss;
- ss.ss_sp = malloc (2 * SIGSTKSZ);
+ ss.ss_sp = malloc (4 * SIGSTKSZ);
if (ss.ss_sp == NULL)
{
puts ("failed to allocate alternate stack");
return 1;
}
ss.ss_flags = 0;
- ss.ss_size = 2 * SIGSTKSZ;
+ ss.ss_size = 4 * SIGSTKSZ;
if (sigaltstack (&ss, NULL) < 0)
{
printf ("sigaltstack failed %m\n");
--
2.27.0

@@ -0,0 +1,499 @@
From 8923e4e9c79e672fd6b3b89aba598a60d5c01211 Mon Sep 17 00:00:00 2001
From: caiyinyu <caiyinyu@loongson.cn>
Date: Fri, 15 Sep 2023 17:35:19 +0800
Subject: [PATCH 25/29] LoongArch: Add glibc.cpu.hwcap support.
Key Points:
1. On lasx & lsx platforms, We must use _dl_runtime_{profile, resolve}_{lsx, lasx}
to save vector registers.
2. Via "tunables", users can choose str/mem_{lasx,lsx,unaligned} functions with
`export GLIBC_TUNABLES=glibc.cpu.hwcaps=LASX,...`.
Note: glibc.cpu.hwcaps doesn't affect _dl_runtime_{profile, resolve}_{lsx, lasx}
selection.
Usage Notes:
1. Only valid inputs: LASX, LSX, UAL. Case-sensitive, comma-separated, no spaces.
2. Example: `export GLIBC_TUNABLES=glibc.cpu.hwcaps=LASX,UAL` turns on LASX & UAL.
Unmentioned features turn off. With default ifunc: lasx > lsx > unaligned >
aligned > generic, effect is: lasx > unaligned > aligned > generic; lsx off.
3. Incorrect GLIBC_TUNABLES settings will show error messages.
For example: On lsx platforms, you cannot enable lasx features. If you do
that, you will get error messages.
4. Valid input examples:
- GLIBC_TUNABLES=glibc.cpu.hwcaps=LASX: lasx > aligned > generic.
- GLIBC_TUNABLES=glibc.cpu.hwcaps=LSX,UAL: lsx > unaligned > aligned > generic.
- GLIBC_TUNABLES=glibc.cpu.hwcaps=LASX,UAL,LASX,UAL,LSX,LASX,UAL: Repetitions
allowed but not recommended. Results in: lasx > lsx > unaligned > aligned >
generic.
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
sysdeps/loongarch/Makefile | 4 +
sysdeps/loongarch/Versions | 5 ++
sysdeps/loongarch/cpu-tunables.c | 89 +++++++++++++++++++
sysdeps/loongarch/dl-get-cpu-features.c | 25 ++++++
sysdeps/loongarch/dl-machine.h | 27 +++++-
sysdeps/loongarch/dl-tunables.list | 25 ++++++
.../unix/sysv/linux/loongarch/cpu-features.c | 29 ++++++
.../unix/sysv/linux/loongarch/cpu-features.h | 18 +++-
.../unix/sysv/linux/loongarch/dl-procinfo.c | 60 +++++++++++++
sysdeps/unix/sysv/linux/loongarch/dl-sysdep.c | 21 +++++
.../unix/sysv/linux/loongarch/libc-start.c | 34 +++++++
11 files changed, 329 insertions(+), 8 deletions(-)
create mode 100644 sysdeps/loongarch/Versions
create mode 100644 sysdeps/loongarch/cpu-tunables.c
create mode 100644 sysdeps/loongarch/dl-get-cpu-features.c
create mode 100644 sysdeps/loongarch/dl-tunables.list
create mode 100644 sysdeps/unix/sysv/linux/loongarch/cpu-features.c
create mode 100644 sysdeps/unix/sysv/linux/loongarch/dl-procinfo.c
create mode 100644 sysdeps/unix/sysv/linux/loongarch/dl-sysdep.c
create mode 100644 sysdeps/unix/sysv/linux/loongarch/libc-start.c
diff --git a/sysdeps/loongarch/Makefile b/sysdeps/loongarch/Makefile
index 43d2f583..30a1f4a8 100644
--- a/sysdeps/loongarch/Makefile
+++ b/sysdeps/loongarch/Makefile
@@ -6,6 +6,10 @@ ifeq ($(subdir),elf)
gen-as-const-headers += dl-link.sym
endif
+ifeq ($(subdir),elf)
+ sysdep-dl-routines += dl-get-cpu-features
+endif
+
# LoongArch's assembler also needs to know about PIC as it changes the
# definition of some assembler macros.
ASFLAGS-.os += $(pic-ccflag)
diff --git a/sysdeps/loongarch/Versions b/sysdeps/loongarch/Versions
new file mode 100644
index 00000000..33ae2cc0
--- /dev/null
+++ b/sysdeps/loongarch/Versions
@@ -0,0 +1,5 @@
+ld {
+ GLIBC_PRIVATE {
+ _dl_larch_get_cpu_features;
+ }
+}
diff --git a/sysdeps/loongarch/cpu-tunables.c b/sysdeps/loongarch/cpu-tunables.c
new file mode 100644
index 00000000..8e9fab93
--- /dev/null
+++ b/sysdeps/loongarch/cpu-tunables.c
@@ -0,0 +1,89 @@
+/* LoongArch CPU feature tuning.
+ This file is part of the GNU C Library.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+# include <stdbool.h>
+# include <stdint.h>
+# include <unistd.h> /* Get STDOUT_FILENO for _dl_printf. */
+# include <elf/dl-tunables.h>
+# include <string.h>
+# include <cpu-features.h>
+# include <ldsodefs.h>
+# include <sys/auxv.h>
+
+# define HWCAP_LOONGARCH_IFUNC \
+ (HWCAP_LOONGARCH_UAL | HWCAP_LOONGARCH_LSX | HWCAP_LOONGARCH_LASX)
+
+# define CHECK_GLIBC_IFUNC_CPU_OFF(f, name, len) \
+ _Static_assert (sizeof (#name) - 1 == len, #name " != " #len); \
+ if (!memcmp (f, #name, len) && \
+ (GLRO (dl_hwcap) & HWCAP_LOONGARCH_##name)) \
+ { \
+ hwcap |= (HWCAP_LOONGARCH_##name | (~HWCAP_LOONGARCH_IFUNC)); \
+ break; \
+ } \
+
+attribute_hidden
+void
+TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
+{
+ const char *p = valp->strval;
+ size_t len;
+ unsigned long hwcap = 0;
+ const char *c;
+
+ do {
+ for (c = p; *c != ','; c++)
+ if (*c == '\0')
+ break;
+
+ len = c - p;
+
+ switch(len)
+ {
+ default:
+ _dl_fatal_printf (
+ "The valid values of glibc.cpu.hwcaps is UAL, LASX, LSX!!\n"
+ );
+ break;
+ case 3:
+ {
+ CHECK_GLIBC_IFUNC_CPU_OFF (p, LSX, 3);
+ CHECK_GLIBC_IFUNC_CPU_OFF (p, UAL, 3);
+ _dl_fatal_printf (
+ "Some features are invalid or not supported on this machine!!\n"
+ "The valid values of glibc.cpu.hwcaps is UAL, LASX, LSX!!\n"
+ );
+ }
+ break;
+ case 4:
+ {
+ CHECK_GLIBC_IFUNC_CPU_OFF (p, LASX, 4);
+ _dl_fatal_printf (
+ "Some features are invalid or not supported on this machine!!\n"
+ "The valid values of glibc.cpu.hwcaps is UAL, LASX, LSX!!\n"
+ );
+ }
+ break;
+ }
+
+ p += len + 1;
+ }
+ while (*c != '\0');
+
+ GLRO (dl_larch_cpu_features).hwcap &= hwcap;
+}
diff --git a/sysdeps/loongarch/dl-get-cpu-features.c b/sysdeps/loongarch/dl-get-cpu-features.c
new file mode 100644
index 00000000..7cd9bc15
--- /dev/null
+++ b/sysdeps/loongarch/dl-get-cpu-features.c
@@ -0,0 +1,25 @@
+/* Define _dl_larch_get_cpu_features.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+
+#include <ldsodefs.h>
+
+const struct cpu_features *
+_dl_larch_get_cpu_features (void)
+{
+ return &GLRO(dl_larch_cpu_features);
+}
diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h
index 57913cef..b395a928 100644
--- a/sysdeps/loongarch/dl-machine.h
+++ b/sysdeps/loongarch/dl-machine.h
@@ -29,6 +29,8 @@
#include <dl-static-tls.h>
#include <dl-machine-rel.h>
+#include <cpu-features.c>
+
#ifndef _RTLD_PROLOGUE
# define _RTLD_PROLOGUE(entry) \
".globl\t" __STRING (entry) "\n\t" \
@@ -53,6 +55,23 @@
#define ELF_MACHINE_NO_REL 1
#define ELF_MACHINE_NO_RELA 0
+#define DL_PLATFORM_INIT dl_platform_init ()
+
+static inline void __attribute__ ((unused))
+dl_platform_init (void)
+{
+ if (GLRO(dl_platform) != NULL && *GLRO(dl_platform) == '\0')
+ /* Avoid an empty string which would disturb us. */
+ GLRO(dl_platform) = NULL;
+
+#ifdef SHARED
+ /* init_cpu_features has been called early from __libc_start_main in
+ static executable. */
+ init_cpu_features (&GLRO(dl_larch_cpu_features));
+#endif
+}
+
+
/* Return nonzero iff ELF header is compatible with the running host. */
static inline int
elf_machine_matches_host (const ElfW (Ehdr) *ehdr)
@@ -290,9 +309,9 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
if (profile != 0)
{
#if !defined __loongarch_soft_float
- if (SUPPORT_LASX)
+ if (RTLD_SUPPORT_LASX)
gotplt[0] = (ElfW(Addr)) &_dl_runtime_profile_lasx;
- else if (SUPPORT_LSX)
+ else if (RTLD_SUPPORT_LSX)
gotplt[0] = (ElfW(Addr)) &_dl_runtime_profile_lsx;
else
#endif
@@ -310,9 +329,9 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
indicated by the offset on the stack, and then jump to
the resolved address. */
#if !defined __loongarch_soft_float
- if (SUPPORT_LASX)
+ if (RTLD_SUPPORT_LASX)
gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve_lasx;
- else if (SUPPORT_LSX)
+ else if (RTLD_SUPPORT_LSX)
gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve_lsx;
else
#endif
diff --git a/sysdeps/loongarch/dl-tunables.list b/sysdeps/loongarch/dl-tunables.list
new file mode 100644
index 00000000..66b34275
--- /dev/null
+++ b/sysdeps/loongarch/dl-tunables.list
@@ -0,0 +1,25 @@
+# LoongArch specific tunables.
+# Copyright (C) 2023 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <http://www.gnu.org/licenses/>.
+
+glibc {
+ cpu {
+ hwcaps {
+ type: STRING
+ }
+ }
+}
diff --git a/sysdeps/unix/sysv/linux/loongarch/cpu-features.c b/sysdeps/unix/sysv/linux/loongarch/cpu-features.c
new file mode 100644
index 00000000..1290c4ce
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/loongarch/cpu-features.c
@@ -0,0 +1,29 @@
+/* Initialize CPU feature data. LoongArch64 version.
+ This file is part of the GNU C Library.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <cpu-features.h>
+#include <elf/dl-hwcaps.h>
+#include <elf/dl-tunables.h>
+extern void TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *) attribute_hidden;
+
+static inline void
+init_cpu_features (struct cpu_features *cpu_features)
+{
+ GLRO (dl_larch_cpu_features).hwcap = GLRO (dl_hwcap);
+ TUNABLE_GET (glibc, cpu, hwcaps, tunable_val_t *, TUNABLE_CALLBACK (set_hwcaps));
+}
diff --git a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
index d1a280a5..450963ce 100644
--- a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
+++ b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
@@ -19,13 +19,23 @@
#ifndef _CPU_FEATURES_LOONGARCH64_H
#define _CPU_FEATURES_LOONGARCH64_H
+#include <stdint.h>
#include <sys/auxv.h>
-#define SUPPORT_UAL (GLRO (dl_hwcap) & HWCAP_LOONGARCH_UAL)
-#define SUPPORT_LSX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LSX)
-#define SUPPORT_LASX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LASX)
+struct cpu_features
+ {
+ uint64_t hwcap;
+ };
+/* Get a pointer to the CPU features structure. */
+extern const struct cpu_features *_dl_larch_get_cpu_features (void)
+ __attribute__ ((pure));
+
+#define SUPPORT_UAL (GLRO (dl_larch_cpu_features).hwcap & HWCAP_LOONGARCH_UAL)
+#define SUPPORT_LSX (GLRO (dl_larch_cpu_features).hwcap & HWCAP_LOONGARCH_LSX)
+#define SUPPORT_LASX (GLRO (dl_larch_cpu_features).hwcap & HWCAP_LOONGARCH_LASX)
+#define RTLD_SUPPORT_LSX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LSX)
+#define RTLD_SUPPORT_LASX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LASX)
#define INIT_ARCH()
#endif /* _CPU_FEATURES_LOONGARCH64_H */
-
diff --git a/sysdeps/unix/sysv/linux/loongarch/dl-procinfo.c b/sysdeps/unix/sysv/linux/loongarch/dl-procinfo.c
new file mode 100644
index 00000000..6217fda9
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/loongarch/dl-procinfo.c
@@ -0,0 +1,60 @@
+/* Data for LoongArch64 version of processor capability information.
+ Linux version.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* If anything should be added here check whether the size of each string
+ is still ok with the given array size.
+
+ All the #ifdefs in the definitions are quite irritating but
+ necessary if we want to avoid duplicating the information. There
+ are three different modes:
+
+ - PROCINFO_DECL is defined. This means we are only interested in
+ declarations.
+
+ - PROCINFO_DECL is not defined:
+
+ + if SHARED is defined the file is included in an array
+ initializer. The .element = { ... } syntax is needed.
+
+ + if SHARED is not defined a normal array initialization is
+ needed.
+ */
+
+#ifndef PROCINFO_CLASS
+# define PROCINFO_CLASS
+#endif
+
+#if !IS_IN (ldconfig)
+# if !defined PROCINFO_DECL && defined SHARED
+ ._dl_larch_cpu_features
+# else
+PROCINFO_CLASS struct cpu_features _dl_larch_cpu_features
+# endif
+# ifndef PROCINFO_DECL
+= { }
+# endif
+# if !defined SHARED || defined PROCINFO_DECL
+;
+# else
+,
+# endif
+#endif
+
+#undef PROCINFO_DECL
+#undef PROCINFO_CLASS
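
The three-way #if dance above is the standard dl-procinfo include pattern; it is easier to read expanded. A sketch of what the fragment becomes in each mode (derived from the conditionals above; PROCINFO_CLASS is typically empty or `extern` at the include site):

/* 1. PROCINFO_DECL defined -- a declaration only:  */
extern struct cpu_features _dl_larch_cpu_features;

/* 2. PROCINFO_DECL undefined, SHARED defined -- a designated-initializer
   member emitted inside the _rtld_global_ro initializer (the struct that
   GLRO expands into):
       ._dl_larch_cpu_features = { },  */

/* 3. PROCINFO_DECL undefined, SHARED undefined -- an ordinary definition:
       struct cpu_features _dl_larch_cpu_features = { };  */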
diff --git a/sysdeps/unix/sysv/linux/loongarch/dl-sysdep.c b/sysdeps/unix/sysv/linux/loongarch/dl-sysdep.c
new file mode 100644
index 00000000..455fd71a
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/loongarch/dl-sysdep.c
@@ -0,0 +1,21 @@
+/* Operating system support for run-time dynamic linker. LoongArch version.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+#include <sysdeps/loongarch/cpu-tunables.c>
+#include <sysdeps/unix/sysv/linux/dl-sysdep.c>
diff --git a/sysdeps/unix/sysv/linux/loongarch/libc-start.c b/sysdeps/unix/sysv/linux/loongarch/libc-start.c
new file mode 100644
index 00000000..f1346ece
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/loongarch/libc-start.c
@@ -0,0 +1,34 @@
+/* Override csu/libc-start.c on LoongArch64.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef SHARED
+
+/* Mark symbols hidden in static PIE for early self relocation to work. */
+# if BUILD_PIE_DEFAULT
+# pragma GCC visibility push(hidden)
+# endif
+
+# include <ldsodefs.h>
+# include <cpu-features.c>
+
+extern struct cpu_features _dl_larch_cpu_features;
+
+# define ARCH_INIT_CPU_FEATURES() init_cpu_features (&_dl_larch_cpu_features)
+
+#endif
+#include <csu/libc-start.c>
--
2.33.0


@ -0,0 +1,485 @@
From 3ee56bbc56faa7b85a6513340db4a4fdd6ce709d Mon Sep 17 00:00:00 2001
From: dengjianbo <dengjianbo@loongson.cn>
Date: Mon, 28 Aug 2023 10:08:36 +0800
Subject: [PATCH 15/29] LoongArch: Add ifunc support for memchr{aligned, lsx,
lasx}
According to glibc memchr microbenchmark, this implementation could reduce
the runtime as follows:
Name Percent of runtime reduced
memchr-lasx 37%-83%
memchr-lsx 30%-66%
memchr-aligned 0%-15%
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
sysdeps/loongarch/lp64/multiarch/Makefile | 3 +
.../lp64/multiarch/ifunc-impl-list.c | 7 ++
.../loongarch/lp64/multiarch/ifunc-memchr.h | 40 ++++++
.../loongarch/lp64/multiarch/memchr-aligned.S | 95 ++++++++++++++
.../loongarch/lp64/multiarch/memchr-lasx.S | 117 ++++++++++++++++++
sysdeps/loongarch/lp64/multiarch/memchr-lsx.S | 102 +++++++++++++++
sysdeps/loongarch/lp64/multiarch/memchr.c | 37 ++++++
7 files changed, 401 insertions(+)
create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-memchr.h
create mode 100644 sysdeps/loongarch/lp64/multiarch/memchr-aligned.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/memchr-lasx.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/memchr-lsx.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/memchr.c
diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile
index 64416b02..2f4802cf 100644
--- a/sysdeps/loongarch/lp64/multiarch/Makefile
+++ b/sysdeps/loongarch/lp64/multiarch/Makefile
@@ -24,5 +24,8 @@ sysdep_routines += \
rawmemchr-aligned \
rawmemchr-lsx \
rawmemchr-lasx \
+ memchr-aligned \
+ memchr-lsx \
+ memchr-lasx \
# sysdep_routines
endif
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
index 3db9af14..a567b9cf 100644
--- a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
@@ -102,5 +102,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_aligned)
)
+ IFUNC_IMPL (i, name, memchr,
+#if !defined __loongarch_soft_float
+ IFUNC_IMPL_ADD (array, i, memchr, SUPPORT_LASX, __memchr_lasx)
+ IFUNC_IMPL_ADD (array, i, memchr, SUPPORT_LSX, __memchr_lsx)
+#endif
+ IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_aligned)
+ )
return i;
}
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-memchr.h b/sysdeps/loongarch/lp64/multiarch/ifunc-memchr.h
new file mode 100644
index 00000000..9060ccd5
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-memchr.h
@@ -0,0 +1,40 @@
+/* Common definition for memchr ifunc selections.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <ldsodefs.h>
+#include <ifunc-init.h>
+
+#if !defined __loongarch_soft_float
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lasx) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden;
+#endif
+extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+#if !defined __loongarch_soft_float
+ if (SUPPORT_LASX)
+ return OPTIMIZE (lasx);
+ else if (SUPPORT_LSX)
+ return OPTIMIZE (lsx);
+ else
+#endif
+ return OPTIMIZE (aligned);
+}
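
Outside glibc's internal libc_ifunc_redirected machinery, the same selection is usually written with GCC's ifunc attribute. A self-contained sketch of the mechanism (function names are hypothetical, the hwcap fallback value is an assumption, and real resolvers must not depend on symbols that themselves need ifunc resolution):

#include <stddef.h>
#include <string.h>
#include <sys/auxv.h>

#ifndef HWCAP_LOONGARCH_LSX
# define HWCAP_LOONGARCH_LSX (1 << 4)  /* assumption; see <asm/hwcap.h> */
#endif

/* Stand-ins for the real aligned/LSX implementations.  */
static void *
my_memchr_aligned (const void *s, int c, size_t n)
{
  return memchr (s, c, n);
}

static void *
my_memchr_lsx (const void *s, int c, size_t n)
{
  return memchr (s, c, n);
}

/* The resolver runs once at load time and returns the chosen function.  */
static void *(*resolve_my_memchr (void)) (const void *, int, size_t)
{
  if (getauxval (AT_HWCAP) & HWCAP_LOONGARCH_LSX)
    return my_memchr_lsx;
  return my_memchr_aligned;
}

void *my_memchr (const void *s, int c, size_t n)
  __attribute__ ((ifunc ("resolve_my_memchr")));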
diff --git a/sysdeps/loongarch/lp64/multiarch/memchr-aligned.S b/sysdeps/loongarch/lp64/multiarch/memchr-aligned.S
new file mode 100644
index 00000000..81d0d004
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/memchr-aligned.S
@@ -0,0 +1,95 @@
+/* Optimized memchr implementation using basic LoongArch instructions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc)
+# define MEMCHR_NAME __memchr_aligned
+#else
+# define MEMCHR_NAME memchr
+#endif
+
+LEAF(MEMCHR_NAME, 6)
+ beqz a2, L(out)
+ andi t1, a0, 0x7
+ add.d a5, a0, a2
+ bstrins.d a0, zero, 2, 0
+
+ ld.d t0, a0, 0
+ bstrins.d a1, a1, 15, 8
+ lu12i.w a3, 0x01010
+ slli.d t2, t1, 03
+
+ bstrins.d a1, a1, 31, 16
+ ori a3, a3, 0x101
+ li.d t7, -1
+ li.d t8, 8
+
+ bstrins.d a1, a1, 63, 32
+ bstrins.d a3, a3, 63, 32
+ sll.d t2, t7, t2
+ xor t0, t0, a1
+
+
+ addi.d a6, a5, -1
+ slli.d a4, a3, 7
+ sub.d t1, t8, t1
+ orn t0, t0, t2
+
+ sub.d t2, t0, a3
+ andn t3, a4, t0
+ bstrins.d a6, zero, 2, 0
+ and t0, t2, t3
+
+ bgeu t1, a2, L(end)
+L(loop):
+ bnez t0, L(found)
+ ld.d t1, a0, 8
+ xor t0, t1, a1
+
+ addi.d a0, a0, 8
+ sub.d t2, t0, a3
+ andn t3, a4, t0
+ and t0, t2, t3
+
+
+ bne a0, a6, L(loop)
+L(end):
+ sub.d t1, a5, a6
+ ctz.d t0, t0
+ srli.d t0, t0, 3
+
+ sltu t1, t0, t1
+ add.d a0, a0, t0
+ maskeqz a0, a0, t1
+ jr ra
+
+L(found):
+ ctz.d t0, t0
+ srli.d t0, t0, 3
+ add.d a0, a0, t0
+ jr ra
+
+L(out):
+ move a0, zero
+ jr ra
+END(MEMCHR_NAME)
+
+libc_hidden_builtin_def (MEMCHR_NAME)
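
The a3/a4 constants above (0x0101...01 and 0x8080...80) implement the classic SWAR byte-match test. The same computation in C, as a sketch:

#include <stdint.h>

/* Nonzero iff some byte of WORD equals C.  The lowest set 0x80 bit marks
   the first matching byte exactly (lanes above the first match may carry
   borrow artifacts, which memchr never inspects), so the match offset is
   __builtin_ctzll (m) / 8 -- the ctz.d/srli.d pair in L(found).  */
static inline uint64_t
match_mask (uint64_t word, unsigned char c)
{
  uint64_t ones  = 0x0101010101010101ULL;  /* the a3 constant */
  uint64_t highs = ones << 7;              /* the a4 constant */
  uint64_t x = word ^ (ones * c);          /* zero byte where WORD == C */
  return (x - ones) & ~x & highs;
}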
diff --git a/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S
new file mode 100644
index 00000000..a26cdf48
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/memchr-lasx.S
@@ -0,0 +1,117 @@
+/* Optimized memchr implementation using LoongArch LASX instructions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) && !defined __loongarch_soft_float
+
+# define MEMCHR __memchr_lasx
+
+LEAF(MEMCHR, 6)
+ beqz a2, L(ret0)
+ add.d a3, a0, a2
+ andi t0, a0, 0x3f
+ bstrins.d a0, zero, 5, 0
+
+ xvld xr0, a0, 0
+ xvld xr1, a0, 32
+ li.d t1, -1
+ li.d t2, 64
+
+ xvreplgr2vr.b xr2, a1
+ sll.d t3, t1, t0
+ sub.d t2, t2, t0
+ xvseq.b xr0, xr0, xr2
+
+ xvseq.b xr1, xr1, xr2
+ xvmsknz.b xr0, xr0
+ xvmsknz.b xr1, xr1
+ xvpickve.w xr3, xr0, 4
+
+
+ xvpickve.w xr4, xr1, 4
+ vilvl.h vr0, vr3, vr0
+ vilvl.h vr1, vr4, vr1
+ vilvl.w vr0, vr1, vr0
+
+ movfr2gr.d t0, fa0
+ and t0, t0, t3
+ bgeu t2, a2, L(end)
+ bnez t0, L(found)
+
+ addi.d a4, a3, -1
+ bstrins.d a4, zero, 5, 0
+L(loop):
+ xvld xr0, a0, 64
+ xvld xr1, a0, 96
+
+ addi.d a0, a0, 64
+ xvseq.b xr0, xr0, xr2
+ xvseq.b xr1, xr1, xr2
+ beq a0, a4, L(out)
+
+
+ xvmax.bu xr3, xr0, xr1
+ xvseteqz.v fcc0, xr3
+ bcnez fcc0, L(loop)
+ xvmsknz.b xr0, xr0
+
+ xvmsknz.b xr1, xr1
+ xvpickve.w xr3, xr0, 4
+ xvpickve.w xr4, xr1, 4
+ vilvl.h vr0, vr3, vr0
+
+ vilvl.h vr1, vr4, vr1
+ vilvl.w vr0, vr1, vr0
+ movfr2gr.d t0, fa0
+L(found):
+ ctz.d t1, t0
+
+ add.d a0, a0, t1
+ jr ra
+L(ret0):
+ move a0, zero
+ jr ra
+
+
+L(out):
+ xvmsknz.b xr0, xr0
+ xvmsknz.b xr1, xr1
+ xvpickve.w xr3, xr0, 4
+ xvpickve.w xr4, xr1, 4
+
+ vilvl.h vr0, vr3, vr0
+ vilvl.h vr1, vr4, vr1
+ vilvl.w vr0, vr1, vr0
+ movfr2gr.d t0, fa0
+
+L(end):
+ sub.d t2, zero, a3
+ srl.d t1, t1, t2
+ and t0, t0, t1
+ ctz.d t1, t0
+
+ add.d a0, a0, t1
+ maskeqz a0, a0, t0
+ jr ra
+END(MEMCHR)
+
+libc_hidden_builtin_def (MEMCHR)
+#endif
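
The xvmsknz.b/xvpickve.w/vilvl/movfr2gr.d chain above condenses the two 32-byte compare results into one 64-bit general register, one bit per byte. Its scalar meaning, as a sketch:

#include <stdint.h>

/* Scalar model of the mask built from xr0/xr1: bit I is set when byte I
   of the 64-byte block compared equal to the target byte.  ctz of the
   mask then gives the first match; clz gives the last (which is what the
   memrchr variants later in this series rely on).  */
static inline uint64_t
msknz64 (const unsigned char eq[64])
{
  uint64_t m = 0;
  for (int i = 0; i < 64; i++)
    m |= (uint64_t) (eq[i] != 0) << i;
  return m;
}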
diff --git a/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S
new file mode 100644
index 00000000..a73ecd25
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/memchr-lsx.S
@@ -0,0 +1,102 @@
+/* Optimized memchr implementation using LoongArch LSX instructions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) && !defined __loongarch_soft_float
+
+# define MEMCHR __memchr_lsx
+
+LEAF(MEMCHR, 6)
+ beqz a2, L(ret0)
+ add.d a3, a0, a2
+ andi t0, a0, 0x1f
+ bstrins.d a0, zero, 4, 0
+
+ vld vr0, a0, 0
+ vld vr1, a0, 16
+ li.d t1, -1
+ li.d t2, 32
+
+ vreplgr2vr.b vr2, a1
+ sll.d t3, t1, t0
+ sub.d t2, t2, t0
+ vseq.b vr0, vr0, vr2
+
+ vseq.b vr1, vr1, vr2
+ vmsknz.b vr0, vr0
+ vmsknz.b vr1, vr1
+ vilvl.h vr0, vr1, vr0
+
+
+ movfr2gr.s t0, fa0
+ and t0, t0, t3
+ bgeu t2, a2, L(end)
+ bnez t0, L(found)
+
+ addi.d a4, a3, -1
+ bstrins.d a4, zero, 4, 0
+L(loop):
+ vld vr0, a0, 32
+ vld vr1, a0, 48
+
+ addi.d a0, a0, 32
+ vseq.b vr0, vr0, vr2
+ vseq.b vr1, vr1, vr2
+ beq a0, a4, L(out)
+
+ vmax.bu vr3, vr0, vr1
+ vseteqz.v fcc0, vr3
+ bcnez fcc0, L(loop)
+ vmsknz.b vr0, vr0
+
+
+ vmsknz.b vr1, vr1
+ vilvl.h vr0, vr1, vr0
+ movfr2gr.s t0, fa0
+L(found):
+ ctz.w t0, t0
+
+ add.d a0, a0, t0
+ jr ra
+L(ret0):
+ move a0, zero
+ jr ra
+
+L(out):
+ vmsknz.b vr0, vr0
+ vmsknz.b vr1, vr1
+ vilvl.h vr0, vr1, vr0
+ movfr2gr.s t0, fa0
+
+L(end):
+ sub.d t2, zero, a3
+ srl.w t1, t1, t2
+ and t0, t0, t1
+ ctz.w t1, t0
+
+
+ add.d a0, a0, t1
+ maskeqz a0, a0, t0
+ jr ra
+END(MEMCHR)
+
+libc_hidden_builtin_def (MEMCHR)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/memchr.c b/sysdeps/loongarch/lp64/multiarch/memchr.c
new file mode 100644
index 00000000..059479c0
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/memchr.c
@@ -0,0 +1,37 @@
+/* Multiple versions of memchr.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# define memchr __redirect_memchr
+# include <string.h>
+# undef memchr
+
+# define SYMBOL_NAME memchr
+# include "ifunc-memchr.h"
+
+libc_ifunc_redirected (__redirect_memchr, memchr,
+ IFUNC_SELECTOR ());
+
+# ifdef SHARED
+__hidden_ver1 (memchr, __GI_memchr, __redirect_memchr)
+ __attribute__ ((visibility ("hidden"))) __attribute_copy__ (memchr);
+# endif
+
+#endif
--
2.33.0


@ -0,0 +1,946 @@
From 60f4bbd1eec528ba8df044ae6b3091f6337a7fcc Mon Sep 17 00:00:00 2001
From: dengjianbo <dengjianbo@loongson.cn>
Date: Mon, 28 Aug 2023 10:08:39 +0800
Subject: [PATCH 18/29] LoongArch: Add ifunc support for memcmp{aligned, lsx,
lasx}
According to glibc memcmp microbenchmark test results (Add generic
memcmp), this implementation shows a performance improvement
except when the length is less than 3, with details below:
Name Percent of time reduced
memcmp-lasx 16%-74%
memcmp-lsx 20%-50%
memcmp-aligned 5%-20%
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
sysdeps/loongarch/lp64/multiarch/Makefile | 3 +
.../lp64/multiarch/ifunc-impl-list.c | 7 +
.../loongarch/lp64/multiarch/ifunc-memcmp.h | 40 +++
.../loongarch/lp64/multiarch/memcmp-aligned.S | 292 ++++++++++++++++++
.../loongarch/lp64/multiarch/memcmp-lasx.S | 207 +++++++++++++
sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S | 269 ++++++++++++++++
sysdeps/loongarch/lp64/multiarch/memcmp.c | 43 +++
7 files changed, 861 insertions(+)
create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-memcmp.h
create mode 100644 sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/memcmp.c
diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile
index 216886c5..360a6718 100644
--- a/sysdeps/loongarch/lp64/multiarch/Makefile
+++ b/sysdeps/loongarch/lp64/multiarch/Makefile
@@ -34,5 +34,8 @@ sysdep_routines += \
memset-unaligned \
memset-lsx \
memset-lasx \
+ memcmp-aligned \
+ memcmp-lsx \
+ memcmp-lasx \
# sysdep_routines
endif
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
index 37f60dde..e397d58c 100644
--- a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
@@ -127,5 +127,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_aligned)
)
+ IFUNC_IMPL (i, name, memcmp,
+#if !defined __loongarch_soft_float
+ IFUNC_IMPL_ADD (array, i, memcmp, SUPPORT_LASX, __memcmp_lasx)
+ IFUNC_IMPL_ADD (array, i, memcmp, SUPPORT_LSX, __memcmp_lsx)
+#endif
+ IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_aligned)
+ )
return i;
}
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-memcmp.h b/sysdeps/loongarch/lp64/multiarch/ifunc-memcmp.h
new file mode 100644
index 00000000..04adc2e5
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-memcmp.h
@@ -0,0 +1,40 @@
+/* Common definition for memcmp ifunc selections.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <ldsodefs.h>
+#include <ifunc-init.h>
+
+#if !defined __loongarch_soft_float
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lasx) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden;
+#endif
+extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+#if !defined __loongarch_soft_float
+ if (SUPPORT_LASX)
+ return OPTIMIZE (lasx);
+ else if (SUPPORT_LSX)
+ return OPTIMIZE (lsx);
+ else
+#endif
+ return OPTIMIZE (aligned);
+}
diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S b/sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S
new file mode 100644
index 00000000..14a7caa9
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S
@@ -0,0 +1,292 @@
+/* Optimized memcmp implementation using basic LoongArch instructions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc)
+# define MEMCMP_NAME __memcmp_aligned
+#else
+# define MEMCMP_NAME memcmp
+#endif
+
+LEAF(MEMCMP_NAME, 6)
+ beqz a2, L(ret)
+ andi a4, a1, 0x7
+ andi a3, a0, 0x7
+ sltu a5, a4, a3
+
+ xor t0, a0, a1
+ li.w t8, 8
+ maskeqz t0, t0, a5
+ li.w t7, -1
+
+ xor a0, a0, t0
+ xor a1, a1, t0
+ andi a3, a0, 0x7
+ andi a4, a1, 0x7
+
+ xor a0, a0, a3
+ xor a1, a1, a4
+ ld.d t2, a0, 0
+ ld.d t1, a1, 0
+
+ slli.d t3, a3, 3
+ slli.d t4, a4, 3
+ sub.d a6, t3, t4
+ srl.d t1, t1, t4
+
+ srl.d t0, t2, t3
+ srl.d t5, t7, t4
+ sub.d t6, t0, t1
+ and t6, t6, t5
+
+ sub.d t5, t8, a4
+ bnez t6, L(first_out)
+ bgeu t5, a2, L(ret)
+ sub.d a2, a2, t5
+
+ bnez a6, L(unaligned)
+ blt a2, t8, L(al_less_8bytes)
+ andi t1, a2, 31
+ beq t1, a2, L(al_less_32bytes)
+
+ sub.d t2, a2, t1
+ add.d a4, a0, t2
+ move a2, t1
+
+L(al_loop):
+ ld.d t0, a0, 8
+
+ ld.d t1, a1, 8
+ ld.d t2, a0, 16
+ ld.d t3, a1, 16
+ ld.d t4, a0, 24
+
+ ld.d t5, a1, 24
+ ld.d t6, a0, 32
+ ld.d t7, a1, 32
+ addi.d a0, a0, 32
+
+ addi.d a1, a1, 32
+ bne t0, t1, L(out1)
+ bne t2, t3, L(out2)
+ bne t4, t5, L(out3)
+
+ bne t6, t7, L(out4)
+ bne a0, a4, L(al_loop)
+
+L(al_less_32bytes):
+ srai.d a4, a2, 4
+ beqz a4, L(al_less_16bytes)
+
+ ld.d t0, a0, 8
+ ld.d t1, a1, 8
+ ld.d t2, a0, 16
+ ld.d t3, a1, 16
+
+ addi.d a0, a0, 16
+ addi.d a1, a1, 16
+ addi.d a2, a2, -16
+ bne t0, t1, L(out1)
+
+ bne t2, t3, L(out2)
+
+L(al_less_16bytes):
+ srai.d a4, a2, 3
+ beqz a4, L(al_less_8bytes)
+ ld.d t0, a0, 8
+
+ ld.d t1, a1, 8
+ addi.d a0, a0, 8
+ addi.d a1, a1, 8
+ addi.d a2, a2, -8
+
+ bne t0, t1, L(out1)
+
+L(al_less_8bytes):
+ beqz a2, L(ret)
+ ld.d t0, a0, 8
+ ld.d t1, a1, 8
+
+ li.d t7, -1
+ slli.d t2, a2, 3
+ sll.d t2, t7, t2
+ sub.d t3, t0, t1
+
+ andn t6, t3, t2
+ bnez t6, L(count_diff)
+
+L(ret):
+ move a0, zero
+ jr ra
+
+L(out4):
+ move t0, t6
+ move t1, t7
+ sub.d t6, t6, t7
+ b L(count_diff)
+
+L(out3):
+ move t0, t4
+ move t1, t5
+ sub.d t6, t4, t5
+ b L(count_diff)
+
+L(out2):
+ move t0, t2
+ move t1, t3
+L(out1):
+ sub.d t6, t0, t1
+ b L(count_diff)
+
+L(first_out):
+ slli.d t4, a2, 3
+ slt t3, a2, t5
+ sll.d t4, t7, t4
+ maskeqz t4, t4, t3
+
+ andn t6, t6, t4
+
+L(count_diff):
+ ctz.d t2, t6
+ bstrins.d t2, zero, 2, 0
+ srl.d t0, t0, t2
+
+ srl.d t1, t1, t2
+ andi t0, t0, 0xff
+ andi t1, t1, 0xff
+ sub.d t2, t0, t1
+
+ sub.d t3, t1, t0
+ masknez t2, t2, a5
+ maskeqz t3, t3, a5
+ or a0, t2, t3
+
+ jr ra
+
+L(unaligned):
+ sub.d a7, zero, a6
+ srl.d t0, t2, a6
+ blt a2, t8, L(un_less_8bytes)
+
+ andi t1, a2, 31
+ beq t1, a2, L(un_less_32bytes)
+ sub.d t2, a2, t1
+ add.d a4, a0, t2
+
+ move a2, t1
+
+L(un_loop):
+ ld.d t2, a0, 8
+ ld.d t1, a1, 8
+ ld.d t4, a0, 16
+
+ ld.d t3, a1, 16
+ ld.d t6, a0, 24
+ ld.d t5, a1, 24
+ ld.d t8, a0, 32
+
+ ld.d t7, a1, 32
+ addi.d a0, a0, 32
+ addi.d a1, a1, 32
+ sll.d a3, t2, a7
+
+ or t0, a3, t0
+ bne t0, t1, L(out1)
+ srl.d t0, t2, a6
+ sll.d a3, t4, a7
+
+ or t2, a3, t0
+ bne t2, t3, L(out2)
+ srl.d t0, t4, a6
+ sll.d a3, t6, a7
+
+ or t4, a3, t0
+ bne t4, t5, L(out3)
+ srl.d t0, t6, a6
+ sll.d a3, t8, a7
+
+ or t6, t0, a3
+ bne t6, t7, L(out4)
+ srl.d t0, t8, a6
+ bne a0, a4, L(un_loop)
+
+L(un_less_32bytes):
+ srai.d a4, a2, 4
+ beqz a4, L(un_less_16bytes)
+ ld.d t2, a0, 8
+ ld.d t1, a1, 8
+
+ ld.d t4, a0, 16
+ ld.d t3, a1, 16
+ addi.d a0, a0, 16
+ addi.d a1, a1, 16
+
+ addi.d a2, a2, -16
+ sll.d a3, t2, a7
+ or t0, a3, t0
+ bne t0, t1, L(out1)
+
+ srl.d t0, t2, a6
+ sll.d a3, t4, a7
+ or t2, a3, t0
+ bne t2, t3, L(out2)
+
+ srl.d t0, t4, a6
+
+L(un_less_16bytes):
+ srai.d a4, a2, 3
+ beqz a4, L(un_less_8bytes)
+ ld.d t2, a0, 8
+
+ ld.d t1, a1, 8
+ addi.d a0, a0, 8
+ addi.d a1, a1, 8
+ addi.d a2, a2, -8
+
+ sll.d a3, t2, a7
+ or t0, a3, t0
+ bne t0, t1, L(out1)
+ srl.d t0, t2, a6
+
+L(un_less_8bytes):
+ beqz a2, L(ret)
+ andi a7, a7, 63
+ slli.d a4, a2, 3
+ bgeu a7, a4, L(last_cmp)
+
+ ld.d t2, a0, 8
+ sll.d a3, t2, a7
+ or t0, a3, t0
+
+L(last_cmp):
+ ld.d t1, a1, 8
+
+ li.d t7, -1
+ sll.d t2, t7, a4
+ sub.d t3, t0, t1
+ andn t6, t3, t2
+
+ bnez t6, L(count_diff)
+ move a0, zero
+ jr ra
+END(MEMCMP_NAME)
+
+libc_hidden_builtin_def (MEMCMP_NAME)
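
L(count_diff) above finds the first differing byte of a 64-bit pair and turns it into the signed memcmp result; little-endian load order is what makes a trailing-zero count the right primitive. The same logic in C, as a sketch (`swapped` mirrors the a5 flag recording whether the operands were exchanged for alignment):

#include <stdint.h>

/* First-difference result for two little-endian 64-bit loads known to
   differ.  ctz of A ^ B and of A - B select the same lowest differing
   bit; rounding down to a multiple of 8 isolates the byte lane, as
   bstrins.d does above.  */
static inline int
count_diff (uint64_t a, uint64_t b, int swapped)
{
  unsigned int shift = __builtin_ctzll (a ^ b) & ~7u;
  int d = (int) ((a >> shift) & 0xff) - (int) ((b >> shift) & 0xff);
  return swapped ? -d : d;
}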
diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S b/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S
new file mode 100644
index 00000000..3151a179
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S
@@ -0,0 +1,207 @@
+/* Optimized memcmp implementation using LoongArch LASX instructions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) && !defined __loongarch_soft_float
+
+# define MEMCMP __memcmp_lasx
+
+LEAF(MEMCMP, 6)
+ li.d t2, 32
+ add.d a3, a0, a2
+ add.d a4, a1, a2
+ bgeu t2, a2, L(less32)
+
+ li.d t1, 160
+ bgeu a2, t1, L(make_aligned)
+L(loop32):
+ xvld xr0, a0, 0
+ xvld xr1, a1, 0
+
+ addi.d a0, a0, 32
+ addi.d a1, a1, 32
+ addi.d a2, a2, -32
+ xvseq.b xr2, xr0, xr1
+
+ xvsetanyeqz.b fcc0, xr2
+ bcnez fcc0, L(end)
+L(last_bytes):
+ bltu t2, a2, L(loop32)
+ xvld xr0, a3, -32
+
+
+ xvld xr1, a4, -32
+ xvseq.b xr2, xr0, xr1
+L(end):
+ xvmsknz.b xr2, xr2
+ xvpermi.q xr4, xr0, 1
+
+ xvpickve.w xr3, xr2, 4
+ xvpermi.q xr5, xr1, 1
+ vilvl.h vr2, vr3, vr2
+ movfr2gr.s t0, fa2
+
+ cto.w t0, t0
+ vreplgr2vr.b vr2, t0
+ vshuf.b vr0, vr4, vr0, vr2
+ vshuf.b vr1, vr5, vr1, vr2
+
+ vpickve2gr.bu t0, vr0, 0
+ vpickve2gr.bu t1, vr1, 0
+ sub.d a0, t0, t1
+ jr ra
+
+
+L(less32):
+ srli.d t0, a2, 4
+ beqz t0, L(less16)
+ vld vr0, a0, 0
+ vld vr1, a1, 0
+
+ vld vr2, a3, -16
+ vld vr3, a4, -16
+L(short_ret):
+ vseq.b vr4, vr0, vr1
+ vseq.b vr5, vr2, vr3
+
+ vmsknz.b vr4, vr4
+ vmsknz.b vr5, vr5
+ vilvl.h vr4, vr5, vr4
+ movfr2gr.s t0, fa4
+
+ cto.w t0, t0
+ vreplgr2vr.b vr4, t0
+ vshuf.b vr0, vr2, vr0, vr4
+ vshuf.b vr1, vr3, vr1, vr4
+
+
+ vpickve2gr.bu t0, vr0, 0
+ vpickve2gr.bu t1, vr1, 0
+ sub.d a0, t0, t1
+ jr ra
+
+L(less16):
+ srli.d t0, a2, 3
+ beqz t0, L(less8)
+ vldrepl.d vr0, a0, 0
+ vldrepl.d vr1, a1, 0
+
+ vldrepl.d vr2, a3, -8
+ vldrepl.d vr3, a4, -8
+ b L(short_ret)
+ nop
+
+L(less8):
+ srli.d t0, a2, 2
+ beqz t0, L(less4)
+ vldrepl.w vr0, a0, 0
+ vldrepl.w vr1, a1, 0
+
+
+ vldrepl.w vr2, a3, -4
+ vldrepl.w vr3, a4, -4
+ b L(short_ret)
+ nop
+
+L(less4):
+ srli.d t0, a2, 1
+ beqz t0, L(less2)
+ vldrepl.h vr0, a0, 0
+ vldrepl.h vr1, a1, 0
+
+ vldrepl.h vr2, a3, -2
+ vldrepl.h vr3, a4, -2
+ b L(short_ret)
+ nop
+
+L(less2):
+ beqz a2, L(ret0)
+ ld.bu t0, a0, 0
+ ld.bu t1, a1, 0
+ sub.d a0, t0, t1
+
+ jr ra
+L(ret0):
+ move a0, zero
+ jr ra
+
+L(make_aligned):
+ xvld xr0, a0, 0
+
+ xvld xr1, a1, 0
+ xvseq.b xr2, xr0, xr1
+ xvsetanyeqz.b fcc0, xr2
+ bcnez fcc0, L(end)
+
+ andi t0, a0, 0x1f
+ sub.d t0, t2, t0
+ sub.d t1, a2, t0
+ add.d a0, a0, t0
+
+ add.d a1, a1, t0
+ andi a2, t1, 0x3f
+ sub.d t0, t1, a2
+ add.d a5, a0, t0
+
+
+L(loop_align):
+ xvld xr0, a0, 0
+ xvld xr1, a1, 0
+ xvld xr2, a0, 32
+ xvld xr3, a1, 32
+
+ xvseq.b xr0, xr0, xr1
+ xvseq.b xr1, xr2, xr3
+ xvmin.bu xr2, xr1, xr0
+ xvsetanyeqz.b fcc0, xr2
+
+ bcnez fcc0, L(pair_end)
+ addi.d a0, a0, 64
+ addi.d a1, a1, 64
+ bne a0, a5, L(loop_align)
+
+ bnez a2, L(last_bytes)
+ move a0, zero
+ jr ra
+ nop
+
+
+L(pair_end):
+ xvmsknz.b xr0, xr0
+ xvmsknz.b xr1, xr1
+ xvpickve.w xr2, xr0, 4
+ xvpickve.w xr3, xr1, 4
+
+ vilvl.h vr0, vr2, vr0
+ vilvl.h vr1, vr3, vr1
+ vilvl.w vr0, vr1, vr0
+ movfr2gr.d t0, fa0
+
+ cto.d t0, t0
+ ldx.bu t1, a0, t0
+ ldx.bu t2, a1, t0
+ sub.d a0, t1, t2
+
+ jr ra
+END(MEMCMP)
+
+libc_hidden_builtin_def (MEMCMP)
+#endif
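
The cto.w/vshuf.b ending above avoids a memory round-trip: vseq.b leaves all-ones in equal lanes, the mask therefore has trailing ones up to the first mismatch, and a count-trailing-ones gives the index at which to extract one byte from each vector. In scalar form, as a sketch:

#include <stdint.h>

/* EQMASK has bit I set where the 32-byte vectors matched; assumes at
   least one mismatch, as the surrounding code guarantees.  cto (count
   trailing ones) is ctz of the complement.  */
static inline int
tail_result (uint32_t eqmask, const unsigned char *a, const unsigned char *b)
{
  unsigned int i = __builtin_ctz (~eqmask);
  return a[i] - b[i];
}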
diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S
new file mode 100644
index 00000000..38a50a4c
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S
@@ -0,0 +1,269 @@
+/* Optimized memcmp implementation using LoongArch LSX instructions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) && !defined __loongarch_soft_float
+
+#define MEMCMP __memcmp_lsx
+
+LEAF(MEMCMP, 6)
+ beqz a2, L(out)
+ pcalau12i t0, %pc_hi20(L(INDEX))
+ andi a3, a0, 0xf
+ vld vr5, t0, %pc_lo12(L(INDEX))
+
+ andi a4, a1, 0xf
+ bne a3, a4, L(unaligned)
+ bstrins.d a0, zero, 3, 0
+ xor a1, a1, a4
+
+ vld vr0, a0, 0
+ vld vr1, a1, 0
+ li.d t0, 16
+ vreplgr2vr.b vr3, a3
+
+ sub.d t1, t0, a3
+ vadd.b vr3, vr3, vr5
+ vshuf.b vr0, vr3, vr0, vr3
+ vshuf.b vr1, vr3, vr1, vr3
+
+
+ vseq.b vr4, vr0, vr1
+ bgeu t1, a2, L(al_end)
+ vsetanyeqz.b fcc0, vr4
+ bcnez fcc0, L(al_found)
+
+ sub.d t1, a2, t1
+ andi a2, t1, 31
+ beq a2, t1, L(al_less_32bytes)
+ sub.d t2, t1, a2
+
+ add.d a4, a0, t2
+L(al_loop):
+ vld vr0, a0, 16
+ vld vr1, a1, 16
+ vld vr2, a0, 32
+
+ vld vr3, a1, 32
+ addi.d a0, a0, 32
+ addi.d a1, a1, 32
+ vseq.b vr4, vr0, vr1
+
+
+ vseq.b vr6, vr2, vr3
+ vand.v vr6, vr4, vr6
+ vsetanyeqz.b fcc0, vr6
+ bcnez fcc0, L(al_pair_end)
+
+ bne a0, a4, L(al_loop)
+L(al_less_32bytes):
+ bgeu t0, a2, L(al_less_16bytes)
+ vld vr0, a0, 16
+ vld vr1, a1, 16
+
+ vld vr2, a0, 32
+ vld vr3, a1, 32
+ addi.d a2, a2, -16
+ vreplgr2vr.b vr6, a2
+
+ vslt.b vr5, vr5, vr6
+ vseq.b vr4, vr0, vr1
+ vseq.b vr6, vr2, vr3
+ vorn.v vr6, vr6, vr5
+
+
+L(al_pair_end):
+ vsetanyeqz.b fcc0, vr4
+ bcnez fcc0, L(al_found)
+ vnori.b vr4, vr6, 0
+ vfrstpi.b vr4, vr4, 0
+
+ vshuf.b vr0, vr2, vr2, vr4
+ vshuf.b vr1, vr3, vr3, vr4
+ vpickve2gr.bu t0, vr0, 0
+ vpickve2gr.bu t1, vr1, 0
+
+ sub.d a0, t0, t1
+ jr ra
+ nop
+ nop
+
+L(al_less_16bytes):
+ beqz a2, L(out)
+ vld vr0, a0, 16
+ vld vr1, a1, 16
+ vseq.b vr4, vr0, vr1
+
+
+L(al_end):
+ vreplgr2vr.b vr6, a2
+ vslt.b vr5, vr5, vr6
+ vorn.v vr4, vr4, vr5
+ nop
+
+L(al_found):
+ vnori.b vr4, vr4, 0
+ vfrstpi.b vr4, vr4, 0
+ vshuf.b vr0, vr0, vr0, vr4
+ vshuf.b vr1, vr1, vr1, vr4
+
+ vpickve2gr.bu t0, vr0, 0
+ vpickve2gr.bu t1, vr1, 0
+ sub.d a0, t0, t1
+ jr ra
+
+L(out):
+ move a0, zero
+ jr ra
+ nop
+ nop
+
+
+L(unaligned):
+ xor t2, a0, a1
+ sltu a5, a3, a4
+ masknez t2, t2, a5
+ xor a0, a0, t2
+
+ xor a1, a1, t2
+ andi a3, a0, 0xf
+ andi a4, a1, 0xf
+ bstrins.d a0, zero, 3, 0
+
+ xor a1, a1, a4
+ vld vr4, a0, 0
+ vld vr1, a1, 0
+ li.d t0, 16
+
+ vreplgr2vr.b vr2, a4
+ sub.d a6, a4, a3
+ sub.d t1, t0, a4
+ sub.d t2, t0, a6
+
+
+ vadd.b vr2, vr2, vr5
+ vreplgr2vr.b vr6, t2
+ vadd.b vr6, vr6, vr5
+ vshuf.b vr0, vr4, vr4, vr6
+
+ vshuf.b vr1, vr2, vr1, vr2
+ vshuf.b vr0, vr2, vr0, vr2
+ vseq.b vr7, vr0, vr1
+ bgeu t1, a2, L(un_end)
+
+ vsetanyeqz.b fcc0, vr7
+ bcnez fcc0, L(un_found)
+ sub.d a2, a2, t1
+ andi t1, a2, 31
+
+ beq a2, t1, L(un_less_32bytes)
+ sub.d t2, a2, t1
+ move a2, t1
+ add.d a4, a1, t2
+
+
+L(un_loop):
+ vld vr2, a0, 16
+ vld vr1, a1, 16
+ vld vr3, a1, 32
+ addi.d a1, a1, 32
+
+ addi.d a0, a0, 32
+ vshuf.b vr0, vr2, vr4, vr6
+ vld vr4, a0, 0
+ vseq.b vr7, vr0, vr1
+
+ vshuf.b vr2, vr4, vr2, vr6
+ vseq.b vr8, vr2, vr3
+ vand.v vr8, vr7, vr8
+ vsetanyeqz.b fcc0, vr8
+
+ bcnez fcc0, L(un_pair_end)
+ bne a1, a4, L(un_loop)
+
+L(un_less_32bytes):
+ bltu a2, t0, L(un_less_16bytes)
+ vld vr2, a0, 16
+ vld vr1, a1, 16
+ addi.d a0, a0, 16
+
+ addi.d a1, a1, 16
+ addi.d a2, a2, -16
+ vshuf.b vr0, vr2, vr4, vr6
+ vor.v vr4, vr2, vr2
+
+ vseq.b vr7, vr0, vr1
+ vsetanyeqz.b fcc0, vr7
+ bcnez fcc0, L(un_found)
+L(un_less_16bytes):
+ beqz a2, L(out)
+ vld vr1, a1, 16
+ bgeu a6, a2, 1f
+
+ vld vr2, a0, 16
+1:
+ vshuf.b vr0, vr2, vr4, vr6
+ vseq.b vr7, vr0, vr1
+L(un_end):
+ vreplgr2vr.b vr3, a2
+
+
+ vslt.b vr3, vr5, vr3
+ vorn.v vr7, vr7, vr3
+
+L(un_found):
+ vnori.b vr7, vr7, 0
+ vfrstpi.b vr7, vr7, 0
+
+ vshuf.b vr0, vr0, vr0, vr7
+ vshuf.b vr1, vr1, vr1, vr7
+L(calc_result):
+ vpickve2gr.bu t0, vr0, 0
+ vpickve2gr.bu t1, vr1, 0
+
+ sub.d t2, t0, t1
+ sub.d t3, t1, t0
+ masknez t0, t3, a5
+ maskeqz t1, t2, a5
+
+ or a0, t0, t1
+ jr ra
+L(un_pair_end):
+ vsetanyeqz.b fcc0, vr7
+ bcnez fcc0, L(un_found)
+
+
+ vnori.b vr7, vr8, 0
+ vfrstpi.b vr7, vr7, 0
+ vshuf.b vr0, vr2, vr2, vr7
+ vshuf.b vr1, vr3, vr3, vr7
+
+ b L(calc_result)
+END(MEMCMP)
+
+ .section .rodata.cst16,"M",@progbits,16
+ .align 4
+L(INDEX):
+ .dword 0x0706050403020100
+ .dword 0x0f0e0d0c0b0a0908
+
+libc_hidden_builtin_def (MEMCMP)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp.c b/sysdeps/loongarch/lp64/multiarch/memcmp.c
new file mode 100644
index 00000000..32eccac2
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/memcmp.c
@@ -0,0 +1,43 @@
+/* Multiple versions of memcmp.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# define memcmp __redirect_memcmp
+# include <string.h>
+# undef memcmp
+
+# define SYMBOL_NAME memcmp
+# include "ifunc-memcmp.h"
+
+libc_ifunc_redirected (__redirect_memcmp, memcmp,
+ IFUNC_SELECTOR ());
+# undef bcmp
+weak_alias (memcmp, bcmp)
+
+# undef __memcmpeq
+strong_alias (memcmp, __memcmpeq)
+libc_hidden_def (__memcmpeq)
+
+# ifdef SHARED
+__hidden_ver1 (memcmp, __GI_memcmp, __redirect_memcmp)
+ __attribute__ ((visibility ("hidden"))) __attribute_copy__ (memcmp);
+# endif
+
+#endif
--
2.33.0


@ -0,0 +1,417 @@
From c4c272fb8067364530a2a78df92c37403acc963f Mon Sep 17 00:00:00 2001
From: dengjianbo <dengjianbo@loongson.cn>
Date: Mon, 28 Aug 2023 10:08:37 +0800
Subject: [PATCH 16/29] LoongArch: Add ifunc support for memrchr{lsx, lasx}
According to glibc memrchr microbenchmark, this implementation could reduce
the runtime as follows:
Name Percent of runtime reduced
memrchr-lasx 20%-83%
memrchr-lsx 20%-64%
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
sysdeps/loongarch/lp64/multiarch/Makefile | 3 +
.../lp64/multiarch/ifunc-impl-list.c | 8 ++
.../loongarch/lp64/multiarch/ifunc-memrchr.h | 40 ++++++
.../lp64/multiarch/memrchr-generic.c | 23 ++++
.../loongarch/lp64/multiarch/memrchr-lasx.S | 123 ++++++++++++++++++
.../loongarch/lp64/multiarch/memrchr-lsx.S | 105 +++++++++++++++
sysdeps/loongarch/lp64/multiarch/memrchr.c | 33 +++++
7 files changed, 335 insertions(+)
create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-memrchr.h
create mode 100644 sysdeps/loongarch/lp64/multiarch/memrchr-generic.c
create mode 100644 sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/memrchr.c
diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile
index 2f4802cf..7b87bc90 100644
--- a/sysdeps/loongarch/lp64/multiarch/Makefile
+++ b/sysdeps/loongarch/lp64/multiarch/Makefile
@@ -27,5 +27,8 @@ sysdep_routines += \
memchr-aligned \
memchr-lsx \
memchr-lasx \
+ memrchr-generic \
+ memrchr-lsx \
+ memrchr-lasx \
# sysdep_routines
endif
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
index a567b9cf..8bd5489e 100644
--- a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
@@ -109,5 +109,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
#endif
IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_aligned)
)
+
+ IFUNC_IMPL (i, name, memrchr,
+#if !defined __loongarch_soft_float
+ IFUNC_IMPL_ADD (array, i, memrchr, SUPPORT_LASX, __memrchr_lasx)
+ IFUNC_IMPL_ADD (array, i, memrchr, SUPPORT_LSX, __memrchr_lsx)
+#endif
+ IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_generic)
+ )
return i;
}
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-memrchr.h b/sysdeps/loongarch/lp64/multiarch/ifunc-memrchr.h
new file mode 100644
index 00000000..8215f9ad
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-memrchr.h
@@ -0,0 +1,40 @@
+/* Common definition for memrchr implementation.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <ldsodefs.h>
+#include <ifunc-init.h>
+
+#if !defined __loongarch_soft_float
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lasx) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden;
+#endif
+extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+#if !defined __loongarch_soft_float
+ if (SUPPORT_LASX)
+ return OPTIMIZE (lasx);
+ else if (SUPPORT_LSX)
+ return OPTIMIZE (lsx);
+ else
+#endif
+ return OPTIMIZE (generic);
+}
diff --git a/sysdeps/loongarch/lp64/multiarch/memrchr-generic.c b/sysdeps/loongarch/lp64/multiarch/memrchr-generic.c
new file mode 100644
index 00000000..ced61ebc
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/memrchr-generic.c
@@ -0,0 +1,23 @@
+/* Generic implementation of memrchr.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+# define MEMRCHR __memrchr_generic
+#endif
+
+#include <string/memrchr.c>
diff --git a/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S
new file mode 100644
index 00000000..5f3e0d06
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/memrchr-lasx.S
@@ -0,0 +1,123 @@
+/* Optimized memrchr implementation using LoongArch LASX instructions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) && !defined __loongarch_soft_float
+
+#ifndef MEMRCHR
+# define MEMRCHR __memrchr_lasx
+#endif
+
+LEAF(MEMRCHR, 6)
+ beqz a2, L(ret0)
+ addi.d a2, a2, -1
+ add.d a3, a0, a2
+ andi t1, a3, 0x3f
+
+ bstrins.d a3, zero, 5, 0
+ addi.d t1, t1, 1
+ xvld xr0, a3, 0
+ xvld xr1, a3, 32
+
+ sub.d t2, zero, t1
+ li.d t3, -1
+ xvreplgr2vr.b xr2, a1
+ andi t4, a0, 0x3f
+
+ srl.d t2, t3, t2
+ xvseq.b xr0, xr0, xr2
+ xvseq.b xr1, xr1, xr2
+ xvmsknz.b xr0, xr0
+
+
+ xvmsknz.b xr1, xr1
+ xvpickve.w xr3, xr0, 4
+ xvpickve.w xr4, xr1, 4
+ vilvl.h vr0, vr3, vr0
+
+ vilvl.h vr1, vr4, vr1
+ vilvl.w vr0, vr1, vr0
+ movfr2gr.d t0, fa0
+ and t0, t0, t2
+
+ bltu a2, t1, L(end)
+ bnez t0, L(found)
+ bstrins.d a0, zero, 5, 0
+L(loop):
+ xvld xr0, a3, -64
+
+ xvld xr1, a3, -32
+ addi.d a3, a3, -64
+ xvseq.b xr0, xr0, xr2
+ xvseq.b xr1, xr1, xr2
+
+
+ beq a0, a3, L(out)
+ xvmax.bu xr3, xr0, xr1
+ xvseteqz.v fcc0, xr3
+ bcnez fcc0, L(loop)
+
+ xvmsknz.b xr0, xr0
+ xvmsknz.b xr1, xr1
+ xvpickve.w xr3, xr0, 4
+ xvpickve.w xr4, xr1, 4
+
+ vilvl.h vr0, vr3, vr0
+ vilvl.h vr1, vr4, vr1
+ vilvl.w vr0, vr1, vr0
+ movfr2gr.d t0, fa0
+
+L(found):
+ addi.d a0, a3, 63
+ clz.d t1, t0
+ sub.d a0, a0, t1
+ jr ra
+
+
+L(out):
+ xvmsknz.b xr0, xr0
+ xvmsknz.b xr1, xr1
+ xvpickve.w xr3, xr0, 4
+ xvpickve.w xr4, xr1, 4
+
+ vilvl.h vr0, vr3, vr0
+ vilvl.h vr1, vr4, vr1
+ vilvl.w vr0, vr1, vr0
+ movfr2gr.d t0, fa0
+
+L(end):
+ sll.d t2, t3, t4
+ and t0, t0, t2
+ addi.d a0, a3, 63
+ clz.d t1, t0
+
+ sub.d a0, a0, t1
+ maskeqz a0, a0, t0
+ jr ra
+L(ret0):
+ move a0, zero
+
+
+ jr ra
+END(MEMRCHR)
+
+libc_hidden_builtin_def (MEMRCHR)
+#endif
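
Because memrchr scans backwards, the index math flips from ctz to clz: with the per-byte match mask in t0 for a 64-byte block, the last match sits at block + 63 - clz (mask), which is exactly the addi.d a0, a3, 63 / clz.d / sub.d sequence in L(found) above. As a sketch:

#include <stdint.h>

/* Last matching byte within a 64-byte block, given a nonzero per-byte
   match mask (bit I set when block[I] matched).  */
static inline const unsigned char *
last_match (const unsigned char *block, uint64_t mask)
{
  return block + 63 - __builtin_clzll (mask);
}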
diff --git a/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S
new file mode 100644
index 00000000..39a7c8b0
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/memrchr-lsx.S
@@ -0,0 +1,105 @@
+/* Optimized memrchr implementation using LoongArch LSX instructions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) && !defined __loongarch_soft_float
+
+# define MEMRCHR __memrchr_lsx
+
+LEAF(MEMRCHR, 6)
+ beqz a2, L(ret0)
+ addi.d a2, a2, -1
+ add.d a3, a0, a2
+ andi t1, a3, 0x1f
+
+ bstrins.d a3, zero, 4, 0
+ addi.d t1, t1, 1
+ vld vr0, a3, 0
+ vld vr1, a3, 16
+
+ sub.d t2, zero, t1
+ li.d t3, -1
+ vreplgr2vr.b vr2, a1
+ andi t4, a0, 0x1f
+
+ srl.d t2, t3, t2
+ vseq.b vr0, vr0, vr2
+ vseq.b vr1, vr1, vr2
+ vmsknz.b vr0, vr0
+
+
+ vmsknz.b vr1, vr1
+ vilvl.h vr0, vr1, vr0
+ movfr2gr.s t0, fa0
+ and t0, t0, t2
+
+ bltu a2, t1, L(end)
+ bnez t0, L(found)
+ bstrins.d a0, zero, 4, 0
+L(loop):
+ vld vr0, a3, -32
+
+ vld vr1, a3, -16
+ addi.d a3, a3, -32
+ vseq.b vr0, vr0, vr2
+ vseq.b vr1, vr1, vr2
+
+ beq a0, a3, L(out)
+ vmax.bu vr3, vr0, vr1
+ vseteqz.v fcc0, vr3
+ bcnez fcc0, L(loop)
+
+
+ vmsknz.b vr0, vr0
+ vmsknz.b vr1, vr1
+ vilvl.h vr0, vr1, vr0
+ movfr2gr.s t0, fa0
+
+L(found):
+ addi.d a0, a3, 31
+ clz.w t1, t0
+ sub.d a0, a0, t1
+ jr ra
+
+L(out):
+ vmsknz.b vr0, vr0
+ vmsknz.b vr1, vr1
+ vilvl.h vr0, vr1, vr0
+ movfr2gr.s t0, fa0
+
+L(end):
+ sll.d t2, t3, t4
+ and t0, t0, t2
+ addi.d a0, a3, 31
+ clz.w t1, t0
+
+
+ sub.d a0, a0, t1
+ maskeqz a0, a0, t0
+ jr ra
+L(ret0):
+ move a0, zero
+
+ jr ra
+END(MEMRCHR)
+
+libc_hidden_builtin_def (MEMRCHR)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/memrchr.c b/sysdeps/loongarch/lp64/multiarch/memrchr.c
new file mode 100644
index 00000000..8baba9ab
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/memrchr.c
@@ -0,0 +1,33 @@
+/* Multiple versions of memrchr.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# define memrchr __redirect_memrchr
+# include <string.h>
+# undef memrchr
+
+# define SYMBOL_NAME memrchr
+# include "ifunc-memrchr.h"
+
+libc_ifunc_redirected (__redirect_memrchr, __memrchr, IFUNC_SELECTOR ());
+libc_hidden_def (__memrchr)
+weak_alias (__memrchr, memrchr)
+
+#endif
--
2.33.0


@ -0,0 +1,784 @@
From 14032f7bbe18443af8492f5d0365f72b76701673 Mon Sep 17 00:00:00 2001
From: dengjianbo <dengjianbo@loongson.cn>
Date: Mon, 28 Aug 2023 10:08:38 +0800
Subject: [PATCH 17/29] LoongArch: Add ifunc support for memset{aligned,
unaligned, lsx, lasx}
According to glibc memset microbenchmark test results, for the LSX and LASX
versions, a few cases with length less than 8 experience performance
degradation; overall, the LASX version could reduce the runtime by about
15%-75% and the LSX version by about 15%-50%.
The unaligned version uses unaligned memory access to set data whose
length is less than 64 and to make the address 8-byte aligned. For this part,
the performance is better than the aligned version. Compared with the generic
version, the performance is close when the length is larger than 128. When
the length is 8-128, the unaligned version could reduce the runtime by about
30%-70%, and the aligned version by about 20%-50%.
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
sysdeps/loongarch/lp64/multiarch/Makefile | 4 +
.../lp64/multiarch/dl-symbol-redir-ifunc.h | 24 +++
.../lp64/multiarch/ifunc-impl-list.c | 10 +
.../loongarch/lp64/multiarch/memset-aligned.S | 174 ++++++++++++++++++
.../loongarch/lp64/multiarch/memset-lasx.S | 142 ++++++++++++++
sysdeps/loongarch/lp64/multiarch/memset-lsx.S | 135 ++++++++++++++
.../lp64/multiarch/memset-unaligned.S | 162 ++++++++++++++++
sysdeps/loongarch/lp64/multiarch/memset.c | 37 ++++
8 files changed, 688 insertions(+)
create mode 100644 sysdeps/loongarch/lp64/multiarch/dl-symbol-redir-ifunc.h
create mode 100644 sysdeps/loongarch/lp64/multiarch/memset-aligned.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/memset-lasx.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/memset-lsx.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/memset-unaligned.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/memset.c
diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile
index 7b87bc90..216886c5 100644
--- a/sysdeps/loongarch/lp64/multiarch/Makefile
+++ b/sysdeps/loongarch/lp64/multiarch/Makefile
@@ -30,5 +30,9 @@ sysdep_routines += \
memrchr-generic \
memrchr-lsx \
memrchr-lasx \
+ memset-aligned \
+ memset-unaligned \
+ memset-lsx \
+ memset-lasx \
# sysdep_routines
endif
diff --git a/sysdeps/loongarch/lp64/multiarch/dl-symbol-redir-ifunc.h b/sysdeps/loongarch/lp64/multiarch/dl-symbol-redir-ifunc.h
new file mode 100644
index 00000000..e2723873
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/dl-symbol-redir-ifunc.h
@@ -0,0 +1,24 @@
+/* Symbol redirection for loader/static initialization code.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _DL_IFUNC_GENERIC_H
+#define _DL_IFUNC_GENERIC_H
+
+asm ("memset = __memset_aligned");
+
+#endif
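
The asm alias above exists because loader and static-initialization code can call memset before ifunc resolvers have run; binding those calls to the baseline implementation sidesteps the problem. The aliasing trick in isolation, as a sketch with hypothetical names:

/* Every reference to set_bytes assembled in this translation unit binds
   directly to set_bytes_baseline -- no PLT entry, no ifunc resolution.  */
asm ("set_bytes = set_bytes_baseline");

void
set_bytes_baseline (unsigned char *p, unsigned char c, unsigned long n)
{
  while (n--)
    *p++ = c;
}

/* Callers only ever see the plain declaration.  */
void set_bytes (unsigned char *p, unsigned char c, unsigned long n);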
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
index 8bd5489e..37f60dde 100644
--- a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
@@ -117,5 +117,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
#endif
IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_generic)
)
+
+ IFUNC_IMPL (i, name, memset,
+#if !defined __loongarch_soft_float
+ IFUNC_IMPL_ADD (array, i, memset, SUPPORT_LASX, __memset_lasx)
+ IFUNC_IMPL_ADD (array, i, memset, SUPPORT_LSX, __memset_lsx)
+#endif
+ IFUNC_IMPL_ADD (array, i, memset, SUPPORT_UAL, __memset_unaligned)
+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_aligned)
+ )
+
return i;
}
diff --git a/sysdeps/loongarch/lp64/multiarch/memset-aligned.S b/sysdeps/loongarch/lp64/multiarch/memset-aligned.S
new file mode 100644
index 00000000..1fce95b7
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/memset-aligned.S
@@ -0,0 +1,174 @@
+/* Optimized memset aligned implementation using basic LoongArch instructions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc)
+# define MEMSET_NAME __memset_aligned
+#else
+# define MEMSET_NAME memset
+#endif
+
+LEAF(MEMSET_NAME, 6)
+ move t0, a0
+ andi a3, a0, 0x7
+ li.w t6, 16
+ beqz a3, L(align)
+ bltu a2, t6, L(short_data)
+
+L(make_align):
+ li.w t8, 8
+ sub.d t2, t8, a3
+ pcaddi t1, 11
+ slli.d t3, t2, 2
+ sub.d t1, t1, t3
+ jr t1
+
+L(al7):
+ st.b a1, t0, 6
+L(al6):
+ st.b a1, t0, 5
+L(al5):
+ st.b a1, t0, 4
+L(al4):
+ st.b a1, t0, 3
+L(al3):
+ st.b a1, t0, 2
+L(al2):
+ st.b a1, t0, 1
+L(al1):
+ st.b a1, t0, 0
+L(al0):
+ add.d t0, t0, t2
+ sub.d a2, a2, t2
+
+L(align):
+ bstrins.d a1, a1, 15, 8
+ bstrins.d a1, a1, 31, 16
+ bstrins.d a1, a1, 63, 32
+ bltu a2, t6, L(less_16bytes)
+
+ andi a4, a2, 0x3f
+ beq a4, a2, L(less_64bytes)
+
+ sub.d t1, a2, a4
+ move a2, a4
+ add.d a5, t0, t1
+
+L(loop_64bytes):
+ addi.d t0, t0, 64
+ st.d a1, t0, -64
+ st.d a1, t0, -56
+ st.d a1, t0, -48
+ st.d a1, t0, -40
+
+ st.d a1, t0, -32
+ st.d a1, t0, -24
+ st.d a1, t0, -16
+ st.d a1, t0, -8
+ bne t0, a5, L(loop_64bytes)
+
+L(less_64bytes):
+ srai.d a4, a2, 5
+ beqz a4, L(less_32bytes)
+ addi.d a2, a2, -32
+ st.d a1, t0, 0
+
+ st.d a1, t0, 8
+ st.d a1, t0, 16
+ st.d a1, t0, 24
+ addi.d t0, t0, 32
+
+L(less_32bytes):
+ bltu a2, t6, L(less_16bytes)
+ addi.d a2, a2, -16
+ st.d a1, t0, 0
+ st.d a1, t0, 8
+ addi.d t0, t0, 16
+
+L(less_16bytes):
+ srai.d a4, a2, 3
+ beqz a4, L(less_8bytes)
+ addi.d a2, a2, -8
+ st.d a1, t0, 0
+ addi.d t0, t0, 8
+
+L(less_8bytes):
+ beqz a2, L(less_1byte)
+ srai.d a4, a2, 2
+ beqz a4, L(less_4bytes)
+ addi.d a2, a2, -4
+ st.w a1, t0, 0
+ addi.d t0, t0, 4
+
+L(less_4bytes):
+ srai.d a3, a2, 1
+ beqz a3, L(less_2bytes)
+ addi.d a2, a2, -2
+ st.h a1, t0, 0
+ addi.d t0, t0, 2
+
+L(less_2bytes):
+ beqz a2, L(less_1byte)
+ st.b a1, t0, 0
+L(less_1byte):
+ jr ra
+
+L(short_data):
+ pcaddi t1, 19
+ slli.d t3, a2, 2
+ sub.d t1, t1, t3
+ jr t1
+L(short_15):
+ st.b a1, a0, 14
+L(short_14):
+ st.b a1, a0, 13
+L(short_13):
+ st.b a1, a0, 12
+L(short_12):
+ st.b a1, a0, 11
+L(short_11):
+ st.b a1, a0, 10
+L(short_10):
+ st.b a1, a0, 9
+L(short_9):
+ st.b a1, a0, 8
+L(short_8):
+ st.b a1, a0, 7
+L(short_7):
+ st.b a1, a0, 6
+L(short_6):
+ st.b a1, a0, 5
+L(short_5):
+ st.b a1, a0, 4
+L(short_4):
+ st.b a1, a0, 3
+L(short_3):
+ st.b a1, a0, 2
+L(short_2):
+ st.b a1, a0, 1
+L(short_1):
+ st.b a1, a0, 0
+L(short_0):
+ jr ra
+END(MEMSET_NAME)
+
+libc_hidden_builtin_def (MEMSET_NAME)
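
L(short_data) above computes a jump target with pcaddi/jr and lands in a fall-through chain of byte stores, so a short fill takes one indirect branch and no loop. Its C analogue, as a sketch:

#include <stddef.h>

/* Branch-once byte fill for N < 16, mirroring the L(short_*) chain.  */
static void
memset_short (unsigned char *p, unsigned char c, size_t n)
{
  switch (n)
    {
    case 15: p[14] = c; /* fall through */
    case 14: p[13] = c; /* fall through */
    case 13: p[12] = c; /* fall through */
    case 12: p[11] = c; /* fall through */
    case 11: p[10] = c; /* fall through */
    case 10: p[9] = c;  /* fall through */
    case 9:  p[8] = c;  /* fall through */
    case 8:  p[7] = c;  /* fall through */
    case 7:  p[6] = c;  /* fall through */
    case 6:  p[5] = c;  /* fall through */
    case 5:  p[4] = c;  /* fall through */
    case 4:  p[3] = c;  /* fall through */
    case 3:  p[2] = c;  /* fall through */
    case 2:  p[1] = c;  /* fall through */
    case 1:  p[0] = c;  /* fall through */
    case 0:  break;
    }
}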
diff --git a/sysdeps/loongarch/lp64/multiarch/memset-lasx.S b/sysdeps/loongarch/lp64/multiarch/memset-lasx.S
new file mode 100644
index 00000000..041abbac
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/memset-lasx.S
@@ -0,0 +1,142 @@
+/* Optimized memset implementation using LoongArch LASX instructions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) && !defined __loongarch_soft_float
+
+# define MEMSET __memset_lasx
+
+LEAF(MEMSET, 6)
+ li.d t1, 32
+ move a3, a0
+ xvreplgr2vr.b xr0, a1
+ add.d a4, a0, a2
+
+ bgeu t1, a2, L(less_32bytes)
+ li.d t3, 128
+ li.d t2, 64
+ blt t3, a2, L(long_bytes)
+
+L(less_128bytes):
+ bgeu t2, a2, L(less_64bytes)
+ xvst xr0, a3, 0
+ xvst xr0, a3, 32
+ xvst xr0, a4, -32
+
+ xvst xr0, a4, -64
+ jr ra
+L(less_64bytes):
+ xvst xr0, a3, 0
+ xvst xr0, a4, -32
+
+
+ jr ra
+L(less_32bytes):
+ srli.d t0, a2, 4
+ beqz t0, L(less_16bytes)
+ vst vr0, a3, 0
+
+ vst vr0, a4, -16
+ jr ra
+L(less_16bytes):
+ srli.d t0, a2, 3
+ beqz t0, L(less_8bytes)
+
+ vstelm.d vr0, a3, 0, 0
+ vstelm.d vr0, a4, -8, 0
+ jr ra
+L(less_8bytes):
+ srli.d t0, a2, 2
+
+ beqz t0, L(less_4bytes)
+ vstelm.w vr0, a3, 0, 0
+ vstelm.w vr0, a4, -4, 0
+ jr ra
+
+
+L(less_4bytes):
+ srli.d t0, a2, 1
+ beqz t0, L(less_2bytes)
+ vstelm.h vr0, a3, 0, 0
+ vstelm.h vr0, a4, -2, 0
+
+ jr ra
+L(less_2bytes):
+ beqz a2, L(less_1bytes)
+ st.b a1, a3, 0
+L(less_1bytes):
+ jr ra
+
+L(long_bytes):
+ xvst xr0, a3, 0
+ bstrins.d a3, zero, 4, 0
+ addi.d a3, a3, 32
+ sub.d a2, a4, a3
+
+ andi t0, a2, 0xff
+ beq t0, a2, L(long_end)
+ move a2, t0
+ sub.d t0, a4, t0
+
+
+L(loop_256):
+ xvst xr0, a3, 0
+ xvst xr0, a3, 32
+ xvst xr0, a3, 64
+ xvst xr0, a3, 96
+
+ xvst xr0, a3, 128
+ xvst xr0, a3, 160
+ xvst xr0, a3, 192
+ xvst xr0, a3, 224
+
+ addi.d a3, a3, 256
+ bne a3, t0, L(loop_256)
+L(long_end):
+ bltu a2, t3, L(end_less_128)
+ addi.d a2, a2, -128
+
+ xvst xr0, a3, 0
+ xvst xr0, a3, 32
+ xvst xr0, a3, 64
+ xvst xr0, a3, 96
+
+
+ addi.d a3, a3, 128
+L(end_less_128):
+ bltu a2, t2, L(end_less_64)
+ addi.d a2, a2, -64
+ xvst xr0, a3, 0
+
+ xvst xr0, a3, 32
+ addi.d a3, a3, 64
+L(end_less_64):
+ bltu a2, t1, L(end_less_32)
+ xvst xr0, a3, 0
+
+L(end_less_32):
+ xvst xr0, a4, -32
+ jr ra
+END(MEMSET)
+
+libc_hidden_builtin_def (MEMSET)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/memset-lsx.S b/sysdeps/loongarch/lp64/multiarch/memset-lsx.S
new file mode 100644
index 00000000..3d3982aa
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/memset-lsx.S
@@ -0,0 +1,135 @@
+/* Optimized memset implementation using LoongArch LSX instructions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) && !defined __loongarch_soft_float
+
+# define MEMSET __memset_lsx
+
+LEAF(MEMSET, 6)
+ li.d t1, 16
+ move a3, a0
+ vreplgr2vr.b vr0, a1
+ add.d a4, a0, a2
+
+ bgeu t1, a2, L(less_16bytes)
+ li.d t3, 64
+ li.d t2, 32
+ bgeu a2, t3, L(long_bytes)
+
+L(less_64bytes):
+ bgeu t2, a2, L(less_32bytes)
+ vst vr0, a3, 0
+ vst vr0, a3, 16
+ vst vr0, a4, -32
+
+ vst vr0, a4, -16
+ jr ra
+L(less_32bytes):
+ vst vr0, a3, 0
+ vst vr0, a4, -16
+
+
+ jr ra
+L(less_16bytes):
+ srli.d t0, a2, 3
+ beqz t0, L(less_8bytes)
+ vstelm.d vr0, a3, 0, 0
+
+ vstelm.d vr0, a4, -8, 0
+ jr ra
+L(less_8bytes):
+ srli.d t0, a2, 2
+ beqz t0, L(less_4bytes)
+
+ vstelm.w vr0, a3, 0, 0
+ vstelm.w vr0, a4, -4, 0
+ jr ra
+L(less_4bytes):
+ srli.d t0, a2, 1
+
+ beqz t0, L(less_2bytes)
+ vstelm.h vr0, a3, 0, 0
+ vstelm.h vr0, a4, -2, 0
+ jr ra
+
+
+L(less_2bytes):
+ beqz a2, L(less_1bytes)
+ vstelm.b vr0, a3, 0, 0
+L(less_1bytes):
+ jr ra
+L(long_bytes):
+ vst vr0, a3, 0
+
+ bstrins.d a3, zero, 3, 0
+ addi.d a3, a3, 16
+ sub.d a2, a4, a3
+ andi t0, a2, 0x7f
+
+ beq t0, a2, L(long_end)
+ move a2, t0
+ sub.d t0, a4, t0
+
+L(loop_128):
+ vst vr0, a3, 0
+
+ vst vr0, a3, 16
+ vst vr0, a3, 32
+ vst vr0, a3, 48
+ vst vr0, a3, 64
+
+
+ vst vr0, a3, 80
+ vst vr0, a3, 96
+ vst vr0, a3, 112
+ addi.d a3, a3, 128
+
+ bne a3, t0, L(loop_128)
+L(long_end):
+ bltu a2, t3, L(end_less_64)
+ addi.d a2, a2, -64
+ vst vr0, a3, 0
+
+ vst vr0, a3, 16
+ vst vr0, a3, 32
+ vst vr0, a3, 48
+ addi.d a3, a3, 64
+
+L(end_less_64):
+ bltu a2, t2, L(end_less_32)
+ addi.d a2, a2, -32
+ vst vr0, a3, 0
+ vst vr0, a3, 16
+
+ addi.d a3, a3, 32
+L(end_less_32):
+ bltu a2, t1, L(end_less_16)
+ vst vr0, a3, 0
+
+L(end_less_16):
+ vst vr0, a4, -16
+ jr ra
+END(MEMSET)
+
+libc_hidden_builtin_def (MEMSET)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S
new file mode 100644
index 00000000..f7d32039
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S
@@ -0,0 +1,162 @@
+/* Optimized memset unaligned implementation using basic LoongArch instructions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc)
+
+# define MEMSET_NAME __memset_unaligned
+
+#define ST_128(n) \
+ st.d a1, a0, n; \
+ st.d a1, a0, n+8 ; \
+ st.d a1, a0, n+16 ; \
+ st.d a1, a0, n+24 ; \
+ st.d a1, a0, n+32 ; \
+ st.d a1, a0, n+40 ; \
+ st.d a1, a0, n+48 ; \
+ st.d a1, a0, n+56 ; \
+ st.d a1, a0, n+64 ; \
+ st.d a1, a0, n+72 ; \
+ st.d a1, a0, n+80 ; \
+ st.d a1, a0, n+88 ; \
+ st.d a1, a0, n+96 ; \
+ st.d a1, a0, n+104; \
+ st.d a1, a0, n+112; \
+ st.d a1, a0, n+120;
+
+LEAF(MEMSET_NAME, 6)
+ bstrins.d a1, a1, 15, 8
+ add.d t7, a0, a2
+ bstrins.d a1, a1, 31, 16
+ move t0, a0
+
+ bstrins.d a1, a1, 63, 32
+ srai.d t8, a2, 4
+ beqz t8, L(less_16bytes)
+ srai.d t8, a2, 6
+
+ bnez t8, L(more_64bytes)
+ srai.d t8, a2, 5
+ beqz t8, L(less_32bytes)
+
+ st.d a1, a0, 0
+ st.d a1, a0, 8
+ st.d a1, a0, 16
+ st.d a1, a0, 24
+
+ st.d a1, t7, -32
+ st.d a1, t7, -24
+ st.d a1, t7, -16
+ st.d a1, t7, -8
+
+ jr ra
+
+L(less_32bytes):
+ st.d a1, a0, 0
+ st.d a1, a0, 8
+ st.d a1, t7, -16
+ st.d a1, t7, -8
+
+ jr ra
+
+L(less_16bytes):
+ srai.d t8, a2, 3
+ beqz t8, L(less_8bytes)
+ st.d a1, a0, 0
+ st.d a1, t7, -8
+
+ jr ra
+
+L(less_8bytes):
+ srai.d t8, a2, 2
+ beqz t8, L(less_4bytes)
+ st.w a1, a0, 0
+ st.w a1, t7, -4
+
+ jr ra
+
+L(less_4bytes):
+ srai.d t8, a2, 1
+ beqz t8, L(less_2bytes)
+ st.h a1, a0, 0
+ st.h a1, t7, -2
+
+ jr ra
+
+L(less_2bytes):
+ beqz a2, L(less_1bytes)
+ st.b a1, a0, 0
+
+ jr ra
+
+L(less_1bytes):
+ jr ra
+
+L(more_64bytes):
+ srli.d a0, a0, 3
+ slli.d a0, a0, 3
+ addi.d a0, a0, 0x8
+ st.d a1, t0, 0
+
+ sub.d t2, t0, a0
+ add.d a2, t2, a2
+ addi.d a2, a2, -0x80
+ blt a2, zero, L(end_unalign_proc)
+
+L(loop_less):
+ ST_128(0)
+ addi.d a0, a0, 0x80
+ addi.d a2, a2, -0x80
+ bge a2, zero, L(loop_less)
+
+L(end_unalign_proc):
+ addi.d a2, a2, 0x80
+ pcaddi t1, 20
+ andi t5, a2, 0x78
+ srli.d t5, t5, 1
+
+ sub.d t1, t1, t5
+ jr t1
+
+ st.d a1, a0, 112
+ st.d a1, a0, 104
+ st.d a1, a0, 96
+ st.d a1, a0, 88
+ st.d a1, a0, 80
+ st.d a1, a0, 72
+ st.d a1, a0, 64
+ st.d a1, a0, 56
+ st.d a1, a0, 48
+ st.d a1, a0, 40
+ st.d a1, a0, 32
+ st.d a1, a0, 24
+ st.d a1, a0, 16
+ st.d a1, a0, 8
+ st.d a1, a0, 0
+ st.d a1, t7, -8
+
+ move a0, t0
+ jr ra
+END(MEMSET_NAME)
+
+libc_hidden_builtin_def (MEMSET_NAME)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/memset.c b/sysdeps/loongarch/lp64/multiarch/memset.c
new file mode 100644
index 00000000..3ff60d8a
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/memset.c
@@ -0,0 +1,37 @@
+/* Multiple versions of memset.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# define memset __redirect_memset
+# include <string.h>
+# undef memset
+
+# define SYMBOL_NAME memset
+# include "ifunc-lasx.h"
+
+libc_ifunc_redirected (__redirect_memset, memset,
+ IFUNC_SELECTOR ());
+
+# ifdef SHARED
+__hidden_ver1 (memset, __GI_memset, __redirect_memset)
+ __attribute__ ((visibility ("hidden"))) __attribute_copy__ (memset);
+# endif
+
+#endif
--
2.33.0
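
A note on the scalar strategy in __memset_aligned above: the four bstrins.d instructions splat the fill byte across a 64-bit register, after which the bulk of the buffer is written with st.d doubleword stores. A minimal C sketch of that splat-and-store idea (an illustration of the technique only, not the glibc code; the assembly additionally aligns the destination first and dispatches head/tail bytes through computed branch tables):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void *
memset_sketch (void *dst, int c, size_t n)
{
  /* Replicate the byte into all eight lanes, mirroring
     bstrins.d a1, a1, 15, 8 / 31, 16 / 63, 32.  */
  uint64_t v = (uint8_t) c;
  v |= v << 8;
  v |= v << 16;
  v |= v << 32;

  unsigned char *p = dst;
  for (; n >= 8; n -= 8, p += 8)
    memcpy (p, &v, 8);          /* corresponds to st.d */
  for (; n > 0; n--)
    *p++ = (unsigned char) c;   /* byte tail, cf. st.b */
  return dst;
}

The LSX/LASX variants apply the same idea with 16- and 32-byte vector stores (vreplgr2vr.b/xvreplgr2vr.b plus vst/xvst) instead of doublewords.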


@@ -0,0 +1,448 @@
From b412bcb2cf4914a664bcd24924d670a2e37394b3 Mon Sep 17 00:00:00 2001
From: dengjianbo <dengjianbo@loongson.cn>
Date: Mon, 28 Aug 2023 10:08:35 +0800
Subject: [PATCH 14/29] LoongArch: Add ifunc support for rawmemchr{aligned,
lsx, lasx}
According to the glibc rawmemchr microbenchmark, a few cases tested with
char '\0' show a performance degradation, because the LASX and LSX
versions do not handle '\0' separately. Overall, the rawmemchr-lasx
implementation reduces the runtime by about 40%-80%, rawmemchr-lsx
by about 40%-66%, and rawmemchr-aligned by about 20%-40%.
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
sysdeps/loongarch/lp64/multiarch/Makefile | 3 +
.../lp64/multiarch/ifunc-impl-list.c | 8 ++
.../lp64/multiarch/ifunc-rawmemchr.h | 40 ++++++
.../lp64/multiarch/rawmemchr-aligned.S | 124 ++++++++++++++++++
.../loongarch/lp64/multiarch/rawmemchr-lasx.S | 82 ++++++++++++
.../loongarch/lp64/multiarch/rawmemchr-lsx.S | 71 ++++++++++
sysdeps/loongarch/lp64/multiarch/rawmemchr.c | 37 ++++++
7 files changed, 365 insertions(+)
create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-rawmemchr.h
create mode 100644 sysdeps/loongarch/lp64/multiarch/rawmemchr-aligned.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/rawmemchr.c
diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile
index 5d7ae7ae..64416b02 100644
--- a/sysdeps/loongarch/lp64/multiarch/Makefile
+++ b/sysdeps/loongarch/lp64/multiarch/Makefile
@@ -21,5 +21,8 @@ sysdep_routines += \
memmove-unaligned \
memmove-lsx \
memmove-lasx \
+ rawmemchr-aligned \
+ rawmemchr-lsx \
+ rawmemchr-lasx \
# sysdep_routines
endif
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
index c8ba87bd..3db9af14 100644
--- a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
@@ -94,5 +94,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_aligned)
)
+ IFUNC_IMPL (i, name, rawmemchr,
+#if !defined __loongarch_soft_float
+ IFUNC_IMPL_ADD (array, i, rawmemchr, SUPPORT_LASX, __rawmemchr_lasx)
+ IFUNC_IMPL_ADD (array, i, rawmemchr, SUPPORT_LSX, __rawmemchr_lsx)
+#endif
+ IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_aligned)
+ )
+
return i;
}
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-rawmemchr.h b/sysdeps/loongarch/lp64/multiarch/ifunc-rawmemchr.h
new file mode 100644
index 00000000..a7bb4cf9
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-rawmemchr.h
@@ -0,0 +1,40 @@
+/* Common definition for rawmemchr ifunc selections.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <ldsodefs.h>
+#include <ifunc-init.h>
+
+#if !defined __loongarch_soft_float
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lasx) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden;
+#endif
+extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+#if !defined __loongarch_soft_float
+ if (SUPPORT_LASX)
+ return OPTIMIZE (lasx);
+ else if (SUPPORT_LSX)
+ return OPTIMIZE (lsx);
+ else
+#endif
+ return OPTIMIZE (aligned);
+}
diff --git a/sysdeps/loongarch/lp64/multiarch/rawmemchr-aligned.S b/sysdeps/loongarch/lp64/multiarch/rawmemchr-aligned.S
new file mode 100644
index 00000000..9c7155ae
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/rawmemchr-aligned.S
@@ -0,0 +1,124 @@
+/* Optimized rawmemchr implementation using basic LoongArch instructions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc)
+# define RAWMEMCHR_NAME __rawmemchr_aligned
+#else
+# define RAWMEMCHR_NAME __rawmemchr
+#endif
+
+LEAF(RAWMEMCHR_NAME, 6)
+ andi t1, a0, 0x7
+ bstrins.d a0, zero, 2, 0
+ lu12i.w a2, 0x01010
+ bstrins.d a1, a1, 15, 8
+
+ ld.d t0, a0, 0
+ slli.d t1, t1, 3
+ ori a2, a2, 0x101
+ bstrins.d a1, a1, 31, 16
+
+ li.w t8, -1
+ bstrins.d a1, a1, 63, 32
+ bstrins.d a2, a2, 63, 32
+ sll.d t2, t8, t1
+
+ sll.d t3, a1, t1
+ orn t0, t0, t2
+ slli.d a3, a2, 7
+ beqz a1, L(find_zero)
+
+ xor t0, t0, t3
+ sub.d t1, t0, a2
+ andn t2, a3, t0
+ and t3, t1, t2
+
+ bnez t3, L(count_pos)
+ addi.d a0, a0, 8
+
+L(loop):
+ ld.d t0, a0, 0
+ xor t0, t0, a1
+
+ sub.d t1, t0, a2
+ andn t2, a3, t0
+ and t3, t1, t2
+ bnez t3, L(count_pos)
+
+ ld.d t0, a0, 8
+ addi.d a0, a0, 16
+ xor t0, t0, a1
+ sub.d t1, t0, a2
+
+ andn t2, a3, t0
+ and t3, t1, t2
+ beqz t3, L(loop)
+ addi.d a0, a0, -8
+L(count_pos):
+ ctz.d t0, t3
+ srli.d t0, t0, 3
+ add.d a0, a0, t0
+ jr ra
+
+L(loop_7bit):
+ ld.d t0, a0, 0
+L(find_zero):
+ sub.d t1, t0, a2
+ and t2, t1, a3
+ bnez t2, L(more_check)
+
+ ld.d t0, a0, 8
+ addi.d a0, a0, 16
+ sub.d t1, t0, a2
+ and t2, t1, a3
+
+ beqz t2, L(loop_7bit)
+ addi.d a0, a0, -8
+
+L(more_check):
+ andn t2, a3, t0
+ and t3, t1, t2
+ bnez t3, L(count_pos)
+ addi.d a0, a0, 8
+
+L(loop_8bit):
+ ld.d t0, a0, 0
+
+ sub.d t1, t0, a2
+ andn t2, a3, t0
+ and t3, t1, t2
+ bnez t3, L(count_pos)
+
+ ld.d t0, a0, 8
+ addi.d a0, a0, 16
+ sub.d t1, t0, a2
+
+ andn t2, a3, t0
+ and t3, t1, t2
+ beqz t3, L(loop_8bit)
+
+ addi.d a0, a0, -8
+ b L(count_pos)
+
+END(RAWMEMCHR_NAME)
+
+libc_hidden_builtin_def (__rawmemchr)
diff --git a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S
new file mode 100644
index 00000000..be2eb59d
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lasx.S
@@ -0,0 +1,82 @@
+/* Optimized rawmemchr implementation using LoongArch LASX instructions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/asm.h>
+#include <sys/regdef.h>
+
+#if IS_IN (libc) && !defined __loongarch_soft_float
+
+# define RAWMEMCHR __rawmemchr_lasx
+
+LEAF(RAWMEMCHR, 6)
+ move a2, a0
+ bstrins.d a0, zero, 5, 0
+ xvld xr0, a0, 0
+ xvld xr1, a0, 32
+
+ xvreplgr2vr.b xr2, a1
+ xvseq.b xr0, xr0, xr2
+ xvseq.b xr1, xr1, xr2
+ xvmsknz.b xr0, xr0
+
+ xvmsknz.b xr1, xr1
+ xvpickve.w xr3, xr0, 4
+ xvpickve.w xr4, xr1, 4
+ vilvl.h vr0, vr3, vr0
+
+ vilvl.h vr1, vr4, vr1
+ vilvl.w vr0, vr1, vr0
+ movfr2gr.d t0, fa0
+ sra.d t0, t0, a2
+
+
+ beqz t0, L(loop)
+ ctz.d t0, t0
+ add.d a0, a2, t0
+ jr ra
+
+L(loop):
+ xvld xr0, a0, 64
+ xvld xr1, a0, 96
+ addi.d a0, a0, 64
+ xvseq.b xr0, xr0, xr2
+
+ xvseq.b xr1, xr1, xr2
+ xvmax.bu xr3, xr0, xr1
+ xvseteqz.v fcc0, xr3
+ bcnez fcc0, L(loop)
+
+ xvmsknz.b xr0, xr0
+ xvmsknz.b xr1, xr1
+ xvpickve.w xr3, xr0, 4
+ xvpickve.w xr4, xr1, 4
+
+
+ vilvl.h vr0, vr3, vr0
+ vilvl.h vr1, vr4, vr1
+ vilvl.w vr0, vr1, vr0
+ movfr2gr.d t0, fa0
+
+ ctz.d t0, t0
+ add.d a0, a0, t0
+ jr ra
+END(RAWMEMCHR)
+
+libc_hidden_builtin_def (RAWMEMCHR)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S
new file mode 100644
index 00000000..2f6fe024
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/rawmemchr-lsx.S
@@ -0,0 +1,71 @@
+/* Optimized rawmemchr implementation using LoongArch LSX instructions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) && !defined __loongarch_soft_float
+
+# define RAWMEMCHR __rawmemchr_lsx
+
+LEAF(RAWMEMCHR, 6)
+ move a2, a0
+ bstrins.d a0, zero, 4, 0
+ vld vr0, a0, 0
+ vld vr1, a0, 16
+
+ vreplgr2vr.b vr2, a1
+ vseq.b vr0, vr0, vr2
+ vseq.b vr1, vr1, vr2
+ vmsknz.b vr0, vr0
+
+ vmsknz.b vr1, vr1
+ vilvl.h vr0, vr1, vr0
+ movfr2gr.s t0, fa0
+ sra.w t0, t0, a2
+
+ beqz t0, L(loop)
+ ctz.w t0, t0
+ add.d a0, a2, t0
+ jr ra
+
+
+L(loop):
+ vld vr0, a0, 32
+ vld vr1, a0, 48
+ addi.d a0, a0, 32
+ vseq.b vr0, vr0, vr2
+
+ vseq.b vr1, vr1, vr2
+ vmax.bu vr3, vr0, vr1
+ vseteqz.v fcc0, vr3
+ bcnez fcc0, L(loop)
+
+ vmsknz.b vr0, vr0
+ vmsknz.b vr1, vr1
+ vilvl.h vr0, vr1, vr0
+ movfr2gr.s t0, fa0
+
+ ctz.w t0, t0
+ add.d a0, a0, t0
+ jr ra
+END(RAWMEMCHR)
+
+libc_hidden_builtin_def (RAWMEMCHR)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/rawmemchr.c b/sysdeps/loongarch/lp64/multiarch/rawmemchr.c
new file mode 100644
index 00000000..89c7ffff
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/rawmemchr.c
@@ -0,0 +1,37 @@
+/* Multiple versions of rawmemchr.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+# define rawmemchr __redirect_rawmemchr
+# define __rawmemchr __redirect___rawmemchr
+# include <string.h>
+# undef rawmemchr
+# undef __rawmemchr
+
+# define SYMBOL_NAME rawmemchr
+# include "ifunc-rawmemchr.h"
+
+libc_ifunc_redirected (__redirect_rawmemchr, __rawmemchr,
+ IFUNC_SELECTOR ());
+weak_alias (__rawmemchr, rawmemchr)
+# ifdef SHARED
+__hidden_ver1 (__rawmemchr, __GI___rawmemchr, __redirect___rawmemchr)
+ __attribute__((visibility ("hidden")));
+# endif
+#endif
--
2.33.0
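
The rawmemchr-aligned loop above is the classic word-at-a-time byte scan: the sub.d/andn/and triple against the 0x0101...01 and 0x8080...80 constants flags any zero byte in a doubleword, and an initial xor against the splatted search byte reduces the general case to zero-byte detection. A hedged C sketch of just that detection step:

#include <stdint.h>

#define ONES  0x0101010101010101ULL
#define HIGHS 0x8080808080808080ULL

/* Nonzero iff some byte of w is 0x00: a byte can borrow in
   (w - ONES) while having its high bit clear in w only if it
   was zero.  This is the sub.d/andn/and sequence in the loop.  */
static inline uint64_t
has_zero_byte (uint64_t w)
{
  return (w - ONES) & ~w & HIGHS;
}

/* Searching for byte c reduces to the zero case via XOR, as the
   assembly does with "xor t0, t0, a1" on the splatted byte.  */
static inline uint64_t
has_byte (uint64_t w, uint8_t c)
{
  return has_zero_byte (w ^ (ONES * c));
}

A ctz on the resulting mask, shifted right by 3, then yields the byte index of the first match, which is what the L(count_pos) block computes.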


@@ -0,0 +1,499 @@
From e258cfcf92f5e31e902fa045b41652f00fcf2521 Mon Sep 17 00:00:00 2001
From: dengjianbo <dengjianbo@loongson.cn>
Date: Thu, 24 Aug 2023 16:50:18 +0800
Subject: [PATCH 09/29] LoongArch: Add ifunc support for strcmp{aligned, lsx}
Based on the glibc microbenchmark, the strcmp-aligned implementation
reduces the runtime by 0%-10% for aligned comparisons and by 10%-20%
for unaligned comparisons; the strcmp-lsx implementation reduces the
runtime by 0%-50%.
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
sysdeps/loongarch/lp64/multiarch/Makefile | 2 +
.../lp64/multiarch/ifunc-impl-list.c | 7 +
.../loongarch/lp64/multiarch/ifunc-strcmp.h | 38 ++++
.../loongarch/lp64/multiarch/strcmp-aligned.S | 179 ++++++++++++++++++
sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S | 165 ++++++++++++++++
sysdeps/loongarch/lp64/multiarch/strcmp.c | 35 ++++
6 files changed, 426 insertions(+)
create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-strcmp.h
create mode 100644 sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/strcmp.c
diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile
index c4dd3143..d5a500de 100644
--- a/sysdeps/loongarch/lp64/multiarch/Makefile
+++ b/sysdeps/loongarch/lp64/multiarch/Makefile
@@ -12,6 +12,8 @@ sysdep_routines += \
strchrnul-aligned \
strchrnul-lsx \
strchrnul-lasx \
+ strcmp-aligned \
+ strcmp-lsx \
memcpy-aligned \
memcpy-unaligned \
memmove-unaligned \
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
index 7cec0b77..9183b7da 100644
--- a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
@@ -62,6 +62,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_aligned)
)
+ IFUNC_IMPL (i, name, strcmp,
+#if !defined __loongarch_soft_float
+ IFUNC_IMPL_ADD (array, i, strcmp, SUPPORT_LSX, __strcmp_lsx)
+#endif
+ IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_aligned)
+ )
+
IFUNC_IMPL (i, name, memcpy,
#if !defined __loongarch_soft_float
IFUNC_IMPL_ADD (array, i, memcpy, SUPPORT_LASX, __memcpy_lasx)
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-strcmp.h b/sysdeps/loongarch/lp64/multiarch/ifunc-strcmp.h
new file mode 100644
index 00000000..ca26352b
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-strcmp.h
@@ -0,0 +1,38 @@
+/* Common definition for strcmp ifunc selection.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <ldsodefs.h>
+#include <ifunc-init.h>
+
+#if !defined __loongarch_soft_float
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden;
+#endif
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+#if !defined __loongarch_soft_float
+ if (SUPPORT_LSX)
+ return OPTIMIZE (lsx);
+ else
+#endif
+ return OPTIMIZE (aligned);
+}
diff --git a/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S b/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S
new file mode 100644
index 00000000..f5f4f336
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S
@@ -0,0 +1,179 @@
+/* Optimized strcmp implementation using basic Loongarch instructions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc)
+# define STRCMP_NAME __strcmp_aligned
+#else
+# define STRCMP_NAME strcmp
+#endif
+
+LEAF(STRCMP_NAME, 6)
+ lu12i.w a4, 0x01010
+ andi a2, a0, 0x7
+ ori a4, a4, 0x101
+ andi a3, a1, 0x7
+
+ bstrins.d a4, a4, 63, 32
+ li.d t7, -1
+ li.d t8, 8
+ slli.d a5, a4, 7
+
+ bne a2, a3, L(unaligned)
+ bstrins.d a0, zero, 2, 0
+ bstrins.d a1, zero, 2, 0
+ ld.d t0, a0, 0
+
+ ld.d t1, a1, 0
+ slli.d t3, a2, 3
+ sll.d t2, t7, t3
+ orn t0, t0, t2
+
+
+ orn t1, t1, t2
+ sub.d t2, t0, a4
+ andn t3, a5, t0
+ and t2, t2, t3
+
+ bne t0, t1, L(al_end)
+L(al_loop):
+ bnez t2, L(ret0)
+ ldx.d t0, a0, t8
+ ldx.d t1, a1, t8
+
+ addi.d t8, t8, 8
+ sub.d t2, t0, a4
+ andn t3, a5, t0
+ and t2, t2, t3
+
+ beq t0, t1, L(al_loop)
+L(al_end):
+ xor t3, t0, t1
+ or t2, t2, t3
+ ctz.d t3, t2
+
+
+ bstrins.d t3, zero, 2, 0
+ srl.d t0, t0, t3
+ srl.d t1, t1, t3
+ andi t0, t0, 0xff
+
+ andi t1, t1, 0xff
+ sub.d a0, t0, t1
+ jr ra
+ nop
+
+L(ret0):
+ move a0, zero
+ jr ra
+ nop
+ nop
+
+L(unaligned):
+ slt a6, a3, a2
+ xor t0, a0, a1
+ maskeqz t0, t0, a6
+ xor a0, a0, t0
+
+
+ xor a1, a1, t0
+ andi a2, a0, 0x7
+ andi a3, a1, 0x7
+ bstrins.d a0, zero, 2, 0
+
+ bstrins.d a1, zero, 2, 0
+ ld.d t4, a0, 0
+ ld.d t1, a1, 0
+ slli.d a2, a2, 3
+
+ slli.d a3, a3, 3
+ srl.d t0, t4, a2
+ srl.d t1, t1, a3
+ srl.d t5, t7, a3
+
+ orn t0, t0, t5
+ orn t1, t1, t5
+ bne t0, t1, L(not_equal)
+ sll.d t5, t7, a2
+
+
+ sub.d a3, a2, a3
+ orn t4, t4, t5
+ sub.d a2, zero, a3
+ sub.d t2, t4, a4
+
+ andn t3, a5, t4
+ and t2, t2, t3
+ bnez t2, L(find_zero)
+L(un_loop):
+ srl.d t5, t4, a3
+
+ ldx.d t4, a0, t8
+ ldx.d t1, a1, t8
+ addi.d t8, t8, 8
+ sll.d t0, t4, a2
+
+ or t0, t0, t5
+ bne t0, t1, L(not_equal)
+ sub.d t2, t4, a4
+ andn t3, a5, t4
+
+
+ and t2, t2, t3
+ beqz t2, L(un_loop)
+L(find_zero):
+ sub.d t2, t0, a4
+ andn t3, a5, t0
+
+ and t2, t2, t3
+ bnez t2, L(ret0)
+ ldx.d t1, a1, t8
+ srl.d t0, t4, a3
+
+L(not_equal):
+ sub.d t2, t0, a4
+ andn t3, a5, t0
+ and t2, t2, t3
+ xor t3, t0, t1
+
+ or t2, t2, t3
+L(un_end):
+ ctz.d t3, t2
+ bstrins.d t3, zero, 2, 0
+ srl.d t0, t0, t3
+
+
+ srl.d t1, t1, t3
+ andi t0, t0, 0xff
+ andi t1, t1, 0xff
+ sub.d t2, t0, t1
+
+
+ sub.d t3, t1, t0
+ masknez t0, t2, a6
+ maskeqz t1, t3, a6
+ or a0, t0, t1
+
+ jr ra
+END(STRCMP_NAME)
+
+libc_hidden_builtin_def (STRCMP_NAME)
diff --git a/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S
new file mode 100644
index 00000000..2e177a38
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S
@@ -0,0 +1,165 @@
+/* Optimized strcmp implementation using Loongarch LSX instructions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) && !defined __loongarch_soft_float
+
+# define STRCMP __strcmp_lsx
+
+LEAF(STRCMP, 6)
+ pcalau12i t0, %pc_hi20(L(INDEX))
+ andi a2, a0, 0xf
+ vld vr2, t0, %pc_lo12(L(INDEX))
+ andi a3, a1, 0xf
+
+ bne a2, a3, L(unaligned)
+ bstrins.d a0, zero, 3, 0
+ bstrins.d a1, zero, 3, 0
+ vld vr0, a0, 0
+
+ vld vr1, a1, 0
+ vreplgr2vr.b vr3, a2
+ vslt.b vr2, vr2, vr3
+ vseq.b vr3, vr0, vr1
+
+ vmin.bu vr3, vr0, vr3
+ vor.v vr3, vr3, vr2
+ vsetanyeqz.b fcc0, vr3
+ bcnez fcc0, L(al_out)
+
+
+L(al_loop):
+ vld vr0, a0, 16
+ vld vr1, a1, 16
+ addi.d a0, a0, 16
+ addi.d a1, a1, 16
+
+ vseq.b vr3, vr0, vr1
+ vmin.bu vr3, vr0, vr3
+ vsetanyeqz.b fcc0, vr3
+ bceqz fcc0, L(al_loop)
+
+L(al_out):
+ vseqi.b vr3, vr3, 0
+ vfrstpi.b vr3, vr3, 0
+ vshuf.b vr0, vr0, vr0, vr3
+ vshuf.b vr1, vr1, vr1, vr3
+
+ vpickve2gr.bu t0, vr0, 0
+ vpickve2gr.bu t1, vr1, 0
+ sub.d a0, t0, t1
+ jr ra
+
+
+L(unaligned):
+ slt a4, a3, a2
+ xor t0, a0, a1
+ maskeqz t0, t0, a4
+ xor a0, a0, t0
+
+ xor a1, a1, t0
+ andi a2, a0, 0xf
+ andi a3, a1, 0xf
+ bstrins.d a0, zero, 3, 0
+
+ bstrins.d a1, zero, 3, 0
+ vld vr3, a0, 0
+ vld vr1, a1, 0
+ vreplgr2vr.b vr4, a2
+
+ vreplgr2vr.b vr5, a3
+ vslt.b vr7, vr2, vr5
+ vsub.b vr5, vr5, vr4
+ vaddi.bu vr6, vr2, 16
+
+
+ vsub.b vr6, vr6, vr5
+ vshuf.b vr0, vr3, vr3, vr6
+ vor.v vr0, vr0, vr7
+ vor.v vr1, vr1, vr7
+
+ vseq.b vr5, vr0, vr1
+ vsetanyeqz.b fcc0, vr5
+ bcnez fcc0, L(not_equal)
+ vslt.b vr4, vr2, vr4
+
+ vor.v vr0, vr3, vr4
+ vsetanyeqz.b fcc0, vr0
+ bcnez fcc0, L(find_zero)
+ nop
+
+L(un_loop):
+ vld vr3, a0, 16
+ vld vr1, a1, 16
+ addi.d a0, a0, 16
+ addi.d a1, a1, 16
+
+
+ vshuf.b vr0, vr3, vr0, vr6
+ vseq.b vr5, vr0, vr1
+ vsetanyeqz.b fcc0, vr5
+ bcnez fcc0, L(not_equal)
+
+ vsetanyeqz.b fcc0, vr3
+ vor.v vr0, vr3, vr3
+ bceqz fcc0, L(un_loop)
+L(find_zero):
+ vmin.bu vr5, vr1, vr5
+
+ vsetanyeqz.b fcc0, vr5
+ bcnez fcc0, L(ret0)
+ vld vr1, a1, 16
+ vshuf.b vr0, vr3, vr3, vr6
+
+ vseq.b vr5, vr0, vr1
+L(not_equal):
+ vmin.bu vr5, vr0, vr5
+L(un_end):
+ vseqi.b vr5, vr5, 0
+ vfrstpi.b vr5, vr5, 0
+
+
+ vshuf.b vr0, vr0, vr0, vr5
+ vshuf.b vr1, vr1, vr1, vr5
+ vpickve2gr.bu t0, vr0, 0
+ vpickve2gr.bu t1, vr1, 0
+
+ sub.d t3, t0, t1
+ sub.d t4, t1, t0
+ masknez t0, t3, a4
+ maskeqz t1, t4, a4
+
+ or a0, t0, t1
+ jr ra
+L(ret0):
+ move a0, zero
+ jr ra
+END(STRCMP)
+
+ .section .rodata.cst16,"M",@progbits,16
+ .align 4
+L(INDEX):
+ .dword 0x0706050403020100
+ .dword 0x0f0e0d0c0b0a0908
+
+libc_hidden_builtin_def (STRCMP)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/strcmp.c b/sysdeps/loongarch/lp64/multiarch/strcmp.c
new file mode 100644
index 00000000..6f249c0b
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strcmp.c
@@ -0,0 +1,35 @@
+/* Multiple versions of strcmp.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# define strcmp __redirect_strcmp
+# include <string.h>
+# undef strcmp
+
+# define SYMBOL_NAME strcmp
+# include "ifunc-strcmp.h"
+
+libc_ifunc_redirected (__redirect_strcmp, strcmp, IFUNC_SELECTOR ());
+
+# ifdef SHARED
+__hidden_ver1 (strcmp, __GI_strcmp, __redirect_strcmp)
+ __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strcmp);
+# endif
+#endif
--
2.33.0
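
The strcmp.c wrapper above uses glibc's internal libc_ifunc_redirected plumbing; outside glibc, the same load-time dispatch can be written with the compiler's ifunc attribute. A rough, self-contained sketch (the stand-in implementations and the supports_lsx probe are hypothetical; glibc's real selector tests SUPPORT_LSX derived from hwcap):

#include <string.h>

/* Stand-ins for the two entry points added by this patch.  */
static int
strcmp_aligned_impl (const char *a, const char *b)
{
  return strcmp (a, b);
}

static int
strcmp_lsx_impl (const char *a, const char *b)
{
  return strcmp (a, b);
}

/* Hypothetical runtime probe for the LSX unit.  */
static int
supports_lsx (void)
{
  return 0;
}

/* Resolver: runs once at relocation time and picks the variant.  */
static int (*resolve_strcmp (void)) (const char *, const char *)
{
  return supports_lsx () ? strcmp_lsx_impl : strcmp_aligned_impl;
}

int my_strcmp (const char *, const char *)
  __attribute__ ((ifunc ("resolve_strcmp")));

After resolution, every call to my_strcmp goes straight to the selected implementation with no per-call branch, which is the point of the IFUNC_SELECTOR machinery in ifunc-strcmp.h.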

File diff suppressed because it is too large.


@@ -0,0 +1,583 @@
From 6f03da2d7ef218c0f78375cf706dada59c3fee63 Mon Sep 17 00:00:00 2001
From: dengjianbo <dengjianbo@loongson.cn>
Date: Thu, 24 Aug 2023 16:50:19 +0800
Subject: [PATCH 10/29] LoongArch: Add ifunc support for strncmp{aligned, lsx}
Based on the glibc microbenchmark, only a few short inputs regress with
the strncmp-aligned and strncmp-lsx implementations. Overall,
strncmp-aligned reduces the runtime by 0%-10% for aligned comparisons
and by 10%-25% for unaligned comparisons, and strncmp-lsx reduces the
runtime by about 0%-60%.
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
sysdeps/loongarch/lp64/multiarch/Makefile | 2 +
.../lp64/multiarch/ifunc-impl-list.c | 7 +
.../loongarch/lp64/multiarch/ifunc-strncmp.h | 38 +++
.../lp64/multiarch/strncmp-aligned.S | 218 ++++++++++++++++++
.../loongarch/lp64/multiarch/strncmp-lsx.S | 208 +++++++++++++++++
sysdeps/loongarch/lp64/multiarch/strncmp.c | 35 +++
6 files changed, 508 insertions(+)
create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-strncmp.h
create mode 100644 sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/strncmp.c
diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile
index d5a500de..5d7ae7ae 100644
--- a/sysdeps/loongarch/lp64/multiarch/Makefile
+++ b/sysdeps/loongarch/lp64/multiarch/Makefile
@@ -14,6 +14,8 @@ sysdep_routines += \
strchrnul-lasx \
strcmp-aligned \
strcmp-lsx \
+ strncmp-aligned \
+ strncmp-lsx \
memcpy-aligned \
memcpy-unaligned \
memmove-unaligned \
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
index 9183b7da..c8ba87bd 100644
--- a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
@@ -69,6 +69,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_aligned)
)
+ IFUNC_IMPL (i, name, strncmp,
+#if !defined __loongarch_soft_float
+ IFUNC_IMPL_ADD (array, i, strncmp, SUPPORT_LSX, __strncmp_lsx)
+#endif
+ IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_aligned)
+ )
+
IFUNC_IMPL (i, name, memcpy,
#if !defined __loongarch_soft_float
IFUNC_IMPL_ADD (array, i, memcpy, SUPPORT_LASX, __memcpy_lasx)
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-strncmp.h b/sysdeps/loongarch/lp64/multiarch/ifunc-strncmp.h
new file mode 100644
index 00000000..1a7dc36b
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-strncmp.h
@@ -0,0 +1,38 @@
+/* Common definition for strncmp ifunc selection.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <ldsodefs.h>
+#include <ifunc-init.h>
+
+#if !defined __loongarch_soft_float
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden;
+#endif
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+#if !defined __loongarch_soft_float
+ if (SUPPORT_LSX)
+ return OPTIMIZE (lsx);
+ else
+#endif
+ return OPTIMIZE (aligned);
+}
diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S b/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S
new file mode 100644
index 00000000..e2687fa7
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S
@@ -0,0 +1,218 @@
+/* Optimized strncmp implementation using basic Loongarch instructions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc)
+# define STRNCMP __strncmp_aligned
+#else
+# define STRNCMP strncmp
+#endif
+
+LEAF(STRNCMP, 6)
+ beqz a2, L(ret0)
+ lu12i.w a5, 0x01010
+ andi a3, a0, 0x7
+ ori a5, a5, 0x101
+
+ andi a4, a1, 0x7
+ bstrins.d a5, a5, 63, 32
+ li.d t7, -1
+ li.d t8, 8
+
+ addi.d a2, a2, -1
+ slli.d a6, a5, 7
+ bne a3, a4, L(unaligned)
+ bstrins.d a0, zero, 2, 0
+
+ bstrins.d a1, zero, 2, 0
+ ld.d t0, a0, 0
+ ld.d t1, a1, 0
+ slli.d t2, a3, 3
+
+
+ sub.d t5, t8, a3
+ srl.d t3, t7, t2
+ srl.d t0, t0, t2
+ srl.d t1, t1, t2
+
+ orn t0, t0, t3
+ orn t1, t1, t3
+ sub.d t2, t0, a5
+ andn t3, a6, t0
+
+ and t2, t2, t3
+ bne t0, t1, L(al_end)
+ sltu t4, a2, t5
+ sub.d a2, a2, t5
+
+L(al_loop):
+ or t4, t2, t4
+ bnez t4, L(ret0)
+ ldx.d t0, a0, t8
+ ldx.d t1, a1, t8
+
+
+ addi.d t8, t8, 8
+ sltui t4, a2, 8
+ addi.d a2, a2, -8
+ sub.d t2, t0, a5
+
+ andn t3, a6, t0
+ and t2, t2, t3
+ beq t0, t1, L(al_loop)
+ addi.d a2, a2, 8
+
+L(al_end):
+ xor t3, t0, t1
+ or t2, t2, t3
+ ctz.d t2, t2
+ srli.d t4, t2, 3
+
+ bstrins.d t2, zero, 2, 0
+ srl.d t0, t0, t2
+ srl.d t1, t1, t2
+ andi t0, t0, 0xff
+
+
+ andi t1, t1, 0xff
+ sltu t2, a2, t4
+ sub.d a0, t0, t1
+ masknez a0, a0, t2
+
+ jr ra
+L(ret0):
+ move a0, zero
+ jr ra
+ nop
+
+L(unaligned):
+ slt a7, a4, a3
+ xor t0, a0, a1
+ maskeqz t0, t0, a7
+ xor a0, a0, t0
+
+ xor a1, a1, t0
+ andi a3, a0, 0x7
+ andi a4, a1, 0x7
+ bstrins.d a0, zero, 2, 0
+
+
+ bstrins.d a1, zero, 2, 0
+ ld.d t4, a0, 0
+ ld.d t1, a1, 0
+ slli.d t2, a3, 3
+
+ slli.d t3, a4, 3
+ srl.d t5, t7, t3
+ srl.d t0, t4, t2
+ srl.d t1, t1, t3
+
+ orn t0, t0, t5
+ orn t1, t1, t5
+ bne t0, t1, L(not_equal)
+ sub.d t6, t8, a4
+
+ sub.d a4, t2, t3
+ sll.d t2, t7, t2
+ sub.d t5, t8, a3
+ orn t4, t4, t2
+
+
+ sub.d t2, t4, a5
+ andn t3, a6, t4
+ sltu t7, a2, t5
+ and t2, t2, t3
+
+ sub.d a3, zero, a4
+ or t2, t2, t7
+ bnez t2, L(un_end)
+ sub.d t7, t5, t6
+
+ sub.d a2, a2, t5
+ sub.d t6, t8, t7
+L(un_loop):
+ srl.d t5, t4, a4
+ ldx.d t4, a0, t8
+
+ ldx.d t1, a1, t8
+ addi.d t8, t8, 8
+ sll.d t0, t4, a3
+ or t0, t0, t5
+
+
+ bne t0, t1, L(loop_not_equal)
+ sub.d t2, t4, a5
+ andn t3, a6, t4
+ sltui t5, a2, 8
+
+ and t2, t2, t3
+ addi.d a2, a2, -8
+ or t3, t2, t5
+ beqz t3, L(un_loop)
+
+ addi.d a2, a2, 8
+L(un_end):
+ sub.d t2, t0, a5
+ andn t3, a6, t0
+ sltu t5, a2, t6
+
+ and t2, t2, t3
+ or t2, t2, t5
+ bnez t2, L(ret0)
+ ldx.d t1, a1, t8
+
+
+ srl.d t0, t4, a4
+ sub.d a2, a2, t6
+L(not_equal):
+ sub.d t2, t0, a5
+ andn t3, a6, t0
+
+ xor t4, t0, t1
+ and t2, t2, t3
+ or t2, t2, t4
+ ctz.d t2, t2
+
+ bstrins.d t2, zero, 2, 0
+ srli.d t4, t2, 3
+ srl.d t0, t0, t2
+ srl.d t1, t1, t2
+
+ andi t0, t0, 0xff
+ andi t1, t1, 0xff
+ sub.d t2, t0, t1
+ sub.d t3, t1, t0
+
+
+ masknez t0, t2, a7
+ maskeqz t1, t3, a7
+ sltu t2, a2, t4
+ or a0, t0, t1
+
+ masknez a0, a0, t2
+ jr ra
+L(loop_not_equal):
+ add.d a2, a2, t7
+ b L(not_equal)
+END(STRNCMP)
+
+libc_hidden_builtin_def (STRNCMP)
diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S
new file mode 100644
index 00000000..0b4eee2a
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S
@@ -0,0 +1,208 @@
+/* Optimized strncmp implementation using Loongarch LSX instructions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) && !defined __loongarch_soft_float
+
+# define STRNCMP __strncmp_lsx
+
+LEAF(STRNCMP, 6)
+ beqz a2, L(ret0)
+ pcalau12i t0, %pc_hi20(L(INDEX))
+ andi a3, a0, 0xf
+ vld vr2, t0, %pc_lo12(L(INDEX))
+
+ andi a4, a1, 0xf
+ li.d t2, 16
+ bne a3, a4, L(unaligned)
+ xor t0, a0, a3
+
+ xor t1, a1, a4
+ vld vr0, t0, 0
+ vld vr1, t1, 0
+ vreplgr2vr.b vr3, a3
+
+
+ sub.d t2, t2, a3
+ vadd.b vr3, vr3, vr2
+ vshuf.b vr0, vr3, vr0, vr3
+ vshuf.b vr1, vr3, vr1, vr3
+
+ vseq.b vr3, vr0, vr1
+ vmin.bu vr3, vr0, vr3
+ bgeu t2, a2, L(al_early_end)
+ vsetanyeqz.b fcc0, vr3
+
+ bcnez fcc0, L(al_end)
+ add.d a3, a0, a2
+ addi.d a4, a3, -1
+ bstrins.d a4, zero, 3, 0
+
+ sub.d a2, a3, a4
+L(al_loop):
+ vld vr0, t0, 16
+ vld vr1, t1, 16
+ addi.d t0, t0, 16
+
+
+ addi.d t1, t1, 16
+ vseq.b vr3, vr0, vr1
+ vmin.bu vr3, vr0, vr3
+ beq t0, a4, L(al_early_end)
+
+ vsetanyeqz.b fcc0, vr3
+ bceqz fcc0, L(al_loop)
+L(al_end):
+ vseqi.b vr3, vr3, 0
+ vfrstpi.b vr3, vr3, 0
+
+ vshuf.b vr0, vr0, vr0, vr3
+ vshuf.b vr1, vr1, vr1, vr3
+ vpickve2gr.bu t0, vr0, 0
+ vpickve2gr.bu t1, vr1, 0
+
+ sub.d a0, t0, t1
+ jr ra
+L(al_early_end):
+ vreplgr2vr.b vr4, a2
+ vslt.b vr4, vr2, vr4
+
+
+ vorn.v vr3, vr3, vr4
+ b L(al_end)
+L(unaligned):
+ slt a5, a3, a4
+ xor t0, a0, a1
+
+ maskeqz t0, t0, a5
+ xor a0, a0, t0
+ xor a1, a1, t0
+ andi a3, a0, 0xf
+
+ andi a4, a1, 0xf
+ xor t0, a0, a3
+ xor t1, a1, a4
+ vld vr0, t0, 0
+
+ vld vr3, t1, 0
+ sub.d t2, t2, a3
+ vreplgr2vr.b vr4, a3
+ vreplgr2vr.b vr5, a4
+
+
+ vaddi.bu vr6, vr2, 16
+ vsub.b vr7, vr4, vr5
+ vsub.b vr6, vr6, vr7
+ vadd.b vr4, vr2, vr4
+
+ vshuf.b vr1, vr3, vr3, vr6
+ vshuf.b vr0, vr7, vr0, vr4
+ vshuf.b vr1, vr7, vr1, vr4
+ vseq.b vr4, vr0, vr1
+
+ vmin.bu vr4, vr0, vr4
+ bgeu t2, a2, L(un_early_end)
+ vsetanyeqz.b fcc0, vr4
+ bcnez fcc0, L(un_end)
+
+ add.d a6, a0, a2
+ vslt.b vr5, vr2, vr5
+ addi.d a7, a6, -1
+ vor.v vr3, vr3, vr5
+
+
+ bstrins.d a7, zero, 3, 0
+ sub.d a2, a6, a7
+L(un_loop):
+ vld vr0, t0, 16
+ addi.d t0, t0, 16
+
+ vsetanyeqz.b fcc0, vr3
+ bcnez fcc0, L(has_zero)
+ beq t0, a7, L(end_with_len)
+ vor.v vr1, vr3, vr3
+
+ vld vr3, t1, 16
+ addi.d t1, t1, 16
+ vshuf.b vr1, vr3, vr1, vr6
+ vseq.b vr4, vr0, vr1
+
+ vmin.bu vr4, vr0, vr4
+ vsetanyeqz.b fcc0, vr4
+ bceqz fcc0, L(un_loop)
+L(un_end):
+ vseqi.b vr4, vr4, 0
+
+
+ vfrstpi.b vr4, vr4, 0
+ vshuf.b vr0, vr0, vr0, vr4
+ vshuf.b vr1, vr1, vr1, vr4
+ vpickve2gr.bu t0, vr0, 0
+
+ vpickve2gr.bu t1, vr1, 0
+ sub.d t2, t0, t1
+ sub.d t3, t1, t0
+ masknez t0, t2, a5
+
+ maskeqz t1, t3, a5
+ or a0, t0, t1
+ jr ra
+L(has_zero):
+ vshuf.b vr1, vr3, vr3, vr6
+
+ vseq.b vr4, vr0, vr1
+ vmin.bu vr4, vr0, vr4
+ bne t0, a7, L(un_end)
+L(un_early_end):
+ vreplgr2vr.b vr5, a2
+
+ vslt.b vr5, vr2, vr5
+ vorn.v vr4, vr4, vr5
+ b L(un_end)
+L(end_with_len):
+ sub.d a6, a3, a4
+
+ bgeu a6, a2, 1f
+ vld vr4, t1, 16
+1:
+ vshuf.b vr1, vr4, vr3, vr6
+ vseq.b vr4, vr0, vr1
+
+ vmin.bu vr4, vr0, vr4
+ vreplgr2vr.b vr5, a2
+ vslt.b vr5, vr2, vr5
+ vorn.v vr4, vr4, vr5
+
+ b L(un_end)
+L(ret0):
+ move a0, zero
+ jr ra
+END(STRNCMP)
+
+ .section .rodata.cst16,"M",@progbits,16
+ .align 4
+L(INDEX):
+ .dword 0x0706050403020100
+ .dword 0x0f0e0d0c0b0a0908
+
+libc_hidden_builtin_def (STRNCMP)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp.c b/sysdeps/loongarch/lp64/multiarch/strncmp.c
new file mode 100644
index 00000000..af6d0bc4
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strncmp.c
@@ -0,0 +1,35 @@
+/* Multiple versions of strncmp.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# define strncmp __redirect_strncmp
+# include <string.h>
+# undef strncmp
+
+# define SYMBOL_NAME strncmp
+# include "ifunc-strncmp.h"
+
+libc_ifunc_redirected (__redirect_strncmp, strncmp, IFUNC_SELECTOR ());
+
+# ifdef SHARED
+__hidden_ver1 (strncmp, __GI_strncmp, __redirect_strncmp)
+ __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strncmp);
+# endif
+#endif
--
2.33.0
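
Since only a handful of short inputs regress, the key correctness constraint on the vector code above is that it match scalar strncmp semantics exactly: compare at most n bytes, stopping at the first differing byte or at a '\0' reached in both strings. A minimal reference one might diff the optimized routines against (an assumed test harness, not part of the patch):

#include <stddef.h>

static int
strncmp_ref (const char *a, const char *b, size_t n)
{
  for (; n > 0; n--, a++, b++)
    {
      unsigned char ca = (unsigned char) *a;
      unsigned char cb = (unsigned char) *b;
      if (ca != cb)
        return ca - cb;          /* first differing byte decides */
      if (ca == '\0')
        return 0;                /* common terminator: equal */
    }
  return 0;                      /* n bytes compared equal */
}

The L(al_early_end)/L(un_early_end) paths in strncmp-lsx.S handle exactly this n-byte cutoff, masking out vector lanes that lie beyond the limit so the contract above holds.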


@@ -0,0 +1,465 @@
From e494d32d3b76eee0d59cfab37789a356459b517a Mon Sep 17 00:00:00 2001
From: dengjianbo <dengjianbo@loongson.cn>
Date: Thu, 24 Aug 2023 16:50:17 +0800
Subject: [PATCH 08/29] LoongArch: Add ifunc support for strnlen{aligned, lsx,
lasx}
Based on the glibc microbenchmark, the strnlen-aligned implementation
reduces the runtime by more than 10%, strnlen-lsx by about 50%-78%,
and strnlen-lasx by about 50%-88%.
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
sysdeps/loongarch/lp64/multiarch/Makefile | 3 +
.../lp64/multiarch/ifunc-impl-list.c | 8 ++
.../loongarch/lp64/multiarch/ifunc-strnlen.h | 41 +++++++
.../lp64/multiarch/strnlen-aligned.S | 102 ++++++++++++++++++
.../loongarch/lp64/multiarch/strnlen-lasx.S | 100 +++++++++++++++++
.../loongarch/lp64/multiarch/strnlen-lsx.S | 89 +++++++++++++++
sysdeps/loongarch/lp64/multiarch/strnlen.c | 39 +++++++
7 files changed, 382 insertions(+)
create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-strnlen.h
create mode 100644 sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/strnlen.c
diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile
index afa51041..c4dd3143 100644
--- a/sysdeps/loongarch/lp64/multiarch/Makefile
+++ b/sysdeps/loongarch/lp64/multiarch/Makefile
@@ -3,6 +3,9 @@ sysdep_routines += \
strlen-aligned \
strlen-lsx \
strlen-lasx \
+ strnlen-aligned \
+ strnlen-lsx \
+ strnlen-lasx \
strchr-aligned \
strchr-lsx \
strchr-lasx \
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
index 25eb96b0..7cec0b77 100644
--- a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
@@ -38,6 +38,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_aligned)
)
+ IFUNC_IMPL (i, name, strnlen,
+#if !defined __loongarch_soft_float
+ IFUNC_IMPL_ADD (array, i, strnlen, SUPPORT_LASX, __strnlen_lasx)
+ IFUNC_IMPL_ADD (array, i, strnlen, SUPPORT_LSX, __strnlen_lsx)
+#endif
+ IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_aligned)
+ )
+
IFUNC_IMPL (i, name, strchr,
#if !defined __loongarch_soft_float
IFUNC_IMPL_ADD (array, i, strchr, SUPPORT_LASX, __strchr_lasx)
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-strnlen.h b/sysdeps/loongarch/lp64/multiarch/ifunc-strnlen.h
new file mode 100644
index 00000000..5cf89810
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-strnlen.h
@@ -0,0 +1,41 @@
+/* Common definition for strnlen ifunc selections.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <ldsodefs.h>
+#include <ifunc-init.h>
+
+#if !defined __loongarch_soft_float
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lasx) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden;
+#endif
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+#if !defined __loongarch_soft_float
+ if (SUPPORT_LASX)
+ return OPTIMIZE (lasx);
+ else if (SUPPORT_LSX)
+ return OPTIMIZE (lsx);
+ else
+#endif
+ return OPTIMIZE (aligned);
+}
diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S b/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S
new file mode 100644
index 00000000..b900430a
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S
@@ -0,0 +1,102 @@
+/* Optimized strnlen implementation using basic Loongarch instructions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc)
+# define STRNLEN __strnlen_aligned
+#else
+# define STRNLEN __strnlen
+#endif
+
+LEAF(STRNLEN, 6)
+ beqz a1, L(out)
+ lu12i.w a2, 0x01010
+ andi t1, a0, 0x7
+ move t4, a0
+
+ bstrins.d a0, zero, 2, 0
+ ori a2, a2, 0x101
+ li.w t0, -1
+ ld.d t2, a0, 0
+
+ slli.d t3, t1, 3
+ bstrins.d a2, a2, 63, 32
+ li.w t5, 8
+ slli.d a3, a2, 7
+
+ sub.w t1, t5, t1
+ sll.d t0, t0, t3
+ orn t2, t2, t0
+ sub.d t0, t2, a2
+
+
+ andn t3, a3, t2
+ and t0, t0, t3
+ bnez t0, L(count_pos)
+ sub.d t5, a1, t1
+
+ bgeu t1, a1, L(out)
+ addi.d a0, a0, 8
+L(loop):
+ ld.d t2, a0, 0
+ sub.d t0, t2, a2
+
+ andn t1, a3, t2
+ sltui t6, t5, 9
+ and t0, t0, t1
+ or t7, t0, t6
+
+ bnez t7, L(count_pos)
+ ld.d t2, a0, 8
+ addi.d a0, a0, 16
+ sub.d t0, t2, a2
+
+
+ andn t1, a3, t2
+ sltui t6, t5, 17
+ and t0, t0, t1
+ addi.d t5, t5, -16
+
+ or t7, t0, t6
+ beqz t7, L(loop)
+ addi.d a0, a0, -8
+L(count_pos):
+ ctz.d t1, t0
+
+ sub.d a0, a0, t4
+ srli.d t1, t1, 3
+ add.d a0, t1, a0
+ sltu t0, a0, a1
+
+ masknez t1, a1, t0
+ maskeqz a0, a0, t0
+ or a0, a0, t1
+ jr ra
+
+
+L(out):
+ move a0, a1
+ jr ra
+END(STRNLEN)
+
+weak_alias (STRNLEN, strnlen)
+libc_hidden_builtin_def (STRNLEN)
diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S b/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S
new file mode 100644
index 00000000..2c03d3d9
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S
@@ -0,0 +1,100 @@
+/* Optimized strnlen implementation using loongarch LASX instructions
+ Copyright (C) 2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) && !defined __loongarch_soft_float
+
+# define STRNLEN __strnlen_lasx
+
+LEAF(STRNLEN, 6)
+ beqz a1, L(ret0)
+ andi t1, a0, 0x3f
+ li.d t3, 65
+ sub.d a2, a0, t1
+
+ xvld xr0, a2, 0
+ xvld xr1, a2, 32
+ sub.d t1, t3, t1
+ move a3, a0
+
+ sltu t1, a1, t1
+ xvmsknz.b xr0, xr0
+ xvmsknz.b xr1, xr1
+ xvpickve.w xr2, xr0, 4
+
+ xvpickve.w xr3, xr1, 4
+ vilvl.h vr0, vr2, vr0
+ vilvl.h vr1, vr3, vr1
+ vilvl.w vr0, vr1, vr0
+
+
+ movfr2gr.d t0, fa0
+ sra.d t0, t0, a0
+ orn t1, t1, t0
+ bnez t1, L(end)
+
+ add.d a4, a0, a1
+ move a0, a2
+ addi.d a4, a4, -1
+ bstrins.d a4, zero, 5, 0
+
+L(loop):
+ xvld xr0, a0, 64
+ xvld xr1, a0, 96
+ addi.d a0, a0, 64
+ beq a0, a4, L(out)
+
+ xvmin.bu xr2, xr0, xr1
+ xvsetanyeqz.b fcc0, xr2
+ bceqz fcc0, L(loop)
+L(out):
+ xvmsknz.b xr0, xr0
+
+
+ xvmsknz.b xr1, xr1
+ xvpickve.w xr2, xr0, 4
+ xvpickve.w xr3, xr1, 4
+ vilvl.h vr0, vr2, vr0
+
+ vilvl.h vr1, vr3, vr1
+ vilvl.w vr0, vr1, vr0
+ movfr2gr.d t0, fa0
+L(end):
+ sub.d a0, a0, a3
+
+ cto.d t0, t0
+ add.d a0, a0, t0
+ sltu t1, a0, a1
+ masknez t0, a1, t1
+
+ maskeqz t1, a0, t1
+ or a0, t0, t1
+ jr ra
+L(ret0):
+ move a0, zero
+
+
+ jr ra
+END(STRNLEN)
+
+libc_hidden_def (STRNLEN)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S b/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S
new file mode 100644
index 00000000..b769a895
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S
@@ -0,0 +1,89 @@
+/* Optimized strnlen implementation using loongarch LSX instructions
+ Copyright (C) 2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) && !defined __loongarch_soft_float
+
+# define STRNLEN __strnlen_lsx
+
+LEAF(STRNLEN, 6)
+ beqz a1, L(ret0)
+ andi t1, a0, 0x1f
+ li.d t3, 33
+ sub.d a2, a0, t1
+
+ vld vr0, a2, 0
+ vld vr1, a2, 16
+ sub.d t1, t3, t1
+ move a3, a0
+
+ sltu t1, a1, t1
+ vmsknz.b vr0, vr0
+ vmsknz.b vr1, vr1
+ vilvl.h vr0, vr1, vr0
+
+ movfr2gr.s t0, fa0
+ sra.w t0, t0, a0
+ orn t1, t1, t0
+ bnez t1, L(end)
+
+
+ add.d a4, a0, a1
+ move a0, a2
+ addi.d a4, a4, -1
+ bstrins.d a4, zero, 4, 0
+
+L(loop):
+ vld vr0, a0, 32
+ vld vr1, a0, 48
+ addi.d a0, a0, 32
+ beq a0, a4, L(out)
+
+ vmin.bu vr2, vr0, vr1
+ vsetanyeqz.b fcc0, vr2
+ bceqz fcc0, L(loop)
+L(out):
+ vmsknz.b vr0, vr0
+
+ vmsknz.b vr1, vr1
+ vilvl.h vr0, vr1, vr0
+ movfr2gr.s t0, fa0
+L(end):
+ sub.d a0, a0, a3
+
+
+ cto.w t0, t0
+ add.d a0, a0, t0
+ sltu t1, a0, a1
+ masknez t0, a1, t1
+
+ maskeqz t1, a0, t1
+ or a0, t0, t1
+ jr ra
+L(ret0):
+ move a0, zero
+
+ jr ra
+END(STRNLEN)
+
+libc_hidden_builtin_def (STRNLEN)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen.c b/sysdeps/loongarch/lp64/multiarch/strnlen.c
new file mode 100644
index 00000000..38b7a25a
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strnlen.c
@@ -0,0 +1,39 @@
+/* Multiple versions of strnlen.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# define strnlen __redirect_strnlen
+# define __strnlen __redirect___strnlen
+# include <string.h>
+# undef __strnlen
+# undef strnlen
+
+# define SYMBOL_NAME strnlen
+# include "ifunc-strnlen.h"
+
+libc_ifunc_redirected (__redirect_strnlen, __strnlen, IFUNC_SELECTOR ());
+weak_alias (__strnlen, strnlen);
+# ifdef SHARED
+__hidden_ver1 (__strnlen, __GI___strnlen, __redirect___strnlen)
+ __attribute__((visibility ("hidden"))) __attribute_copy__ (strnlen);
+__hidden_ver1 (strnlen, __GI_strnlen, __redirect_strnlen)
+ __attribute__((weak, visibility ("hidden"))) __attribute_copy__ (strnlen);
+# endif
+#endif
--
2.33.0


@@ -0,0 +1,670 @@
From d537d0ab45a55048c8da483e73be4448ddb45525 Mon Sep 17 00:00:00 2001
From: dengjianbo <dengjianbo@loongson.cn>
Date: Wed, 13 Sep 2023 15:35:00 +0800
Subject: [PATCH 23/29] LoongArch: Add ifunc support for strrchr{aligned, lsx,
 lasx}

According to the glibc strrchr microbenchmark results, this implementation
reduces runtime as follows:

Name              Percent of runtime reduced
strrchr-lasx      10%-50%
strrchr-lsx       0%-50%
strrchr-aligned   5%-50%

Generic strrchr is implemented as strlen + memrchr. The lasx version is
compared against a generic strrchr built from strlen-lasx + memrchr-lasx,
the lsx version against strlen-lsx + memrchr-lsx, and the aligned version
against strlen-aligned + memrchr-generic.
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
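
For reference, the generic strrchr baseline named above is a two-call composition; a minimal sketch of its shape (the actual generic code lives in glibc's string/ directory, this only shows what is being benchmarked):

#include <string.h>

/* strlen finds the terminating NUL once; memrchr then scans backwards.
   Searching strlen (s) + 1 bytes also handles c == '\0'.  */
char *
generic_strrchr (const char *s, int c)
{
  return (char *) memrchr (s, c, strlen (s) + 1);
}
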
---
sysdeps/loongarch/lp64/multiarch/Makefile | 3 +
.../lp64/multiarch/ifunc-impl-list.c | 8 +
.../loongarch/lp64/multiarch/ifunc-strrchr.h | 41 ++++
.../lp64/multiarch/strrchr-aligned.S | 170 +++++++++++++++++
.../loongarch/lp64/multiarch/strrchr-lasx.S | 176 ++++++++++++++++++
.../loongarch/lp64/multiarch/strrchr-lsx.S | 144 ++++++++++++++
sysdeps/loongarch/lp64/multiarch/strrchr.c | 36 ++++
7 files changed, 578 insertions(+)
create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-strrchr.h
create mode 100644 sysdeps/loongarch/lp64/multiarch/strrchr-aligned.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/strrchr.c
diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile
index 39550bea..fe863e1b 100644
--- a/sysdeps/loongarch/lp64/multiarch/Makefile
+++ b/sysdeps/loongarch/lp64/multiarch/Makefile
@@ -9,6 +9,9 @@ sysdep_routines += \
strchr-aligned \
strchr-lsx \
strchr-lasx \
+ strrchr-aligned \
+ strrchr-lsx \
+ strrchr-lasx \
strchrnul-aligned \
strchrnul-lsx \
strchrnul-lasx \
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
index 39a14f1d..529e2369 100644
--- a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
@@ -94,6 +94,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_aligned)
)
+ IFUNC_IMPL (i, name, strrchr,
+#if !defined __loongarch_soft_float
+ IFUNC_IMPL_ADD (array, i, strrchr, SUPPORT_LASX, __strrchr_lasx)
+ IFUNC_IMPL_ADD (array, i, strrchr, SUPPORT_LSX, __strrchr_lsx)
+#endif
+ IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_aligned)
+ )
+
IFUNC_IMPL (i, name, memcpy,
#if !defined __loongarch_soft_float
IFUNC_IMPL_ADD (array, i, memcpy, SUPPORT_LASX, __memcpy_lasx)
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-strrchr.h b/sysdeps/loongarch/lp64/multiarch/ifunc-strrchr.h
new file mode 100644
index 00000000..bbb34089
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-strrchr.h
@@ -0,0 +1,41 @@
+/* Common definition for strrchr ifunc selections.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <ldsodefs.h>
+#include <ifunc-init.h>
+
+#if !defined __loongarch_soft_float
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lasx) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden;
+#endif
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+#if !defined __loongarch_soft_float
+ if (SUPPORT_LASX)
+ return OPTIMIZE (lasx);
+ else if (SUPPORT_LSX)
+ return OPTIMIZE (lsx);
+ else
+#endif
+ return OPTIMIZE (aligned);
+}
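
Outside glibc, the same selection policy can be sketched with the GNU ifunc attribute and the Linux/LoongArch hwcap bits that SUPPORT_LASX/SUPPORT_LSX reduce to. This is a user-space sketch, not the glibc mechanism; the __strrchr_* declarations stand in for the variants added by this patch and would not link as-is:

#include <sys/auxv.h>   /* getauxval */
#include <asm/hwcap.h>  /* HWCAP_LOONGARCH_LSX, HWCAP_LOONGARCH_LASX */

extern char *__strrchr_lasx (const char *, int);
extern char *__strrchr_lsx (const char *, int);
extern char *__strrchr_aligned (const char *, int);

/* Resolver run once at load time, mirroring IFUNC_SELECTOR: prefer
   LASX, then LSX, then the plain aligned fallback.  */
static char *(*resolve_strrchr (void)) (const char *, int)
{
  unsigned long hwcap = getauxval (AT_HWCAP);
  if (hwcap & HWCAP_LOONGARCH_LASX)
    return __strrchr_lasx;
  if (hwcap & HWCAP_LOONGARCH_LSX)
    return __strrchr_lsx;
  return __strrchr_aligned;
}

char *my_strrchr (const char *, int)
  __attribute__ ((ifunc ("resolve_strrchr")));
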
diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr-aligned.S b/sysdeps/loongarch/lp64/multiarch/strrchr-aligned.S
new file mode 100644
index 00000000..a73deb78
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strrchr-aligned.S
@@ -0,0 +1,170 @@
+/* Optimized strrchr implementation using basic LoongArch instructions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc)
+# define STRRCHR __strrchr_aligned
+#else
+# define STRRCHR strrchr
+#endif
+
+LEAF(STRRCHR, 6)
+ slli.d t0, a0, 3
+ bstrins.d a0, zero, 2, 0
+ lu12i.w a2, 0x01010
+ ld.d t2, a0, 0
+
+ andi a1, a1, 0xff
+ ori a2, a2, 0x101
+ li.d t3, -1
+ bstrins.d a2, a2, 63, 32
+
+ sll.d t5, t3, t0
+ slli.d a3, a2, 7
+ orn t4, t2, t5
+ mul.d a1, a1, a2
+
+ sub.d t0, t4, a2
+ andn t1, a3, t4
+ and t1, t0, t1
+ beqz t1, L(find_tail)
+
+
+ ctz.d t0, t1
+ orn t0, zero, t0
+ xor t2, t4, a1
+ srl.d t0, t3, t0
+
+ orn t2, t2, t0
+ orn t2, t2, t5
+ revb.d t2, t2
+ sub.d t1, t2, a2
+
+ andn t0, a3, t2
+ and t1, t0, t1
+ ctz.d t0, t1
+ srli.d t0, t0, 3
+
+ addi.d a0, a0, 7
+ sub.d a0, a0, t0
+ maskeqz a0, a0, t1
+ jr ra
+
+
+L(find_tail):
+ addi.d a4, a0, 8
+ addi.d a0, a0, 8
+L(loop_ascii):
+ ld.d t2, a0, 0
+ sub.d t1, t2, a2
+
+ and t0, t1, a3
+ bnez t0, L(more_check)
+ ld.d t2, a0, 8
+ sub.d t1, t2, a2
+
+ and t0, t1, a3
+ addi.d a0, a0, 16
+ beqz t0, L(loop_ascii)
+ addi.d a0, a0, -8
+
+L(more_check):
+ andn t0, a3, t2
+ and t1, t1, t0
+ bnez t1, L(tail)
+ addi.d a0, a0, 8
+
+
+L(loop_nonascii):
+ ld.d t2, a0, 0
+ sub.d t1, t2, a2
+ andn t0, a3, t2
+ and t1, t0, t1
+
+ bnez t1, L(tail)
+ ld.d t2, a0, 8
+ addi.d a0, a0, 16
+ sub.d t1, t2, a2
+
+ andn t0, a3, t2
+ and t1, t0, t1
+ beqz t1, L(loop_nonascii)
+ addi.d a0, a0, -8
+
+L(tail):
+ ctz.d t0, t1
+ orn t0, zero, t0
+ xor t2, t2, a1
+ srl.d t0, t3, t0
+
+
+ orn t2, t2, t0
+ revb.d t2, t2
+ sub.d t1, t2, a2
+ andn t0, a3, t2
+
+ and t1, t0, t1
+ bnez t1, L(count_pos)
+L(find_loop):
+ beq a0, a4, L(find_end)
+ ld.d t2, a0, -8
+
+ addi.d a0, a0, -8
+ xor t2, t2, a1
+ sub.d t1, t2, a2
+ andn t0, a3, t2
+
+ and t1, t0, t1
+ beqz t1, L(find_loop)
+ revb.d t2, t2
+ sub.d t1, t2, a2
+
+
+ andn t0, a3, t2
+ and t1, t0, t1
+L(count_pos):
+ ctz.d t0, t1
+ addi.d a0, a0, 7
+
+ srli.d t0, t0, 3
+ sub.d a0, a0, t0
+ jr ra
+ nop
+
+L(find_end):
+ xor t2, t4, a1
+ orn t2, t2, t5
+ revb.d t2, t2
+ sub.d t1, t2, a2
+
+
+ andn t0, a3, t2
+ and t1, t0, t1
+ ctz.d t0, t1
+ srli.d t0, t0, 3
+
+ addi.d a0, a4, -1
+ sub.d a0, a0, t0
+ maskeqz a0, a0, t1
+ jr ra
+END(STRRCHR)
+
+libc_hidden_builtin_def(STRRCHR)
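
The per-word search above relies on one trick worth spelling out: XOR with the repeated character turns matches into zero bytes, and because the zero-byte test is only exact at its lowest set bit, the word is byte-reversed (revb.d) so that count-trailing-zeros finds the last match rather than the first. A C sketch under those assumptions (little-endian, 64-bit; not glibc source):

#include <stdint.h>

/* Index (0-7) of the last byte of WORD equal to C, or -1 if none.  */
static inline int
last_match_in_word (uint64_t word, uint8_t c)
{
  const uint64_t ones  = 0x0101010101010101ULL;
  const uint64_t highs = 0x8080808080808080ULL;
  /* revb.d: reverse bytes first, so the first zero byte found below
     corresponds to the last matching byte of the original word.  */
  uint64_t x = __builtin_bswap64 (word ^ (ones * (uint64_t) c));
  uint64_t m = (x - ones) & ~x & highs;
  if (m == 0)
    return -1;
  return 7 - (__builtin_ctzll (m) >> 3);
}
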
diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S
new file mode 100644
index 00000000..5a6e2297
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strrchr-lasx.S
@@ -0,0 +1,176 @@
+/* Optimized strrchr implementation using LoongArch LASX instructions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) && !defined __loongarch_soft_float
+
+#define STRRCHR __strrchr_lasx
+
+LEAF(STRRCHR, 6)
+ move a2, a0
+ bstrins.d a0, zero, 5, 0
+ xvld xr0, a0, 0
+ xvld xr1, a0, 32
+
+ li.d t2, -1
+ xvreplgr2vr.b xr4, a1
+ xvmsknz.b xr2, xr0
+ xvmsknz.b xr3, xr1
+
+ xvpickve.w xr5, xr2, 4
+ xvpickve.w xr6, xr3, 4
+ vilvl.h vr2, vr5, vr2
+ vilvl.h vr3, vr6, vr3
+
+ vilvl.w vr2, vr3, vr2
+ movfr2gr.d t0, fa2
+ sra.d t0, t0, a2
+ beq t0, t2, L(find_tail)
+
+
+ xvseq.b xr2, xr0, xr4
+ xvseq.b xr3, xr1, xr4
+ xvmsknz.b xr2, xr2
+ xvmsknz.b xr3, xr3
+
+ xvpickve.w xr4, xr2, 4
+ xvpickve.w xr5, xr3, 4
+ vilvl.h vr2, vr4, vr2
+ vilvl.h vr3, vr5, vr3
+
+ vilvl.w vr1, vr3, vr2
+ slli.d t3, t2, 1
+ movfr2gr.d t1, fa1
+ cto.d t0, t0
+
+ srl.d t1, t1, a2
+ sll.d t3, t3, t0
+ addi.d a0, a2, 63
+ andn t1, t1, t3
+
+
+ clz.d t0, t1
+ sub.d a0, a0, t0
+ maskeqz a0, a0, t1
+ jr ra
+
+ .align 5
+L(find_tail):
+ addi.d a3, a0, 64
+L(loop):
+ xvld xr2, a0, 64
+ xvld xr3, a0, 96
+ addi.d a0, a0, 64
+
+ xvmin.bu xr5, xr2, xr3
+ xvsetanyeqz.b fcc0, xr5
+ bceqz fcc0, L(loop)
+ xvmsknz.b xr5, xr2
+
+
+ xvmsknz.b xr6, xr3
+ xvpickve.w xr7, xr5, 4
+ xvpickve.w xr8, xr6, 4
+ vilvl.h vr5, vr7, vr5
+
+ vilvl.h vr6, vr8, vr6
+ xvseq.b xr2, xr2, xr4
+ xvseq.b xr3, xr3, xr4
+ xvmsknz.b xr2, xr2
+
+ xvmsknz.b xr3, xr3
+ xvpickve.w xr7, xr2, 4
+ xvpickve.w xr8, xr3, 4
+ vilvl.h vr2, vr7, vr2
+
+ vilvl.h vr3, vr8, vr3
+ vilvl.w vr5, vr6, vr5
+ vilvl.w vr2, vr3, vr2
+ movfr2gr.d t0, fa5
+
+
+ movfr2gr.d t1, fa2
+ slli.d t3, t2, 1
+ cto.d t0, t0
+ sll.d t3, t3, t0
+
+ andn t1, t1, t3
+ beqz t1, L(find_loop)
+ clz.d t0, t1
+ addi.d a0, a0, 63
+
+ sub.d a0, a0, t0
+ jr ra
+L(find_loop):
+ beq a0, a3, L(find_end)
+ xvld xr2, a0, -64
+
+ xvld xr3, a0, -32
+ addi.d a0, a0, -64
+ xvseq.b xr2, xr2, xr4
+ xvseq.b xr3, xr3, xr4
+
+
+ xvmax.bu xr5, xr2, xr3
+ xvseteqz.v fcc0, xr5
+ bcnez fcc0, L(find_loop)
+ xvmsknz.b xr0, xr2
+
+ xvmsknz.b xr1, xr3
+ xvpickve.w xr2, xr0, 4
+ xvpickve.w xr3, xr1, 4
+ vilvl.h vr0, vr2, vr0
+
+ vilvl.h vr1, vr3, vr1
+ vilvl.w vr0, vr1, vr0
+ movfr2gr.d t0, fa0
+ addi.d a0, a0, 63
+
+ clz.d t0, t0
+ sub.d a0, a0, t0
+ jr ra
+ nop
+
+
+L(find_end):
+ xvseq.b xr2, xr0, xr4
+ xvseq.b xr3, xr1, xr4
+ xvmsknz.b xr2, xr2
+ xvmsknz.b xr3, xr3
+
+ xvpickve.w xr4, xr2, 4
+ xvpickve.w xr5, xr3, 4
+ vilvl.h vr2, vr4, vr2
+ vilvl.h vr3, vr5, vr3
+
+ vilvl.w vr1, vr3, vr2
+ movfr2gr.d t1, fa1
+ addi.d a0, a2, 63
+ srl.d t1, t1, a2
+
+ clz.d t0, t1
+ sub.d a0, a0, t0
+ maskeqz a0, a0, t1
+ jr ra
+END(STRRCHR)
+
+libc_hidden_builtin_def(STRRCHR)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S
new file mode 100644
index 00000000..8f2fd22e
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strrchr-lsx.S
@@ -0,0 +1,144 @@
+/* Optimized strrchr implementation using LoongArch LSX instructions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) && !defined __loongarch_soft_float
+
+#define STRRCHR __strrchr_lsx
+
+LEAF(STRRCHR, 6)
+ move a2, a0
+ bstrins.d a0, zero, 4, 0
+ vld vr0, a0, 0
+ vld vr1, a0, 16
+
+ li.d t2, -1
+ vreplgr2vr.b vr4, a1
+ vmsknz.b vr2, vr0
+ vmsknz.b vr3, vr1
+
+ vilvl.h vr2, vr3, vr2
+ movfr2gr.s t0, fa2
+ sra.w t0, t0, a2
+ beq t0, t2, L(find_tail)
+
+ vseq.b vr2, vr0, vr4
+ vseq.b vr3, vr1, vr4
+ vmsknz.b vr2, vr2
+ vmsknz.b vr3, vr3
+
+
+ vilvl.h vr1, vr3, vr2
+ slli.d t3, t2, 1
+ movfr2gr.s t1, fa1
+ cto.w t0, t0
+
+ srl.w t1, t1, a2
+ sll.d t3, t3, t0
+ addi.d a0, a2, 31
+ andn t1, t1, t3
+
+ clz.w t0, t1
+ sub.d a0, a0, t0
+ maskeqz a0, a0, t1
+ jr ra
+
+ .align 5
+L(find_tail):
+ addi.d a3, a0, 32
+L(loop):
+ vld vr2, a0, 32
+ vld vr3, a0, 48
+ addi.d a0, a0, 32
+
+ vmin.bu vr5, vr2, vr3
+ vsetanyeqz.b fcc0, vr5
+ bceqz fcc0, L(loop)
+ vmsknz.b vr5, vr2
+
+ vmsknz.b vr6, vr3
+ vilvl.h vr5, vr6, vr5
+ vseq.b vr2, vr2, vr4
+ vseq.b vr3, vr3, vr4
+
+ vmsknz.b vr2, vr2
+ vmsknz.b vr3, vr3
+ vilvl.h vr2, vr3, vr2
+ movfr2gr.s t0, fa5
+
+
+ movfr2gr.s t1, fa2
+ slli.d t3, t2, 1
+ cto.w t0, t0
+ sll.d t3, t3, t0
+
+ andn t1, t1, t3
+ beqz t1, L(find_loop)
+ clz.w t0, t1
+ addi.d a0, a0, 31
+
+ sub.d a0, a0, t0
+ jr ra
+L(find_loop):
+ beq a0, a3, L(find_end)
+ vld vr2, a0, -32
+
+ vld vr3, a0, -16
+ addi.d a0, a0, -32
+ vseq.b vr2, vr2, vr4
+ vseq.b vr3, vr3, vr4
+
+
+ vmax.bu vr5, vr2, vr3
+ vseteqz.v fcc0, vr5
+ bcnez fcc0, L(find_loop)
+ vmsknz.b vr0, vr2
+
+ vmsknz.b vr1, vr3
+ vilvl.h vr0, vr1, vr0
+ movfr2gr.s t0, fa0
+ addi.d a0, a0, 31
+
+ clz.w t0, t0
+ sub.d a0, a0, t0
+ jr ra
+ nop
+
+L(find_end):
+ vseq.b vr2, vr0, vr4
+ vseq.b vr3, vr1, vr4
+ vmsknz.b vr2, vr2
+ vmsknz.b vr3, vr3
+
+
+ vilvl.h vr1, vr3, vr2
+ movfr2gr.s t1, fa1
+ addi.d a0, a2, 31
+ srl.w t1, t1, a2
+
+ clz.w t0, t1
+ sub.d a0, a0, t0
+ maskeqz a0, a0, t1
+ jr ra
+END(STRRCHR)
+
+libc_hidden_builtin_def(STRRCHR)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/strrchr.c b/sysdeps/loongarch/lp64/multiarch/strrchr.c
new file mode 100644
index 00000000..d9c9f660
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strrchr.c
@@ -0,0 +1,36 @@
+/* Multiple versions of strrchr.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# define strrchr __redirect_strrchr
+# include <string.h>
+# undef strrchr
+
+# define SYMBOL_NAME strrchr
+# include "ifunc-strrchr.h"
+
+libc_ifunc_redirected (__redirect_strrchr, strrchr, IFUNC_SELECTOR ());
+weak_alias (strrchr, rindex)
+# ifdef SHARED
+__hidden_ver1 (strrchr, __GI_strrchr, __redirect_strrchr)
+ __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strrchr);
+# endif
+
+#endif
--
2.33.0


@@ -0,0 +1,626 @@
From b5979df8ad07823c79a934c1fa0a91ec0abffb61 Mon Sep 17 00:00:00 2001
From: caiyinyu <caiyinyu@loongson.cn>
Date: Fri, 8 Sep 2023 14:10:55 +0800
Subject: [PATCH 20/29] LoongArch: Add lasx/lsx support for
_dl_runtime_profile.
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
sysdeps/loongarch/bits/link.h | 24 ++-
sysdeps/loongarch/bits/link_lavcurrent.h | 25 +++
sysdeps/loongarch/dl-audit-check.h | 23 +++
sysdeps/loongarch/dl-link.sym | 8 +-
sysdeps/loongarch/dl-machine.h | 11 +-
sysdeps/loongarch/dl-trampoline.S | 177 +----------------
sysdeps/loongarch/dl-trampoline.h | 242 +++++++++++++++++++++++
7 files changed, 331 insertions(+), 179 deletions(-)
create mode 100644 sysdeps/loongarch/bits/link_lavcurrent.h
create mode 100644 sysdeps/loongarch/dl-audit-check.h
diff --git a/sysdeps/loongarch/bits/link.h b/sysdeps/loongarch/bits/link.h
index 7fa61312..00f6f25f 100644
--- a/sysdeps/loongarch/bits/link.h
+++ b/sysdeps/loongarch/bits/link.h
@@ -20,10 +20,26 @@
#error "Never include <bits/link.h> directly; use <link.h> instead."
#endif
+#ifndef __loongarch_soft_float
+typedef float La_loongarch_vr
+ __attribute__ ((__vector_size__ (16), __aligned__ (16)));
+typedef float La_loongarch_xr
+ __attribute__ ((__vector_size__ (32), __aligned__ (16)));
+
+typedef union
+{
+ double fpreg[4];
+ La_loongarch_vr vr[2];
+ La_loongarch_xr xr[1];
+} La_loongarch_vector __attribute__ ((__aligned__ (16)));
+#endif
+
typedef struct La_loongarch_regs
{
unsigned long int lr_reg[8]; /* a0 - a7 */
- double lr_fpreg[8]; /* fa0 - fa7 */
+#ifndef __loongarch_soft_float
+ La_loongarch_vector lr_vec[8]; /* fa0 - fa7 or vr0 - vr7 or xr0 - xr7*/
+#endif
unsigned long int lr_ra;
unsigned long int lr_sp;
} La_loongarch_regs;
@@ -33,8 +49,10 @@ typedef struct La_loongarch_retval
{
unsigned long int lrv_a0;
unsigned long int lrv_a1;
- double lrv_fa0;
- double lrv_fa1;
+#ifndef __loongarch_soft_float
+ La_loongarch_vector lrv_vec0;
+ La_loongarch_vector lrv_vec1;
+#endif
} La_loongarch_retval;
__BEGIN_DECLS
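
The union above lets one save slot hold whichever form the register file actually carries: a double (fa0), an LSX vr register, or a LASX xr register, all overlaid at offset 0. A sketch of the layout it guarantees (assuming a LoongArch hard-float toolchain; the type comes from <link.h>, and the checks are illustrative, not part of the patch):

#include <link.h>

/* Each slot is as wide as one LASX register and aligned for vld/xvld.  */
_Static_assert (sizeof (La_loongarch_vector) == 32, "one xr register");
_Static_assert (_Alignof (La_loongarch_vector) == 16, "16-byte aligned");
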
diff --git a/sysdeps/loongarch/bits/link_lavcurrent.h b/sysdeps/loongarch/bits/link_lavcurrent.h
new file mode 100644
index 00000000..15f1eb84
--- /dev/null
+++ b/sysdeps/loongarch/bits/link_lavcurrent.h
@@ -0,0 +1,25 @@
+/* Data structure for communication from the run-time dynamic linker for
+ loaded ELF shared objects. LAV_CURRENT definition.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _LINK_H
+# error "Never include <bits/link_lavcurrent.h> directly; use <link.h> instead."
+#endif
+
+/* Version numbers for la_version handshake interface. */
+#define LAV_CURRENT 3
diff --git a/sysdeps/loongarch/dl-audit-check.h b/sysdeps/loongarch/dl-audit-check.h
new file mode 100644
index 00000000..a139c939
--- /dev/null
+++ b/sysdeps/loongarch/dl-audit-check.h
@@ -0,0 +1,23 @@
+/* rtld-audit version check. LoongArch version.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+static inline bool
+_dl_audit_check_version (unsigned int lav)
+{
+ return lav == LAV_CURRENT;
+}
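
This is the strict half of the rtld-audit handshake: the dynamic linker passes LAV_CURRENT (now 3, per link_lavcurrent.h above) to an audit module's la_version, and on LoongArch only an exact match keeps the module active, since the La_loongarch_regs layout changed. A minimal module sketch honoring that contract:

#include <link.h>

/* ld.so ignores the module if la_version returns 0.  */
unsigned int
la_version (unsigned int version)
{
  return version == LAV_CURRENT ? LAV_CURRENT : 0;
}

Built with gcc -shared -fPIC and loaded via LD_AUDIT, a module written against the old register layout is now rejected instead of silently reading the wrong save area.
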
diff --git a/sysdeps/loongarch/dl-link.sym b/sysdeps/loongarch/dl-link.sym
index 868ab7c6..b534968e 100644
--- a/sysdeps/loongarch/dl-link.sym
+++ b/sysdeps/loongarch/dl-link.sym
@@ -6,9 +6,13 @@ DL_SIZEOF_RG sizeof(struct La_loongarch_regs)
DL_SIZEOF_RV sizeof(struct La_loongarch_retval)
DL_OFFSET_RG_A0 offsetof(struct La_loongarch_regs, lr_reg)
-DL_OFFSET_RG_FA0 offsetof(struct La_loongarch_regs, lr_fpreg)
+#ifndef __loongarch_soft_float
+DL_OFFSET_RG_VEC0 offsetof(struct La_loongarch_regs, lr_vec)
+#endif
DL_OFFSET_RG_RA offsetof(struct La_loongarch_regs, lr_ra)
DL_OFFSET_RG_SP offsetof(struct La_loongarch_regs, lr_sp)
DL_OFFSET_RV_A0 offsetof(struct La_loongarch_retval, lrv_a0)
-DL_OFFSET_RV_FA0 offsetof(struct La_loongarch_retval, lrv_a1)
+#ifndef __loongarch_soft_float
+DL_OFFSET_RV_VEC0 offsetof(struct La_loongarch_retval, lrv_vec0)
+#endif
diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h
index 066bb233..8a2db9de 100644
--- a/sysdeps/loongarch/dl-machine.h
+++ b/sysdeps/loongarch/dl-machine.h
@@ -273,6 +273,8 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
#if !defined __loongarch_soft_float
extern void _dl_runtime_resolve_lasx (void) attribute_hidden;
extern void _dl_runtime_resolve_lsx (void) attribute_hidden;
+ extern void _dl_runtime_profile_lasx (void) attribute_hidden;
+ extern void _dl_runtime_profile_lsx (void) attribute_hidden;
#endif
extern void _dl_runtime_resolve (void) attribute_hidden;
extern void _dl_runtime_profile (void) attribute_hidden;
@@ -287,7 +289,14 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
end in this function. */
if (profile != 0)
{
- gotplt[0] = (ElfW(Addr)) &_dl_runtime_profile;
+#if !defined __loongarch_soft_float
+ if (SUPPORT_LASX)
+ gotplt[0] = (ElfW(Addr)) &_dl_runtime_profile_lasx;
+ else if (SUPPORT_LSX)
+ gotplt[0] = (ElfW(Addr)) &_dl_runtime_profile_lsx;
+ else
+#endif
+ gotplt[0] = (ElfW(Addr)) &_dl_runtime_profile;
if (GLRO(dl_profile) != NULL
&& _dl_name_match_p (GLRO(dl_profile), l))
diff --git a/sysdeps/loongarch/dl-trampoline.S b/sysdeps/loongarch/dl-trampoline.S
index 8fd91469..bb449ecf 100644
--- a/sysdeps/loongarch/dl-trampoline.S
+++ b/sysdeps/loongarch/dl-trampoline.S
@@ -22,190 +22,21 @@
#if !defined __loongarch_soft_float
#define USE_LASX
#define _dl_runtime_resolve _dl_runtime_resolve_lasx
+#define _dl_runtime_profile _dl_runtime_profile_lasx
#include "dl-trampoline.h"
#undef FRAME_SIZE
#undef USE_LASX
#undef _dl_runtime_resolve
+#undef _dl_runtime_profile
#define USE_LSX
#define _dl_runtime_resolve _dl_runtime_resolve_lsx
+#define _dl_runtime_profile _dl_runtime_profile_lsx
#include "dl-trampoline.h"
#undef FRAME_SIZE
#undef USE_LSX
#undef _dl_runtime_resolve
+#undef _dl_runtime_profile
#endif
#include "dl-trampoline.h"
-
-#include "dl-link.h"
-
-ENTRY (_dl_runtime_profile)
- /* LoongArch we get called with:
- t0 linkr_map pointer
- t1 the scaled offset stored in t0, which can be used
- to calculate the offset of the current symbol in .rela.plt
- t2 %hi(%pcrel(.got.plt)) stored in t2, no use in this function
- t3 dl resolver entry point, no use in this function
-
- Stack frame layout:
- [sp, #96] La_loongarch_regs
- [sp, #48] La_loongarch_retval
- [sp, #40] frame size return from pltenter
- [sp, #32] dl_profile_call saved a1
- [sp, #24] dl_profile_call saved a0
- [sp, #16] T1
- [sp, #0] ra, fp <- fp
- */
-
-# define OFFSET_T1 16
-# define OFFSET_SAVED_CALL_A0 OFFSET_T1 + 8
-# define OFFSET_FS OFFSET_SAVED_CALL_A0 + 16
-# define OFFSET_RV OFFSET_FS + 8
-# define OFFSET_RG OFFSET_RV + DL_SIZEOF_RV
-
-# define SF_SIZE (-(-(OFFSET_RG + DL_SIZEOF_RG) & ALMASK))
-
- /* Save arguments to stack. */
- ADDI sp, sp, -SF_SIZE
- REG_S ra, sp, 0
- REG_S fp, sp, 8
-
- or fp, sp, zero
-
- REG_S a0, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 0*SZREG
- REG_S a1, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 1*SZREG
- REG_S a2, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 2*SZREG
- REG_S a3, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 3*SZREG
- REG_S a4, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 4*SZREG
- REG_S a5, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 5*SZREG
- REG_S a6, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 6*SZREG
- REG_S a7, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 7*SZREG
-
-#ifndef __loongarch_soft_float
- FREG_S fa0, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 0*SZFREG
- FREG_S fa1, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 1*SZFREG
- FREG_S fa2, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 2*SZFREG
- FREG_S fa3, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 3*SZFREG
- FREG_S fa4, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 4*SZFREG
- FREG_S fa5, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 5*SZFREG
- FREG_S fa6, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 6*SZFREG
- FREG_S fa7, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 7*SZFREG
-#endif
-
- /* Update .got.plt and obtain runtime address of callee. */
- SLLI a1, t1, 1
- or a0, t0, zero
- ADD a1, a1, t1
- or a2, ra, zero /* return addr */
- ADDI a3, fp, OFFSET_RG /* La_loongarch_regs pointer */
- ADDI a4, fp, OFFSET_FS /* frame size return from pltenter */
-
- REG_S a0, fp, OFFSET_SAVED_CALL_A0
- REG_S a1, fp, OFFSET_SAVED_CALL_A0 + SZREG
-
- la t2, _dl_profile_fixup
- jirl ra, t2, 0
-
- REG_L t3, fp, OFFSET_FS
- bge t3, zero, 1f
-
- /* Save the return. */
- or t4, v0, zero
-
- /* Restore arguments from stack. */
- REG_L a0, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 0*SZREG
- REG_L a1, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 1*SZREG
- REG_L a2, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 2*SZREG
- REG_L a3, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 3*SZREG
- REG_L a4, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 4*SZREG
- REG_L a5, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 5*SZREG
- REG_L a6, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 6*SZREG
- REG_L a7, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 7*SZREG
-
-#ifndef __loongarch_soft_float
- FREG_L fa0, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 0*SZFREG
- FREG_L fa1, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 1*SZFREG
- FREG_L fa2, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 2*SZFREG
- FREG_L fa3, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 3*SZFREG
- FREG_L fa4, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 4*SZFREG
- FREG_L fa5, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 5*SZFREG
- FREG_L fa6, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 6*SZFREG
- FREG_L fa7, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 7*SZFREG
-#endif
-
- REG_L ra, fp, 0
- REG_L fp, fp, SZREG
-
- ADDI sp, sp, SF_SIZE
- jirl zero, t4, 0
-
-1:
- /* The new frame size is in t3. */
- SUB sp, fp, t3
- BSTRINS sp, zero, 3, 0
-
- REG_S a0, fp, OFFSET_T1
-
- or a0, sp, zero
- ADDI a1, fp, SF_SIZE
- or a2, t3, zero
- la t5, memcpy
- jirl ra, t5, 0
-
- REG_L t6, fp, OFFSET_T1
-
- /* Call the function. */
- REG_L a0, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 0*SZREG
- REG_L a1, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 1*SZREG
- REG_L a2, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 2*SZREG
- REG_L a3, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 3*SZREG
- REG_L a4, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 4*SZREG
- REG_L a5, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 5*SZREG
- REG_L a6, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 6*SZREG
- REG_L a7, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 7*SZREG
-
-#ifndef __loongarch_soft_float
- FREG_L fa0, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 0*SZFREG
- FREG_L fa1, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 1*SZFREG
- FREG_L fa2, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 2*SZFREG
- FREG_L fa3, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 3*SZFREG
- FREG_L fa4, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 4*SZFREG
- FREG_L fa5, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 5*SZFREG
- FREG_L fa6, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 6*SZFREG
- FREG_L fa7, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 7*SZFREG
-#endif
- jirl ra, t6, 0
-
- REG_S a0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_A0
- REG_S a1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_A0 + SZREG
-
-#ifndef __loongarch_soft_float
- FREG_S fa0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_FA0
- FREG_S fa1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_FA0 + SZFREG
-#endif
-
- /* Setup call to pltexit. */
- REG_L a0, fp, OFFSET_SAVED_CALL_A0
- REG_L a1, fp, OFFSET_SAVED_CALL_A0 + SZREG
- ADDI a2, fp, OFFSET_RG
- ADDI a3, fp, OFFSET_RV
- la t7, _dl_audit_pltexit
- jirl ra, t7, 0
-
- REG_L a0, fp, OFFSET_RV + DL_OFFSET_RV_A0
- REG_L a1, fp, OFFSET_RV + DL_OFFSET_RV_A0 + SZREG
-
-#ifndef __loongarch_soft_float
- FREG_L fa0, fp, OFFSET_RV + DL_OFFSET_RV_FA0
- FREG_L fa1, fp, OFFSET_RV + DL_OFFSET_RV_FA0 + SZFREG
-#endif
-
- /* RA from within La_loongarch_reg. */
- REG_L ra, fp, OFFSET_RG + DL_OFFSET_RG_RA
- or sp, fp, zero
- ADDI sp, sp, SF_SIZE
- REG_S fp, fp, SZREG
-
- jirl zero, ra, 0
-
-END (_dl_runtime_profile)
diff --git a/sysdeps/loongarch/dl-trampoline.h b/sysdeps/loongarch/dl-trampoline.h
index 99fcacab..e298439d 100644
--- a/sysdeps/loongarch/dl-trampoline.h
+++ b/sysdeps/loongarch/dl-trampoline.h
@@ -125,3 +125,245 @@ ENTRY (_dl_runtime_resolve)
/* Invoke the callee. */
jirl zero, t1, 0
END (_dl_runtime_resolve)
+
+#include "dl-link.h"
+
+ENTRY (_dl_runtime_profile)
+ /* LoongArch we get called with:
+ t0 linkr_map pointer
+ t1 the scaled offset stored in t0, which can be used
+ to calculate the offset of the current symbol in .rela.plt
+ t2 %hi(%pcrel(.got.plt)) stored in t2, no use in this function
+ t3 dl resolver entry point, no use in this function
+
+ Stack frame layout:
+ [sp, #208] La_loongarch_regs
+ [sp, #128] La_loongarch_retval // align: 16
+ [sp, #112] frame size return from pltenter
+ [sp, #80 ] dl_profile_call saved vec1
+ [sp, #48 ] dl_profile_call saved vec0 // align: 16
+ [sp, #32 ] dl_profile_call saved a1
+ [sp, #24 ] dl_profile_call saved a0
+ [sp, #16 ] T1
+ [sp, #0 ] ra, fp <- fp
+ */
+
+# define OFFSET_T1 16
+# define OFFSET_SAVED_CALL_A0 OFFSET_T1 + 8
+# define OFFSET_FS OFFSET_SAVED_CALL_A0 + 16 + 8 + 64
+# define OFFSET_RV OFFSET_FS + 8 + 8
+# define OFFSET_RG OFFSET_RV + DL_SIZEOF_RV
+
+# define SF_SIZE (-(-(OFFSET_RG + DL_SIZEOF_RG) & ALMASK))
+
+ /* Save arguments to stack. */
+ ADDI sp, sp, -SF_SIZE
+ REG_S ra, sp, 0
+ REG_S fp, sp, 8
+
+ or fp, sp, zero
+
+ REG_S a0, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 0*SZREG
+ REG_S a1, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 1*SZREG
+ REG_S a2, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 2*SZREG
+ REG_S a3, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 3*SZREG
+ REG_S a4, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 4*SZREG
+ REG_S a5, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 5*SZREG
+ REG_S a6, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 6*SZREG
+ REG_S a7, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 7*SZREG
+
+#ifdef USE_LASX
+ xvst xr0, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 0*SZXREG
+ xvst xr1, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 1*SZXREG
+ xvst xr2, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 2*SZXREG
+ xvst xr3, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 3*SZXREG
+ xvst xr4, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 4*SZXREG
+ xvst xr5, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 5*SZXREG
+ xvst xr6, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 6*SZXREG
+ xvst xr7, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 7*SZXREG
+#elif defined USE_LSX
+ vst vr0, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 0*SZVREG
+ vst vr1, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 1*SZVREG
+ vst vr2, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 2*SZVREG
+ vst vr3, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 3*SZVREG
+ vst vr4, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 4*SZVREG
+ vst vr5, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 5*SZVREG
+ vst vr6, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 6*SZVREG
+ vst vr7, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 7*SZVREG
+#elif !defined __loongarch_soft_float
+ FREG_S fa0, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 0*SZFREG
+ FREG_S fa1, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 1*SZFREG
+ FREG_S fa2, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 2*SZFREG
+ FREG_S fa3, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 3*SZFREG
+ FREG_S fa4, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 4*SZFREG
+ FREG_S fa5, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 5*SZFREG
+ FREG_S fa6, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 6*SZFREG
+ FREG_S fa7, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 7*SZFREG
+#endif
+
+ /* Update .got.plt and obtain runtime address of callee. */
+ SLLI a1, t1, 1
+ or a0, t0, zero
+ ADD a1, a1, t1
+ or a2, ra, zero /* return addr */
+ ADDI a3, fp, OFFSET_RG /* La_loongarch_regs pointer */
+ ADDI a4, fp, OFFSET_FS /* frame size return from pltenter */
+
+ REG_S a0, fp, OFFSET_SAVED_CALL_A0
+ REG_S a1, fp, OFFSET_SAVED_CALL_A0 + SZREG
+
+ la t2, _dl_profile_fixup
+ jirl ra, t2, 0
+
+ REG_L t3, fp, OFFSET_FS
+ bge t3, zero, 1f
+
+ /* Save the return. */
+ or t4, v0, zero
+
+ /* Restore arguments from stack. */
+ REG_L a0, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 0*SZREG
+ REG_L a1, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 1*SZREG
+ REG_L a2, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 2*SZREG
+ REG_L a3, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 3*SZREG
+ REG_L a4, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 4*SZREG
+ REG_L a5, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 5*SZREG
+ REG_L a6, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 6*SZREG
+ REG_L a7, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 7*SZREG
+
+#ifdef USE_LASX
+ xvld xr0, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 0*SZXREG
+ xvld xr1, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 1*SZXREG
+ xvld xr2, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 2*SZXREG
+ xvld xr3, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 3*SZXREG
+ xvld xr4, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 4*SZXREG
+ xvld xr5, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 5*SZXREG
+ xvld xr6, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 6*SZXREG
+ xvld xr7, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 7*SZXREG
+#elif defined USE_LSX
+ vld vr0, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 0*SZVREG
+ vld vr1, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 1*SZVREG
+ vld vr2, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 2*SZVREG
+ vld vr3, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 3*SZVREG
+ vld vr4, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 4*SZVREG
+ vld vr5, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 5*SZVREG
+ vld vr6, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 6*SZVREG
+ vld vr7, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 7*SZVREG
+#elif !defined __loongarch_soft_float
+ FREG_L fa0, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 0*SZFREG
+ FREG_L fa1, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 1*SZFREG
+ FREG_L fa2, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 2*SZFREG
+ FREG_L fa3, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 3*SZFREG
+ FREG_L fa4, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 4*SZFREG
+ FREG_L fa5, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 5*SZFREG
+ FREG_L fa6, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 6*SZFREG
+ FREG_L fa7, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 7*SZFREG
+#endif
+
+ REG_L ra, fp, 0
+ REG_L fp, fp, SZREG
+
+ ADDI sp, sp, SF_SIZE
+ jirl zero, t4, 0
+
+1:
+ /* The new frame size is in t3. */
+ SUB sp, fp, t3
+ BSTRINS sp, zero, 3, 0
+
+ REG_S a0, fp, OFFSET_T1
+
+ or a0, sp, zero
+ ADDI a1, fp, SF_SIZE
+ or a2, t3, zero
+ la t5, memcpy
+ jirl ra, t5, 0
+
+ REG_L t6, fp, OFFSET_T1
+
+ /* Call the function. */
+ REG_L a0, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 0*SZREG
+ REG_L a1, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 1*SZREG
+ REG_L a2, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 2*SZREG
+ REG_L a3, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 3*SZREG
+ REG_L a4, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 4*SZREG
+ REG_L a5, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 5*SZREG
+ REG_L a6, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 6*SZREG
+ REG_L a7, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 7*SZREG
+
+#ifdef USE_LASX
+ xvld xr0, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 0*SZXREG
+ xvld xr1, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 1*SZXREG
+ xvld xr2, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 2*SZXREG
+ xvld xr3, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 3*SZXREG
+ xvld xr4, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 4*SZXREG
+ xvld xr5, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 5*SZXREG
+ xvld xr6, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 6*SZXREG
+ xvld xr7, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 7*SZXREG
+#elif defined USE_LSX
+ vld vr0, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 0*SZVREG
+ vld vr1, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 1*SZVREG
+ vld vr2, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 2*SZVREG
+ vld vr3, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 3*SZVREG
+ vld vr4, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 4*SZVREG
+ vld vr5, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 5*SZVREG
+ vld vr6, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 6*SZVREG
+ vld vr7, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 7*SZVREG
+#elif !defined __loongarch_soft_float
+ FREG_L fa0, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 0*SZFREG
+ FREG_L fa1, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 1*SZFREG
+ FREG_L fa2, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 2*SZFREG
+ FREG_L fa3, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 3*SZFREG
+ FREG_L fa4, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 4*SZFREG
+ FREG_L fa5, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 5*SZFREG
+ FREG_L fa6, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 6*SZFREG
+ FREG_L fa7, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 7*SZFREG
+#endif
+
+ jirl ra, t6, 0
+
+ REG_S a0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_A0
+ REG_S a1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_A0 + SZREG
+
+#ifdef USE_LASX
+ xvst xr0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0
+ xvst xr1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0 + SZXREG
+#elif defined USE_LSX
+ vst vr0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0
+ vst vr1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0 + SZVREG
+#elif !defined __loongarch_soft_float
+ FREG_S fa0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0
+ FREG_S fa1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0 + SZFREG
+#endif
+
+ /* Setup call to pltexit. */
+ REG_L a0, fp, OFFSET_SAVED_CALL_A0
+ REG_L a1, fp, OFFSET_SAVED_CALL_A0 + SZREG
+ ADDI a2, fp, OFFSET_RG
+ ADDI a3, fp, OFFSET_RV
+ la t7, _dl_audit_pltexit
+ jirl ra, t7, 0
+
+ REG_L a0, fp, OFFSET_RV + DL_OFFSET_RV_A0
+ REG_L a1, fp, OFFSET_RV + DL_OFFSET_RV_A0 + SZREG
+
+#ifdef USE_LASX
+ xvld xr0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0
+ xvld xr1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0 + SZXREG
+#elif defined USE_LSX
+ vld vr0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0
+ vld vr1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0 + SZVREG
+#elif !defined __loongarch_soft_float
+ FREG_L fa0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0
+ FREG_L fa1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0 + SZFREG
+#endif
+
+ /* RA from within La_loongarch_reg. */
+ REG_L ra, fp, OFFSET_RG + DL_OFFSET_RG_RA
+ or sp, fp, zero
+ ADDI sp, sp, SF_SIZE
+ REG_S fp, fp, SZREG
+
+ jirl zero, ra, 0
+
+END (_dl_runtime_profile)
--
2.33.0
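
The frame-layout comment in the new _dl_runtime_profile can be checked against the offset macros. In the hard-float LASX case DL_SIZEOF_RV works out to 80 bytes (2 x 8 for lrv_a0/lrv_a1 plus 2 x 32 for the vector slots), which reproduces the [sp, #208] figure; a sketch of the arithmetic (the DL_SIZEOF_RV value is derived from La_loongarch_retval above, not quoted from the patch):

/* Offsets in bytes from fp, reproducing the macro arithmetic.  */
enum
{
  OFFSET_T1 = 16,
  OFFSET_SAVED_CALL_A0 = OFFSET_T1 + 8,            /* 24  */
  OFFSET_FS = OFFSET_SAVED_CALL_A0 + 16 + 8 + 64,  /* 112 */
  DL_SIZEOF_RV = 80,                               /* LASX retval, assumed */
  OFFSET_RV = OFFSET_FS + 8 + 8,                   /* 128 */
  OFFSET_RG = OFFSET_RV + DL_SIZEOF_RV,            /* 208 */
};
_Static_assert (OFFSET_RG == 208, "matches the [sp, #208] layout line");
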


@@ -0,0 +1,102 @@
From 7353f21f6ed1754b67e455e2b80123787efa9e91 Mon Sep 17 00:00:00 2001
From: dengjianbo <dengjianbo@loongson.cn>
Date: Tue, 8 Aug 2023 14:15:43 +0800
Subject: [PATCH 02/29] LoongArch: Add minimum required binutils version

LoongArch glibc now includes LASX/LSX vector instruction code, so the
required minimum binutils version is raised to 2.41, which supports
these instructions. HAVE_LOONGARCH_VEC_ASM is removed accordingly.
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
config.h.in | 5 -----
sysdeps/loongarch/configure | 5 ++---
sysdeps/loongarch/configure.ac | 4 ++--
sysdeps/loongarch/dl-machine.h | 4 ++--
sysdeps/loongarch/dl-trampoline.S | 2 +-
5 files changed, 7 insertions(+), 13 deletions(-)
diff --git a/config.h.in b/config.h.in
index 0dedc124..44a34072 100644
--- a/config.h.in
+++ b/config.h.in
@@ -141,11 +141,6 @@
/* LOONGARCH floating-point ABI for ld.so. */
#undef LOONGARCH_ABI_FRLEN
-/* Assembler support LoongArch LASX/LSX vector instructions.
- This macro becomes obsolete when glibc increased the minimum
- required version of GNU 'binutils' to 2.41 or later. */
-#define HAVE_LOONGARCH_VEC_ASM 0
-
/* Linux specific: minimum supported kernel version. */
#undef __LINUX_KERNEL_VERSION
diff --git a/sysdeps/loongarch/configure b/sysdeps/loongarch/configure
index 5843c7cf..395ddc92 100644
--- a/sysdeps/loongarch/configure
+++ b/sysdeps/loongarch/configure
@@ -128,8 +128,7 @@ rm -f conftest*
fi
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_cv_loongarch_vec_asm" >&5
printf "%s\n" "$libc_cv_loongarch_vec_asm" >&6; }
-if test $libc_cv_loongarch_vec_asm = yes; then
- printf "%s\n" "#define HAVE_LOONGARCH_VEC_ASM 1" >>confdefs.h
-
+if test $libc_cv_loongarch_vec_asm = no; then
+ as_fn_error $? "binutils version is too old, use 2.41 or newer version" "$LINENO" 5
fi
diff --git a/sysdeps/loongarch/configure.ac b/sysdeps/loongarch/configure.ac
index ba89d834..989287c6 100644
--- a/sysdeps/loongarch/configure.ac
+++ b/sysdeps/loongarch/configure.ac
@@ -74,6 +74,6 @@ else
libc_cv_loongarch_vec_asm=no
fi
rm -f conftest*])
-if test $libc_cv_loongarch_vec_asm = yes; then
- AC_DEFINE(HAVE_LOONGARCH_VEC_ASM)
+if test $libc_cv_loongarch_vec_asm = no; then
+ AC_MSG_ERROR([binutils version is too old, use 2.41 or newer version])
fi
diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h
index 51ce9af8..066bb233 100644
--- a/sysdeps/loongarch/dl-machine.h
+++ b/sysdeps/loongarch/dl-machine.h
@@ -270,7 +270,7 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
/* If using PLTs, fill in the first two entries of .got.plt. */
if (l->l_info[DT_JMPREL])
{
-#if HAVE_LOONGARCH_VEC_ASM && !defined __loongarch_soft_float
+#if !defined __loongarch_soft_float
extern void _dl_runtime_resolve_lasx (void) attribute_hidden;
extern void _dl_runtime_resolve_lsx (void) attribute_hidden;
#endif
@@ -300,7 +300,7 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
/* This function will get called to fix up the GOT entry
indicated by the offset on the stack, and then jump to
the resolved address. */
-#if HAVE_LOONGARCH_VEC_ASM && !defined __loongarch_soft_float
+#if !defined __loongarch_soft_float
if (SUPPORT_LASX)
gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve_lasx;
else if (SUPPORT_LSX)
diff --git a/sysdeps/loongarch/dl-trampoline.S b/sysdeps/loongarch/dl-trampoline.S
index f6ba5e44..8fd91469 100644
--- a/sysdeps/loongarch/dl-trampoline.S
+++ b/sysdeps/loongarch/dl-trampoline.S
@@ -19,7 +19,7 @@
#include <sysdep.h>
#include <sys/asm.h>
-#if HAVE_LOONGARCH_VEC_ASM && !defined __loongarch_soft_float
+#if !defined __loongarch_soft_float
#define USE_LASX
#define _dl_runtime_resolve _dl_runtime_resolve_lasx
#include "dl-trampoline.h"
--
2.33.0


@@ -0,0 +1,277 @@
From e5ccd79e81de7ad5821fde83875973e878d85d4b Mon Sep 17 00:00:00 2001
From: dengjianbo <dengjianbo@loongson.cn>
Date: Mon, 28 Aug 2023 10:08:40 +0800
Subject: [PATCH 19/29] LoongArch: Change loongarch to LoongArch in comments
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S | 2 +-
sysdeps/loongarch/lp64/multiarch/memcpy-lasx.S | 2 +-
sysdeps/loongarch/lp64/multiarch/memcpy-lsx.S | 2 +-
sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S | 2 +-
sysdeps/loongarch/lp64/multiarch/memmove-aligned.S | 2 +-
sysdeps/loongarch/lp64/multiarch/memmove-lasx.S | 2 +-
sysdeps/loongarch/lp64/multiarch/memmove-lsx.S | 2 +-
sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S | 2 +-
sysdeps/loongarch/lp64/multiarch/strchr-aligned.S | 2 +-
sysdeps/loongarch/lp64/multiarch/strchr-lasx.S | 2 +-
sysdeps/loongarch/lp64/multiarch/strchr-lsx.S | 2 +-
sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S | 2 +-
sysdeps/loongarch/lp64/multiarch/strchrnul-lasx.S | 2 +-
sysdeps/loongarch/lp64/multiarch/strchrnul-lsx.S | 2 +-
sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S | 2 +-
sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S | 2 +-
sysdeps/loongarch/lp64/multiarch/strlen-aligned.S | 2 +-
sysdeps/loongarch/lp64/multiarch/strlen-lasx.S | 2 +-
sysdeps/loongarch/lp64/multiarch/strlen-lsx.S | 2 +-
sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S | 2 +-
sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S | 2 +-
sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S | 2 +-
sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S | 2 +-
sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S | 2 +-
24 files changed, 24 insertions(+), 24 deletions(-)
diff --git a/sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S b/sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S
index 299dd49c..7eb34395 100644
--- a/sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S
+++ b/sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S
@@ -1,4 +1,4 @@
-/* Optimized memcpy_aligned implementation using basic Loongarch instructions.
+/* Optimized memcpy_aligned implementation using basic LoongArch instructions.
Copyright (C) 2023 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/loongarch/lp64/multiarch/memcpy-lasx.S b/sysdeps/loongarch/lp64/multiarch/memcpy-lasx.S
index 4aae5bf8..ae148df5 100644
--- a/sysdeps/loongarch/lp64/multiarch/memcpy-lasx.S
+++ b/sysdeps/loongarch/lp64/multiarch/memcpy-lasx.S
@@ -1,4 +1,4 @@
-/* Optimized memcpy implementation using Loongarch LASX instructions.
+/* Optimized memcpy implementation using LoongArch LASX instructions.
Copyright (C) 2023 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/loongarch/lp64/multiarch/memcpy-lsx.S b/sysdeps/loongarch/lp64/multiarch/memcpy-lsx.S
index 6ebbe7a2..feb2bb0e 100644
--- a/sysdeps/loongarch/lp64/multiarch/memcpy-lsx.S
+++ b/sysdeps/loongarch/lp64/multiarch/memcpy-lsx.S
@@ -1,4 +1,4 @@
-/* Optimized memcpy implementation using Loongarch LSX instructions.
+/* Optimized memcpy implementation using LoongArch LSX instructions.
Copyright (C) 2023 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S
index 8e60a22d..31019b13 100644
--- a/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S
+++ b/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S
@@ -1,4 +1,4 @@
-/* Optimized unaligned memcpy implementation using basic Loongarch instructions.
+/* Optimized unaligned memcpy implementation using basic LoongArch instructions.
Copyright (C) 2023 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-aligned.S b/sysdeps/loongarch/lp64/multiarch/memmove-aligned.S
index 5354f383..a02114c0 100644
--- a/sysdeps/loongarch/lp64/multiarch/memmove-aligned.S
+++ b/sysdeps/loongarch/lp64/multiarch/memmove-aligned.S
@@ -1,4 +1,4 @@
-/* Optimized memmove_aligned implementation using basic Loongarch instructions.
+/* Optimized memmove_aligned implementation using basic LoongArch instructions.
Copyright (C) 2023 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S b/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S
index ff68e7a2..95d8ee7b 100644
--- a/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S
+++ b/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S
@@ -1,4 +1,4 @@
-/* Optimized memmove implementation using Loongarch LASX instructions.
+/* Optimized memmove implementation using LoongArch LASX instructions.
Copyright (C) 2023 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S b/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S
index 9e1502a7..8a936770 100644
--- a/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S
+++ b/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S
@@ -1,4 +1,4 @@
-/* Optimized memmove implementation using Loongarch LSX instructions.
+/* Optimized memmove implementation using LoongArch LSX instructions.
Copyright (C) 2023 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S
index 90a64b6b..3284ce25 100644
--- a/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S
+++ b/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S
@@ -1,4 +1,4 @@
-/* Optimized memmove_unaligned implementation using basic Loongarch instructions.
+/* Optimized memmove_unaligned implementation using basic LoongArch instructions.
Copyright (C) 2023 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-aligned.S b/sysdeps/loongarch/lp64/multiarch/strchr-aligned.S
index 5fb01806..62020054 100644
--- a/sysdeps/loongarch/lp64/multiarch/strchr-aligned.S
+++ b/sysdeps/loongarch/lp64/multiarch/strchr-aligned.S
@@ -1,4 +1,4 @@
-/* Optimized strchr implementation using basic Loongarch instructions.
+/* Optimized strchr implementation using basic LoongArch instructions.
Copyright (C) 2023 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S
index 254402da..4d3cc588 100644
--- a/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S
+++ b/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S
@@ -1,4 +1,4 @@
-/* Optimized strchr implementation using loongarch LASX SIMD instructions.
+/* Optimized strchr implementation using LoongArch LASX instructions.
Copyright (C) 2023 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S
index dae98b0a..8b78c35c 100644
--- a/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S
+++ b/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S
@@ -1,4 +1,4 @@
-/* Optimized strlen implementation using loongarch LSX SIMD instructions.
+/* Optimized strlen implementation using LoongArch LSX instructions.
Copyright (C) 2023 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S b/sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S
index 1c01a023..20856a06 100644
--- a/sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S
+++ b/sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S
@@ -1,4 +1,4 @@
-/* Optimized strchrnul implementation using basic Loongarch instructions.
+/* Optimized strchrnul implementation using basic LoongArch instructions.
Copyright (C) 2023 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/loongarch/lp64/multiarch/strchrnul-lasx.S b/sysdeps/loongarch/lp64/multiarch/strchrnul-lasx.S
index d45495e4..4753d4ce 100644
--- a/sysdeps/loongarch/lp64/multiarch/strchrnul-lasx.S
+++ b/sysdeps/loongarch/lp64/multiarch/strchrnul-lasx.S
@@ -1,4 +1,4 @@
-/* Optimized strchrnul implementation using loongarch LASX SIMD instructions.
+/* Optimized strchrnul implementation using LoongArch LASX instructions.
Copyright (C) 2023 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/loongarch/lp64/multiarch/strchrnul-lsx.S b/sysdeps/loongarch/lp64/multiarch/strchrnul-lsx.S
index 07d793ae..671e740c 100644
--- a/sysdeps/loongarch/lp64/multiarch/strchrnul-lsx.S
+++ b/sysdeps/loongarch/lp64/multiarch/strchrnul-lsx.S
@@ -1,4 +1,4 @@
-/* Optimized strchrnul implementation using loongarch LSX SIMD instructions.
+/* Optimized strchrnul implementation using LoongArch LSX instructions.
Copyright (C) 2023 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S b/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S
index f5f4f336..ba1f9667 100644
--- a/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S
+++ b/sysdeps/loongarch/lp64/multiarch/strcmp-aligned.S
@@ -1,4 +1,4 @@
-/* Optimized strcmp implementation using basic Loongarch instructions.
+/* Optimized strcmp implementation using basic LoongArch instructions.
Copyright (C) 2023 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S
index 2e177a38..091c8c9e 100644
--- a/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S
+++ b/sysdeps/loongarch/lp64/multiarch/strcmp-lsx.S
@@ -1,4 +1,4 @@
-/* Optimized strcmp implementation using Loongarch LSX instructions.
+/* Optimized strcmp implementation using LoongArch LSX instructions.
Copyright (C) 2023 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S b/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S
index e9e1d2fc..ed0548e4 100644
--- a/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S
+++ b/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S
@@ -1,4 +1,4 @@
-/* Optimized strlen implementation using basic Loongarch instructions.
+/* Optimized strlen implementation using basic LoongArch instructions.
Copyright (C) 2023 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S
index 258c47ce..91342f34 100644
--- a/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S
+++ b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S
@@ -1,4 +1,4 @@
-/* Optimized strlen implementation using loongarch LASX SIMD instructions.
+/* Optimized strlen implementation using LoongArch LASX instructions.
Copyright (C) 2023 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S
index b194355e..b09c12e0 100644
--- a/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S
+++ b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S
@@ -1,4 +1,4 @@
-/* Optimized strlen implementation using Loongarch LSX SIMD instructions.
+/* Optimized strlen implementation using LoongArch LSX instructions.
Copyright (C) 2023 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S b/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S
index e2687fa7..f63de872 100644
--- a/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S
+++ b/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S
@@ -1,4 +1,4 @@
-/* Optimized strncmp implementation using basic Loongarch instructions.
+/* Optimized strncmp implementation using basic LoongArch instructions.
Copyright (C) 2023 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S
index 0b4eee2a..83cb801d 100644
--- a/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S
+++ b/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S
@@ -1,4 +1,4 @@
-/* Optimized strncmp implementation using Loongarch LSX instructions.
+/* Optimized strncmp implementation using LoongArch LSX instructions.
Copyright (C) 2023 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S b/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S
index b900430a..a8296a1b 100644
--- a/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S
+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-aligned.S
@@ -1,4 +1,4 @@
-/* Optimized strnlen implementation using basic Loongarch instructions.
+/* Optimized strnlen implementation using basic LoongArch instructions.
Copyright (C) 2023 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S b/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S
index 2c03d3d9..aa6c812d 100644
--- a/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S
+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-lasx.S
@@ -1,4 +1,4 @@
-/* Optimized strnlen implementation using loongarch LASX instructions
+/* Optimized strnlen implementation using LoongArch LASX instructions
Copyright (C) 2023 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S b/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S
index b769a895..d0febe3e 100644
--- a/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S
+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-lsx.S
@@ -1,4 +1,4 @@
-/* Optimized strnlen implementation using loongarch LSX instructions
+/* Optimized strnlen implementation using LoongArch LSX instructions
Copyright (C) 2023 Free Software Foundation, Inc.
This file is part of the GNU C Library.
--
2.33.0

@ -0,0 +1,67 @@
From fb72c81f9894b23797f6e2e066532c0963f5155f Mon Sep 17 00:00:00 2001
From: dengjianbo <dengjianbo@loongson.cn>
Date: Wed, 13 Sep 2023 15:35:01 +0800
Subject: [PATCH 24/29] LoongArch: Change to put magic number to .rodata
section
Move the magic numbers into the .rodata section in memmove-lsx, and use
pcalau12i and %pc_lo12 with vld to load the data.
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
.../loongarch/lp64/multiarch/memmove-lsx.S | 20 +++++++++----------
1 file changed, 10 insertions(+), 10 deletions(-)
diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S b/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S
index 8a936770..5eb819ef 100644
--- a/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S
+++ b/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S
@@ -209,13 +209,10 @@ L(al_less_16):
nop
-L(magic_num):
- .dword 0x0706050403020100
- .dword 0x0f0e0d0c0b0a0908
L(unaligned):
- pcaddi t2, -4
+ pcalau12i t2, %pc_hi20(L(INDEX))
bstrins.d a1, zero, 3, 0
- vld vr8, t2, 0
+ vld vr8, t2, %pc_lo12(L(INDEX))
vld vr0, a1, 0
vld vr1, a1, 16
@@ -413,13 +410,10 @@ L(back_al_less_16):
vst vr1, a0, 0
jr ra
-L(magic_num_2):
- .dword 0x0706050403020100
- .dword 0x0f0e0d0c0b0a0908
L(back_unaligned):
- pcaddi t2, -4
+ pcalau12i t2, %pc_hi20(L(INDEX))
bstrins.d a4, zero, 3, 0
- vld vr8, t2, 0
+ vld vr8, t2, %pc_lo12(L(INDEX))
vld vr0, a4, 0
vld vr1, a4, -16
@@ -529,6 +523,12 @@ L(back_un_less_16):
jr ra
END(MEMMOVE_NAME)
+ .section .rodata.cst16,"M",@progbits,16
+ .align 4
+L(INDEX):
+ .dword 0x0706050403020100
+ .dword 0x0f0e0d0c0b0a0908
+
libc_hidden_builtin_def (MEMCPY_NAME)
libc_hidden_builtin_def (MEMMOVE_NAME)
#endif
--
2.33.0
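For readers unfamiliar with the pcalau12i/%pc_lo12 pair used in the patch above, a rough C model of the address computation may help (a sketch only; the linker's rounding of the signed low part is glossed over). pcalau12i yields a 4 KiB-aligned base near PC, and the paired 12-bit immediate on vld supplies the remainder, so base plus lo12 is the address of L(INDEX):

    #include <stdint.h>

    /* Rough model: hi20 and lo12 stand for the values the linker fills
       into the %pc_hi20 and %pc_lo12 relocations.  */
    uint64_t
    pcrel_address (uint64_t pc, int64_t hi20, int64_t lo12)
    {
      uint64_t base = (pc + ((uint64_t) hi20 << 12)) & ~(uint64_t) 0xfff;
      return base + lo12;      /* lo12 is folded into the vld offset.  */
    }

The old pcaddi sequence only reaches PC +/- 2 MiB, which was fine while the index table sat inline in .text; once the data moves to .rodata the distance becomes link-time dependent, so the +/- 2 GiB reach of the pcalau12i pair is needed.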

@ -0,0 +1,44 @@
From 7f703cf758c4f185dd62f2a4f463002bb514af16 Mon Sep 17 00:00:00 2001
From: Xi Ruoyao <xry111@xry111.site>
Date: Sun, 27 Aug 2023 00:36:51 +0800
Subject: [PATCH 13/29] LoongArch: Micro-optimize LD_PCREL
We now require Binutils >= 2.41, so explicit relocation syntax is
always supported by the assembler. Use it to save one instruction:
la.pcrel expands to a pcalau12i/addi.d pair before the load, while
folding %pc_lo12 into the load's offset makes the addi.d unnecessary.
Signed-off-by: Xi Ruoyao <xry111@xry111.site>
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
sysdeps/unix/sysv/linux/loongarch/pointer_guard.h | 10 ++++------
1 file changed, 4 insertions(+), 6 deletions(-)
diff --git a/sysdeps/unix/sysv/linux/loongarch/pointer_guard.h b/sysdeps/unix/sysv/linux/loongarch/pointer_guard.h
index b25e353b..d6c78687 100644
--- a/sysdeps/unix/sysv/linux/loongarch/pointer_guard.h
+++ b/sysdeps/unix/sysv/linux/loongarch/pointer_guard.h
@@ -19,17 +19,15 @@
#ifndef POINTER_GUARD_H
#define POINTER_GUARD_H
-/* Load a got-relative EXPR into G, using T.
- Note G and T are register names. */
+/* Load a got-relative EXPR into register G. */
#define LD_GLOBAL(G, EXPR) \
la.global G, EXPR; \
REG_L G, G, 0;
-/* Load a pc-relative EXPR into G, using T.
- Note G and T are register names. */
+/* Load a pc-relative EXPR into register G. */
#define LD_PCREL(G, EXPR) \
- la.pcrel G, EXPR; \
- REG_L G, G, 0;
+ pcalau12i G, %pc_hi20(EXPR); \
+ REG_L G, G, %pc_lo12(EXPR);
#if (IS_IN (rtld) \
|| (!defined SHARED && (IS_IN (libc) \
--
2.33.0

@ -0,0 +1,65 @@
From 8dcd8c837df2e3cf81675522487697522f1542f8 Mon Sep 17 00:00:00 2001
From: dengjianbo <dengjianbo@loongson.cn>
Date: Tue, 8 Aug 2023 14:15:42 +0800
Subject: [PATCH 01/29] LoongArch: Redefine macro LEAF/ENTRY.
The following usages of the LEAF/ENTRY macros are all feasible:
1. LEAF(fcn) -- the alignment of fcn is .align 3 (the default)
2. LEAF(fcn, 6) -- the alignment of fcn is .align 6
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
sysdeps/loongarch/sys/asm.h | 36 ++++++++++++++++++++++++++----------
1 file changed, 26 insertions(+), 10 deletions(-)
diff --git a/sysdeps/loongarch/sys/asm.h b/sysdeps/loongarch/sys/asm.h
index d1a279b8..c5eb8afa 100644
--- a/sysdeps/loongarch/sys/asm.h
+++ b/sysdeps/loongarch/sys/asm.h
@@ -39,16 +39,32 @@
#define FREG_L fld.d
#define FREG_S fst.d
-/* Declare leaf routine. */
-#define LEAF(symbol) \
- .text; \
- .globl symbol; \
- .align 3; \
- cfi_startproc; \
- .type symbol, @function; \
- symbol:
-
-#define ENTRY(symbol) LEAF (symbol)
+/* Declare leaf routine.
+ The usage of macro LEAF/ENTRY is as follows:
+ 1. LEAF(fcn) -- the align value of fcn is .align 3 (default value)
+ 2. LEAF(fcn, 6) -- the align value of fcn is .align 6
+*/
+#define LEAF_IMPL(symbol, aln, ...) \
+ .text; \
+ .globl symbol; \
+ .align aln; \
+ .type symbol, @function; \
+symbol: \
+ cfi_startproc;
+
+
+#define LEAF(...) LEAF_IMPL(__VA_ARGS__, 3)
+#define ENTRY(...) LEAF(__VA_ARGS__)
+
+#define LEAF_NO_ALIGN(symbol) \
+ .text; \
+ .globl symbol; \
+ .type symbol, @function; \
+symbol: \
+ cfi_startproc;
+
+#define ENTRY_NO_ALIGN(symbol) LEAF_NO_ALIGN(symbol)
+
/* Mark end of function. */
#undef END
--
2.33.0
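The optional-alignment behaviour of LEAF above is the standard variadic-macro default-argument idiom: LEAF appends the default after the caller's arguments, and LEAF_IMPL binds only the first two, so a caller-supplied alignment shadows the trailing 3. A minimal standalone C sketch of the same idiom (DEMO/DEMO_IMPL are hypothetical names; like glibc, it relies on GNU C accepting an empty variadic argument list):

    #include <stdio.h>

    /* Extra arguments beyond (name, aln) fall into ... and are ignored.  */
    #define DEMO_IMPL(name, aln, ...) printf ("%s: .align %d\n", name, aln)
    #define DEMO(...) DEMO_IMPL (__VA_ARGS__, 3)

    int
    main (void)
    {
      DEMO ("fcn");     /* expands to DEMO_IMPL ("fcn", 3): the default  */
      DEMO ("fcn", 6);  /* expands to DEMO_IMPL ("fcn", 6, 3): 3 ignored  */
      return 0;
    }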

@ -0,0 +1,56 @@
From f8d66a269cb6f1a7087afadf3375bdf0553abf53 Mon Sep 17 00:00:00 2001
From: Xi Ruoyao <xry111@xry111.site>
Date: Sun, 27 Aug 2023 00:36:50 +0800
Subject: [PATCH 12/29] LoongArch: Remove support code for old linker in
start.S
We now require Binutils >= 2.41, so la.pcrel always works here.
Signed-off-by: Xi Ruoyao <xry111@xry111.site>
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
sysdeps/loongarch/start.S | 19 +++----------------
1 file changed, 3 insertions(+), 16 deletions(-)
diff --git a/sysdeps/loongarch/start.S b/sysdeps/loongarch/start.S
index e9d82033..bf6bfc9e 100644
--- a/sysdeps/loongarch/start.S
+++ b/sysdeps/loongarch/start.S
@@ -60,20 +60,7 @@ ENTRY (ENTRY_POINT)
cfi_undefined (1)
or a5, a0, zero /* rtld_fini */
-#if ENABLE_STATIC_PIE
-/* For static PIE, the GOT cannot be used in _start because the GOT entries are
- offsets instead of real addresses before __libc_start_main.
- __libc_start_main and/or main may be not local, so we rely on the linker to
- produce PLT entries for them. GNU ld >= 2.40 supports this. */
-# define LA la.pcrel
-#else
-/* Old GNU ld (< 2.40) cannot handle PC relative address against a non-local
- function correctly. We deem these old linkers failing to support static PIE
- and load the addresses from GOT. */
-# define LA la.got
-#endif
-
- LA a0, t0, main
+ la.pcrel a0, t0, main
REG_L a1, sp, 0
ADDI a2, sp, SZREG
@@ -84,9 +71,9 @@ ENTRY (ENTRY_POINT)
move a4, zero /* used to be fini */
or a6, sp, zero /* stack_end */
- LA ra, t0, __libc_start_main
+ la.pcrel ra, t0, __libc_start_main
jirl ra, ra, 0
- LA ra, t0, abort
+ la.pcrel ra, t0, abort
jirl ra, ra, 0
END (ENTRY_POINT)
--
2.33.0

@ -0,0 +1,28 @@
From b4b4bb7c9220a0bbdf5aec0ac8c1de1d22329280 Mon Sep 17 00:00:00 2001
From: caiyinyu <caiyinyu@loongson.cn>
Date: Thu, 14 Sep 2023 19:48:24 +0800
Subject: [PATCH 21/29] LoongArch: Replace deprecated $v0 with $a0 to eliminate
'as' warnings.
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
sysdeps/loongarch/dl-machine.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h
index 8a2db9de..57913cef 100644
--- a/sysdeps/loongarch/dl-machine.h
+++ b/sysdeps/loongarch/dl-machine.h
@@ -90,7 +90,7 @@ static inline ElfW (Addr) elf_machine_dynamic (void)
or $a0, $sp, $zero \n\
bl _dl_start \n\
# Stash user entry point in s0. \n\
- or $s0, $v0, $zero \n\
+ or $s0, $a0, $zero \n\
# Load the original argument count. \n\
ld.d $a1, $sp, 0 \n\
# Call _dl_init (struct link_map *main_map, int argc, \
--
2.33.0

@ -0,0 +1,81 @@
From 458ab6d5f39cca1cabd83abd2022f67491f6f5ed Mon Sep 17 00:00:00 2001
From: caiyinyu <caiyinyu@loongson.cn>
Date: Fri, 20 Oct 2023 09:20:02 +0800
Subject: [PATCH 27/29] LoongArch: Unify Register Names.
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
sysdeps/loongarch/__longjmp.S | 20 ++++++++++----------
sysdeps/loongarch/setjmp.S | 18 +++++++++---------
2 files changed, 19 insertions(+), 19 deletions(-)
diff --git a/sysdeps/loongarch/__longjmp.S b/sysdeps/loongarch/__longjmp.S
index cbde1946..e87ce311 100644
--- a/sysdeps/loongarch/__longjmp.S
+++ b/sysdeps/loongarch/__longjmp.S
@@ -43,18 +43,18 @@ ENTRY (__longjmp)
REG_L s8, a0, 12*SZREG
#ifndef __loongarch_soft_float
- FREG_L $f24, a0, 13*SZREG + 0*SZFREG
- FREG_L $f25, a0, 13*SZREG + 1*SZFREG
- FREG_L $f26, a0, 13*SZREG + 2*SZFREG
- FREG_L $f27, a0, 13*SZREG + 3*SZFREG
- FREG_L $f28, a0, 13*SZREG + 4*SZFREG
- FREG_L $f29, a0, 13*SZREG + 5*SZFREG
- FREG_L $f30, a0, 13*SZREG + 6*SZFREG
- FREG_L $f31, a0, 13*SZREG + 7*SZFREG
+ FREG_L fs0, a0, 13*SZREG + 0*SZFREG
+ FREG_L fs1, a0, 13*SZREG + 1*SZFREG
+ FREG_L fs2, a0, 13*SZREG + 2*SZFREG
+ FREG_L fs3, a0, 13*SZREG + 3*SZFREG
+ FREG_L fs4, a0, 13*SZREG + 4*SZFREG
+ FREG_L fs5, a0, 13*SZREG + 5*SZFREG
+ FREG_L fs6, a0, 13*SZREG + 6*SZFREG
+ FREG_L fs7, a0, 13*SZREG + 7*SZFREG
#endif
- sltui a0,a1,1
+ sltui a0, a1, 1
ADD a0, a0, a1 # a0 = (a1 == 0) ? 1 : a1
- jirl zero,ra,0
+ jirl zero, ra, 0
END (__longjmp)
diff --git a/sysdeps/loongarch/setjmp.S b/sysdeps/loongarch/setjmp.S
index 6c7065cd..b6e4f727 100644
--- a/sysdeps/loongarch/setjmp.S
+++ b/sysdeps/loongarch/setjmp.S
@@ -52,19 +52,19 @@ ENTRY (__sigsetjmp)
REG_S s8, a0, 12*SZREG
#ifndef __loongarch_soft_float
- FREG_S $f24, a0, 13*SZREG + 0*SZFREG
- FREG_S $f25, a0, 13*SZREG + 1*SZFREG
- FREG_S $f26, a0, 13*SZREG + 2*SZFREG
- FREG_S $f27, a0, 13*SZREG + 3*SZFREG
- FREG_S $f28, a0, 13*SZREG + 4*SZFREG
- FREG_S $f29, a0, 13*SZREG + 5*SZFREG
- FREG_S $f30, a0, 13*SZREG + 6*SZFREG
- FREG_S $f31, a0, 13*SZREG + 7*SZFREG
+ FREG_S fs0, a0, 13*SZREG + 0*SZFREG
+ FREG_S fs1, a0, 13*SZREG + 1*SZFREG
+ FREG_S fs2, a0, 13*SZREG + 2*SZFREG
+ FREG_S fs3, a0, 13*SZREG + 3*SZFREG
+ FREG_S fs4, a0, 13*SZREG + 4*SZFREG
+ FREG_S fs5, a0, 13*SZREG + 5*SZFREG
+ FREG_S fs6, a0, 13*SZREG + 6*SZFREG
+ FREG_S fs7, a0, 13*SZREG + 7*SZFREG
#endif
#if !IS_IN (libc) && IS_IN(rtld)
li.w v0, 0
- jirl zero,ra,0
+ jirl zero, ra, 0
#else
b __sigjmp_save
#endif
--
2.33.0

@ -0,0 +1,24 @@
From 4828d1aa0028e819a5fb336d962e8f7cbfedf8b4 Mon Sep 17 00:00:00 2001
From: caiyinyu <caiyinyu@loongson.cn>
Date: Mon, 23 Oct 2023 15:53:38 +0800
Subject: [PATCH 28/29] LoongArch: Update hwcap.h to sync with LoongArch
kernel.
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
sysdeps/unix/sysv/linux/loongarch/bits/hwcap.h | 1 +
1 file changed, 1 insertion(+)
diff --git a/sysdeps/unix/sysv/linux/loongarch/bits/hwcap.h b/sysdeps/unix/sysv/linux/loongarch/bits/hwcap.h
index 5104b69c..7acec23d 100644
--- a/sysdeps/unix/sysv/linux/loongarch/bits/hwcap.h
+++ b/sysdeps/unix/sysv/linux/loongarch/bits/hwcap.h
@@ -35,3 +35,4 @@
#define HWCAP_LOONGARCH_LBT_X86 (1 << 10)
#define HWCAP_LOONGARCH_LBT_ARM (1 << 11)
#define HWCAP_LOONGARCH_LBT_MIPS (1 << 12)
+#define HWCAP_LOONGARCH_PTW (1 << 13)
--
2.33.0
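A minimal user-space sketch of querying the new bit at run time (the fallback #define merely keeps the example compilable against headers that predate this patch):

    #include <stdio.h>
    #include <sys/auxv.h>

    #ifndef HWCAP_LOONGARCH_PTW
    # define HWCAP_LOONGARCH_PTW (1 << 13)  /* value added by the patch above */
    #endif

    int
    main (void)
    {
      unsigned long hwcap = getauxval (AT_HWCAP);
      printf ("PTW: %s\n",
              (hwcap & HWCAP_LOONGARCH_PTW) ? "supported" : "not supported");
      return 0;
    }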

@ -0,0 +1,30 @@
From 4938840b15ff9734fdcc63cc0744ce3f3bbb0b16 Mon Sep 17 00:00:00 2001
From: caiyinyu <caiyinyu@loongson.cn>
Date: Mon, 14 Aug 2023 15:34:08 +0800
Subject: [PATCH 05/29] LoongArch: elf: Add new LoongArch reloc type 109 into
elf.h
This reloc type is generated by GNU assembler >= 2.41 for relaxation
support.
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
elf/elf.h | 1 +
1 file changed, 1 insertion(+)
diff --git a/elf/elf.h b/elf/elf.h
index d623bdeb..9c51073f 100644
--- a/elf/elf.h
+++ b/elf/elf.h
@@ -4213,6 +4213,7 @@ enum
#define R_LARCH_SUB6 106
#define R_LARCH_ADD_ULEB128 107
#define R_LARCH_SUB_ULEB128 108
+#define R_LARCH_64_PCREL 109
/* ARC specific declarations. */
--
2.33.0
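As a quick illustration of where the new constant shows up, a tool that walks a .rela section could classify entries as follows (a sketch; it assumes an <elf.h> that already defines R_LARCH_64_PCREL, i.e. one with this patch applied):

    #include <elf.h>
    #include <stdio.h>

    /* Report relocation entries using the new 64-bit PC-relative type.  */
    void
    report_pcrel64 (const Elf64_Rela *r)
    {
      if (ELF64_R_TYPE (r->r_info) == R_LARCH_64_PCREL)
        printf ("R_LARCH_64_PCREL at offset 0x%llx\n",
                (unsigned long long) r->r_offset);
    }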

@ -0,0 +1,528 @@
From 43abd8772a143cd96688c081500397dd712e631b Mon Sep 17 00:00:00 2001
From: dengjianbo <dengjianbo@loongson.cn>
Date: Tue, 8 Aug 2023 14:15:44 +0800
Subject: [PATCH 03/29] LoongArch: Add ifunc support and add different versions
of strlen
strlen-lasx is implemented with LASX SIMD instructions (256-bit)
strlen-lsx is implemented with LSX SIMD instructions (128-bit)
strlen-aligned is implemented with basic LoongArch instructions and never uses unaligned memory access
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
sysdeps/loongarch/lp64/multiarch/Makefile | 7 ++
.../lp64/multiarch/ifunc-impl-list.c | 41 +++++++
.../loongarch/lp64/multiarch/ifunc-strlen.h | 40 +++++++
.../loongarch/lp64/multiarch/strlen-aligned.S | 100 ++++++++++++++++++
.../loongarch/lp64/multiarch/strlen-lasx.S | 63 +++++++++++
sysdeps/loongarch/lp64/multiarch/strlen-lsx.S | 71 +++++++++++++
sysdeps/loongarch/lp64/multiarch/strlen.c | 37 +++++++
sysdeps/loongarch/sys/regdef.h | 57 ++++++++++
.../unix/sysv/linux/loongarch/cpu-features.h | 2 +
9 files changed, 418 insertions(+)
create mode 100644 sysdeps/loongarch/lp64/multiarch/Makefile
create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h
create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen-aligned.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen-lasx.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen-lsx.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen.c
diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile
new file mode 100644
index 00000000..76c506c9
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/Makefile
@@ -0,0 +1,7 @@
+ifeq ($(subdir),string)
+sysdep_routines += \
+ strlen-aligned \
+ strlen-lsx \
+ strlen-lasx \
+# sysdep_routines
+endif
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
new file mode 100644
index 00000000..1a2a576f
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
@@ -0,0 +1,41 @@
+/* Enumerate available IFUNC implementations of a function LoongArch64 version.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <assert.h>
+#include <string.h>
+#include <wchar.h>
+#include <ldsodefs.h>
+#include <ifunc-impl-list.h>
+#include <stdio.h>
+
+size_t
+__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ size_t max)
+{
+
+ size_t i = max;
+
+ IFUNC_IMPL (i, name, strlen,
+#if !defined __loongarch_soft_float
+ IFUNC_IMPL_ADD (array, i, strlen, SUPPORT_LASX, __strlen_lasx)
+ IFUNC_IMPL_ADD (array, i, strlen, SUPPORT_LSX, __strlen_lsx)
+#endif
+ IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_aligned)
+ )
+ return i;
+}
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h b/sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h
new file mode 100644
index 00000000..6258bb76
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h
@@ -0,0 +1,40 @@
+/* Common definition for strlen ifunc selections.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <ldsodefs.h>
+#include <ifunc-init.h>
+
+#if !defined __loongarch_soft_float
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lasx) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden;
+#endif
+extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+#if !defined __loongarch_soft_float
+ if (SUPPORT_LASX)
+ return OPTIMIZE (lasx);
+ else if (SUPPORT_LSX)
+ return OPTIMIZE (lsx);
+ else
+#endif
+ return OPTIMIZE (aligned);
+}
diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S b/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S
new file mode 100644
index 00000000..e9e1d2fc
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S
@@ -0,0 +1,100 @@
+/* Optimized strlen implementation using basic Loongarch instructions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc)
+# define STRLEN __strlen_aligned
+#else
+# define STRLEN strlen
+#endif
+
+LEAF(STRLEN, 6)
+ move a1, a0
+ bstrins.d a0, zero, 2, 0
+ lu12i.w a2, 0x01010
+ li.w t0, -1
+
+ ld.d t2, a0, 0
+ andi t1, a1, 0x7
+ ori a2, a2, 0x101
+ slli.d t1, t1, 3
+
+ bstrins.d a2, a2, 63, 32
+ sll.d t1, t0, t1
+ slli.d t3, a2, 7
+ nor a3, zero, t3
+
+ orn t2, t2, t1
+ sub.d t0, t2, a2
+ nor t1, t2, a3
+ and t0, t0, t1
+
+
+ bnez t0, L(count_pos)
+ addi.d a0, a0, 8
+L(loop_16_7bit):
+ ld.d t2, a0, 0
+ sub.d t1, t2, a2
+
+ and t0, t1, t3
+ bnez t0, L(more_check)
+ ld.d t2, a0, 8
+ sub.d t1, t2, a2
+
+ and t0, t1, t3
+ addi.d a0, a0, 16
+ beqz t0, L(loop_16_7bit)
+ addi.d a0, a0, -8
+
+L(more_check):
+ nor t0, t2, a3
+ and t0, t1, t0
+ bnez t0, L(count_pos)
+ addi.d a0, a0, 8
+
+
+L(loop_16_8bit):
+ ld.d t2, a0, 0
+ sub.d t1, t2, a2
+ nor t0, t2, a3
+ and t0, t0, t1
+
+ bnez t0, L(count_pos)
+ ld.d t2, a0, 8
+ addi.d a0, a0, 16
+ sub.d t1, t2, a2
+
+ nor t0, t2, a3
+ and t0, t0, t1
+ beqz t0, L(loop_16_8bit)
+ addi.d a0, a0, -8
+
+L(count_pos):
+ ctz.d t1, t0
+ sub.d a0, a0, a1
+ srli.d t1, t1, 3
+ add.d a0, a0, t1
+
+ jr ra
+END(STRLEN)
+
+libc_hidden_builtin_def (STRLEN)
diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S
new file mode 100644
index 00000000..258c47ce
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S
@@ -0,0 +1,63 @@
+/* Optimized strlen implementation using loongarch LASX SIMD instructions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) && !defined __loongarch_soft_float
+
+# define STRLEN __strlen_lasx
+
+LEAF(STRLEN, 6)
+ move a1, a0
+ bstrins.d a0, zero, 4, 0
+ li.d t1, -1
+ xvld xr0, a0, 0
+
+ xvmsknz.b xr0, xr0
+ xvpickve.w xr1, xr0, 4
+ vilvl.h vr0, vr1, vr0
+ movfr2gr.s t0, fa0 # sign extend
+
+ sra.w t0, t0, a1
+ beq t0, t1, L(loop)
+ cto.w a0, t0
+ jr ra
+
+L(loop):
+ xvld xr0, a0, 32
+ addi.d a0, a0, 32
+ xvsetanyeqz.b fcc0, xr0
+ bceqz fcc0, L(loop)
+
+
+ xvmsknz.b xr0, xr0
+ sub.d a0, a0, a1
+ xvpickve.w xr1, xr0, 4
+ vilvl.h vr0, vr1, vr0
+
+ movfr2gr.s t0, fa0
+ cto.w t0, t0
+ add.d a0, a0, t0
+ jr ra
+END(STRLEN)
+
+libc_hidden_builtin_def (STRLEN)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S
new file mode 100644
index 00000000..b194355e
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S
@@ -0,0 +1,71 @@
+/* Optimized strlen implementation using Loongarch LSX SIMD instructions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) && !defined __loongarch_soft_float
+
+# define STRLEN __strlen_lsx
+
+LEAF(STRLEN, 6)
+ move a1, a0
+ bstrins.d a0, zero, 4, 0
+ vld vr0, a0, 0
+ vld vr1, a0, 16
+
+ li.d t1, -1
+ vmsknz.b vr0, vr0
+ vmsknz.b vr1, vr1
+ vilvl.h vr0, vr1, vr0
+
+ movfr2gr.s t0, fa0
+ sra.w t0, t0, a1
+ beq t0, t1, L(loop)
+ cto.w a0, t0
+
+ jr ra
+ nop
+ nop
+ nop
+
+
+L(loop):
+ vld vr0, a0, 32
+ vld vr1, a0, 48
+ addi.d a0, a0, 32
+ vmin.bu vr2, vr0, vr1
+
+ vsetanyeqz.b fcc0, vr2
+ bceqz fcc0, L(loop)
+ vmsknz.b vr0, vr0
+ vmsknz.b vr1, vr1
+
+ vilvl.h vr0, vr1, vr0
+ sub.d a0, a0, a1
+ movfr2gr.s t0, fa0
+ cto.w t0, t0
+
+ add.d a0, a0, t0
+ jr ra
+END(STRLEN)
+
+libc_hidden_builtin_def (STRLEN)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/strlen.c b/sysdeps/loongarch/lp64/multiarch/strlen.c
new file mode 100644
index 00000000..381c2daa
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strlen.c
@@ -0,0 +1,37 @@
+/* Multiple versions of strlen.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+
+#if IS_IN (libc)
+# define strlen __redirect_strlen
+# include <string.h>
+# undef strlen
+
+# define SYMBOL_NAME strlen
+# include "ifunc-strlen.h"
+
+libc_ifunc_redirected (__redirect_strlen, strlen, IFUNC_SELECTOR ());
+
+# ifdef SHARED
+__hidden_ver1 (strlen, __GI_strlen, __redirect_strlen)
+ __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strlen);
+# endif
+
+#endif
diff --git a/sysdeps/loongarch/sys/regdef.h b/sysdeps/loongarch/sys/regdef.h
index 5100f36d..524d2e32 100644
--- a/sysdeps/loongarch/sys/regdef.h
+++ b/sysdeps/loongarch/sys/regdef.h
@@ -89,6 +89,14 @@
#define fs5 $f29
#define fs6 $f30
#define fs7 $f31
+#define fcc0 $fcc0
+#define fcc1 $fcc1
+#define fcc2 $fcc2
+#define fcc3 $fcc3
+#define fcc4 $fcc4
+#define fcc5 $fcc5
+#define fcc6 $fcc6
+#define fcc7 $fcc7
#define vr0 $vr0
#define vr1 $vr1
@@ -98,6 +106,30 @@
#define vr5 $vr5
#define vr6 $vr6
#define vr7 $vr7
+#define vr8 $vr8
+#define vr9 $vr9
+#define vr10 $vr10
+#define vr11 $vr11
+#define vr12 $vr12
+#define vr13 $vr13
+#define vr14 $vr14
+#define vr15 $vr15
+#define vr16 $vr16
+#define vr17 $vr17
+#define vr18 $vr18
+#define vr19 $vr19
+#define vr20 $vr20
+#define vr21 $vr21
+#define vr22 $vr22
+#define vr23 $vr23
+#define vr24 $vr24
+#define vr25 $vr25
+#define vr26 $vr26
+#define vr27 $vr27
+#define vr28 $vr28
+#define vr29 $vr29
+#define vr30 $vr30
+#define vr31 $vr31
#define xr0 $xr0
#define xr1 $xr1
@@ -107,5 +139,30 @@
#define xr5 $xr5
#define xr6 $xr6
#define xr7 $xr7
+#define xr7 $xr7
+#define xr8 $xr8
+#define xr9 $xr9
+#define xr10 $xr10
+#define xr11 $xr11
+#define xr12 $xr12
+#define xr13 $xr13
+#define xr14 $xr14
+#define xr15 $xr15
+#define xr16 $xr16
+#define xr17 $xr17
+#define xr18 $xr18
+#define xr19 $xr19
+#define xr20 $xr20
+#define xr21 $xr21
+#define xr22 $xr22
+#define xr23 $xr23
+#define xr24 $xr24
+#define xr25 $xr25
+#define xr26 $xr26
+#define xr27 $xr27
+#define xr28 $xr28
+#define xr29 $xr29
+#define xr30 $xr30
+#define xr31 $xr31
#endif /* _SYS_REGDEF_H */
diff --git a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
index e371e13b..d1a280a5 100644
--- a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
+++ b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
@@ -25,5 +25,7 @@
#define SUPPORT_LSX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LSX)
#define SUPPORT_LASX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LASX)
+#define INIT_ARCH()
+
#endif /* _CPU_FEATURES_LOONGARCH64_H */
--
2.33.0
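The dispatch plumbing above (libc_ifunc_redirected plus IFUNC_SELECTOR) is glibc-internal, but the underlying mechanism is the compiler's ifunc attribute: a resolver runs once at load time and picks an implementation from the hardware capabilities. A standalone sketch of the same idea (my_strlen* are hypothetical functions, not the glibc symbols; real resolvers must be careful about what they call this early, which is why glibc routes the hwcap information through its own structures):

    #include <stddef.h>
    #include <sys/auxv.h>   /* pulls in HWCAP_LOONGARCH_* on LoongArch */

    size_t my_strlen_aligned (const char *s);
    size_t my_strlen_lsx (const char *s);
    size_t my_strlen_lasx (const char *s);

    /* Resolver: runs once, before my_strlen is first called.  */
    static void *
    resolve_my_strlen (void)
    {
      unsigned long hwcap = getauxval (AT_HWCAP);
      if (hwcap & HWCAP_LOONGARCH_LASX)
        return my_strlen_lasx;
      if (hwcap & HWCAP_LOONGARCH_LSX)
        return my_strlen_lsx;
      return my_strlen_aligned;
    }

    size_t my_strlen (const char *s)
      __attribute__ ((ifunc ("resolve_my_strlen")));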

File diff suppressed because it is too large.

@ -0,0 +1,706 @@
From aca7d7f0dde5f56344e8e58e5f6648c96bb1f1cc Mon Sep 17 00:00:00 2001
From: dengjianbo <dengjianbo@loongson.cn>
Date: Tue, 15 Aug 2023 09:08:11 +0800
Subject: [PATCH 06/29] LoongArch: Add ifunc support for strchr{aligned, lsx,
lasx} and strchrnul{aligned, lsx, lasx}
These implementations reduce the runtime of the glibc strchr{nul}
microbenchmarks as follows:
strchr-lasx reduces the runtime by about 50%-83%
strchr-lsx reduces the runtime by about 30%-67%
strchr-aligned reduces the runtime by about 10%-20%
strchrnul-lasx reduces the runtime by about 50%-83%
strchrnul-lsx reduces the runtime by about 36%-65%
strchrnul-aligned reduces the runtime by about 6%-10%
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
sysdeps/loongarch/lp64/multiarch/Makefile | 6 ++
.../lp64/multiarch/ifunc-impl-list.c | 16 +++
.../loongarch/lp64/multiarch/ifunc-strchr.h | 41 ++++++++
.../lp64/multiarch/ifunc-strchrnul.h | 41 ++++++++
.../loongarch/lp64/multiarch/strchr-aligned.S | 99 +++++++++++++++++++
.../loongarch/lp64/multiarch/strchr-lasx.S | 91 +++++++++++++++++
sysdeps/loongarch/lp64/multiarch/strchr-lsx.S | 73 ++++++++++++++
sysdeps/loongarch/lp64/multiarch/strchr.c | 36 +++++++
.../lp64/multiarch/strchrnul-aligned.S | 95 ++++++++++++++++++
.../loongarch/lp64/multiarch/strchrnul-lasx.S | 22 +++++
.../loongarch/lp64/multiarch/strchrnul-lsx.S | 22 +++++
sysdeps/loongarch/lp64/multiarch/strchrnul.c | 39 ++++++++
12 files changed, 581 insertions(+)
create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-strchr.h
create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-strchrnul.h
create mode 100644 sysdeps/loongarch/lp64/multiarch/strchr-aligned.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/strchr-lasx.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/strchr-lsx.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/strchr.c
create mode 100644 sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/strchrnul-lasx.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/strchrnul-lsx.S
create mode 100644 sysdeps/loongarch/lp64/multiarch/strchrnul.c
diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile
index 76c506c9..110a8c5c 100644
--- a/sysdeps/loongarch/lp64/multiarch/Makefile
+++ b/sysdeps/loongarch/lp64/multiarch/Makefile
@@ -3,5 +3,11 @@ sysdep_routines += \
strlen-aligned \
strlen-lsx \
strlen-lasx \
+ strchr-aligned \
+ strchr-lsx \
+ strchr-lasx \
+ strchrnul-aligned \
+ strchrnul-lsx \
+ strchrnul-lasx \
# sysdep_routines
endif
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
index 1a2a576f..c7164b45 100644
--- a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
@@ -37,5 +37,21 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
#endif
IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_aligned)
)
+
+ IFUNC_IMPL (i, name, strchr,
+#if !defined __loongarch_soft_float
+ IFUNC_IMPL_ADD (array, i, strchr, SUPPORT_LASX, __strchr_lasx)
+ IFUNC_IMPL_ADD (array, i, strchr, SUPPORT_LSX, __strchr_lsx)
+#endif
+ IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_aligned)
+ )
+
+ IFUNC_IMPL (i, name, strchrnul,
+#if !defined __loongarch_soft_float
+ IFUNC_IMPL_ADD (array, i, strchrnul, SUPPORT_LASX, __strchrnul_lasx)
+ IFUNC_IMPL_ADD (array, i, strchrnul, SUPPORT_LSX, __strchrnul_lsx)
+#endif
+ IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_aligned)
+ )
return i;
}
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-strchr.h b/sysdeps/loongarch/lp64/multiarch/ifunc-strchr.h
new file mode 100644
index 00000000..4494db79
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-strchr.h
@@ -0,0 +1,41 @@
+/* Common definition for strchr ifunc selections.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <ldsodefs.h>
+#include <ifunc-init.h>
+
+#if !defined __loongarch_soft_float
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lasx) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden;
+#endif
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+#if !defined __loongarch_soft_float
+ if (SUPPORT_LASX)
+ return OPTIMIZE (lasx);
+ else if (SUPPORT_LSX)
+ return OPTIMIZE (lsx);
+ else
+#endif
+ return OPTIMIZE (aligned);
+}
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-strchrnul.h b/sysdeps/loongarch/lp64/multiarch/ifunc-strchrnul.h
new file mode 100644
index 00000000..8a925120
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-strchrnul.h
@@ -0,0 +1,41 @@
+/* Common definition for strchrnul ifunc selections.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <ldsodefs.h>
+#include <ifunc-init.h>
+
+#if !defined __loongarch_soft_float
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lasx) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden;
+#endif
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+#if !defined __loongarch_soft_float
+ if (SUPPORT_LASX)
+ return OPTIMIZE (lasx);
+ else if (SUPPORT_LSX)
+ return OPTIMIZE (lsx);
+ else
+#endif
+ return OPTIMIZE (aligned);
+}
diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-aligned.S b/sysdeps/loongarch/lp64/multiarch/strchr-aligned.S
new file mode 100644
index 00000000..5fb01806
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strchr-aligned.S
@@ -0,0 +1,99 @@
+/* Optimized strchr implementation using basic Loongarch instructions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc)
+# define STRCHR_NAME __strchr_aligned
+#else
+# define STRCHR_NAME strchr
+#endif
+
+LEAF(STRCHR_NAME, 6)
+ slli.d t1, a0, 3
+ bstrins.d a0, zero, 2, 0
+ lu12i.w a2, 0x01010
+ ld.d t2, a0, 0
+
+ ori a2, a2, 0x101
+ andi a1, a1, 0xff
+ bstrins.d a2, a2, 63, 32
+ li.w t0, -1
+
+ mul.d a1, a1, a2
+ sll.d t0, t0, t1
+ slli.d a3, a2, 7
+ orn t2, t2, t0
+
+ sll.d t3, a1, t1
+ xor t4, t2, t3
+ sub.d a4, t2, a2
+ sub.d a5, t4, a2
+
+
+ andn a4, a4, t2
+ andn a5, a5, t4
+ or t0, a4, a5
+ and t0, t0, a3
+
+ bnez t0, L(end)
+ addi.d a0, a0, 8
+L(loop):
+ ld.d t4, a0, 0
+ xor t2, t4, a1
+
+ sub.d a4, t4, a2
+ sub.d a5, t2, a2
+ andn a4, a4, t4
+ andn a5, a5, t2
+
+ or t0, a4, a5
+ and t0, t0, a3
+ bnez t0, L(end)
+ ld.d t4, a0, 8
+
+
+ addi.d a0, a0, 16
+ xor t2, t4, a1
+ sub.d a4, t4, a2
+ sub.d a5, t2, a2
+
+ andn a4, a4, t4
+ andn a5, a5, t2
+ or t0, a4, a5
+ and t0, t0, a3
+
+ beqz t0, L(loop)
+ addi.d a0, a0, -8
+L(end):
+ and t0, a5, a3
+ and t1, a4, a3
+
+ ctz.d t0, t0
+ ctz.d t1, t1
+ srli.w t2, t0, 3
+ sltu t3, t1, t0
+
+
+ add.d a0, a0, t2
+ masknez a0, a0, t3
+ jr ra
+END(STRCHR_NAME)
diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S b/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S
new file mode 100644
index 00000000..254402da
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strchr-lasx.S
@@ -0,0 +1,91 @@
+/* Optimized strchr implementation using loongarch LASX SIMD instructions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) && !defined __loongarch_soft_float
+
+#ifndef AS_STRCHRNUL
+# define STRCHR __strchr_lasx
+#endif
+
+LEAF(STRCHR, 6)
+ andi t1, a0, 0x1f
+ bstrins.d a0, zero, 4, 0
+ xvld xr0, a0, 0
+ li.d t2, -1
+
+ xvreplgr2vr.b xr1, a1
+ sll.d t1, t2, t1
+ xvxor.v xr2, xr0, xr1
+ xvmin.bu xr0, xr0, xr2
+
+ xvmsknz.b xr0, xr0
+ xvpickve.w xr3, xr0, 4
+ vilvl.h vr0, vr3, vr0
+ movfr2gr.s t0, fa0
+
+ orn t0, t0, t1
+ bne t0, t2, L(end)
+ addi.d a0, a0, 32
+ nop
+
+
+L(loop):
+ xvld xr0, a0, 0
+ xvxor.v xr2, xr0, xr1
+ xvmin.bu xr0, xr0, xr2
+ xvsetanyeqz.b fcc0, xr0
+
+ bcnez fcc0, L(loop_end)
+ xvld xr0, a0, 32
+ addi.d a0, a0, 64
+ xvxor.v xr2, xr0, xr1
+
+ xvmin.bu xr0, xr0, xr2
+ xvsetanyeqz.b fcc0, xr0
+ bceqz fcc0, L(loop)
+ addi.d a0, a0, -32
+
+L(loop_end):
+ xvmsknz.b xr0, xr0
+ xvpickve.w xr1, xr0, 4
+ vilvl.h vr0, vr1, vr0
+ movfr2gr.s t0, fa0
+
+
+L(end):
+ cto.w t0, t0
+ add.d a0, a0, t0
+#ifndef AS_STRCHRNUL
+ vreplgr2vr.b vr0, t0
+ xvpermi.q xr3, xr2, 1
+
+ vshuf.b vr0, vr3, vr2, vr0
+ vpickve2gr.bu t0, vr0, 0
+ masknez a0, a0, t0
+#endif
+ jr ra
+
+END(STRCHR)
+
+libc_hidden_builtin_def(STRCHR)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S b/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S
new file mode 100644
index 00000000..dae98b0a
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strchr-lsx.S
@@ -0,0 +1,73 @@
+/* Optimized strlen implementation using loongarch LSX SIMD instructions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) && !defined __loongarch_soft_float
+
+#ifndef AS_STRCHRNUL
+# define STRCHR __strchr_lsx
+#endif
+
+LEAF(STRCHR, 6)
+ andi t1, a0, 0xf
+ bstrins.d a0, zero, 3, 0
+ vld vr0, a0, 0
+ li.d t2, -1
+
+ vreplgr2vr.b vr1, a1
+ sll.d t3, t2, t1
+ vxor.v vr2, vr0, vr1
+ vmin.bu vr0, vr0, vr2
+
+ vmsknz.b vr0, vr0
+ movfr2gr.s t0, fa0
+ ext.w.h t0, t0
+ orn t0, t0, t3
+
+ beq t0, t2, L(loop)
+L(found):
+ cto.w t0, t0
+ add.d a0, a0, t0
+#ifndef AS_STRCHRNUL
+ vreplve.b vr2, vr2, t0
+ vpickve2gr.bu t1, vr2, 0
+ masknez a0, a0, t1
+#endif
+ jr ra
+
+
+L(loop):
+ vld vr0, a0, 16
+ addi.d a0, a0, 16
+ vxor.v vr2, vr0, vr1
+ vmin.bu vr0, vr0, vr2
+
+ vsetanyeqz.b fcc0, vr0
+ bceqz fcc0, L(loop)
+ vmsknz.b vr0, vr0
+ movfr2gr.s t0, fa0
+
+ b L(found)
+END(STRCHR)
+
+libc_hidden_builtin_def (STRCHR)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/strchr.c b/sysdeps/loongarch/lp64/multiarch/strchr.c
new file mode 100644
index 00000000..404e97bd
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strchr.c
@@ -0,0 +1,36 @@
+/* Multiple versions of strchr.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# define strchr __redirect_strchr
+# include <string.h>
+# undef strchr
+
+# define SYMBOL_NAME strchr
+# include "ifunc-strchr.h"
+
+libc_ifunc_redirected (__redirect_strchr, strchr, IFUNC_SELECTOR ());
+weak_alias(strchr, index)
+# ifdef SHARED
+__hidden_ver1 (strchr, __GI_strchr, __redirect_strchr)
+ __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strchr);
+# endif
+
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S b/sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S
new file mode 100644
index 00000000..1c01a023
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strchrnul-aligned.S
@@ -0,0 +1,95 @@
+/* Optimized strchrnul implementation using basic Loongarch instructions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc)
+# define STRCHRNUL_NAME __strchrnul_aligned
+#else
+# define STRCHRNUL_NAME __strchrnul
+#endif
+
+LEAF(STRCHRNUL_NAME, 6)
+ slli.d t1, a0, 3
+ bstrins.d a0, zero, 2, 0
+ lu12i.w a2, 0x01010
+ ld.d t2, a0, 0
+
+ ori a2, a2, 0x101
+ andi a1, a1, 0xff
+ bstrins.d a2, a2, 63, 32
+ li.w t0, -1
+
+ mul.d a1, a1, a2
+ sll.d t0, t0, t1
+ slli.d a3, a2, 7
+ orn t2, t2, t0
+
+ sll.d t3, a1, t1
+ xor t4, t2, t3
+ sub.d a4, t2, a2
+ sub.d a5, t4, a2
+
+
+ andn a4, a4, t2
+ andn a5, a5, t4
+ or t0, a4, a5
+ and t0, t0, a3
+
+ bnez t0, L(end)
+ addi.d a0, a0, 8
+L(loop):
+ ld.d t4, a0, 0
+ xor t2, t4, a1
+
+ sub.d a4, t4, a2
+ sub.d a5, t2, a2
+ andn a4, a4, t4
+ andn a5, a5, t2
+
+ or t0, a4, a5
+ and t0, t0, a3
+ bnez t0, L(end)
+ ld.d t4, a0, 8
+
+
+ addi.d a0, a0, 16
+ xor t2, t4, a1
+ sub.d a4, t4, a2
+ sub.d a5, t2, a2
+
+ andn a4, a4, t4
+ andn a5, a5, t2
+ or t0, a4, a5
+ and t0, t0, a3
+
+ beqz t0, L(loop)
+ addi.d a0, a0, -8
+L(end):
+ ctz.d t0, t0
+ srli.w t0, t0, 3
+
+
+ add.d a0, a0, t0
+ jr ra
+END(STRCHRNUL_NAME)
+
+libc_hidden_builtin_def (STRCHRNUL_NAME)
diff --git a/sysdeps/loongarch/lp64/multiarch/strchrnul-lasx.S b/sysdeps/loongarch/lp64/multiarch/strchrnul-lasx.S
new file mode 100644
index 00000000..d45495e4
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strchrnul-lasx.S
@@ -0,0 +1,22 @@
+/* Optimized strchrnul implementation using loongarch LASX SIMD instructions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define STRCHR __strchrnul_lasx
+#define AS_STRCHRNUL
+#include "strchr-lasx.S"
diff --git a/sysdeps/loongarch/lp64/multiarch/strchrnul-lsx.S b/sysdeps/loongarch/lp64/multiarch/strchrnul-lsx.S
new file mode 100644
index 00000000..07d793ae
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strchrnul-lsx.S
@@ -0,0 +1,22 @@
+/* Optimized strchrnul implementation using loongarch LSX SIMD instructions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define STRCHR __strchrnul_lsx
+#define AS_STRCHRNUL
+#include "strchr-lsx.S"
diff --git a/sysdeps/loongarch/lp64/multiarch/strchrnul.c b/sysdeps/loongarch/lp64/multiarch/strchrnul.c
new file mode 100644
index 00000000..f3b8296e
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strchrnul.c
@@ -0,0 +1,39 @@
+/* Multiple versions of strchrnul.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+
+#if IS_IN (libc)
+# define strchrnul __redirect_strchrnul
+# define __strchrnul __redirect___strchrnul
+# include <string.h>
+# undef __strchrnul
+# undef strchrnul
+
+# define SYMBOL_NAME strchrnul
+# include "ifunc-strchrnul.h"
+
+libc_ifunc_redirected (__redirect_strchrnul, __strchrnul,
+ IFUNC_SELECTOR ());
+weak_alias (__strchrnul, strchrnul)
+# ifdef SHARED
+__hidden_ver1 (__strchrnul, __GI___strchrnul, __redirect_strchrnul)
+ __attribute__((visibility ("hidden"))) __attribute_copy__ (strchrnul);
+# endif
+#endif
--
2.33.0
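The aligned variants above (__strchr_aligned and __strchrnul_aligned) are word-at-a-time scanners built on the classic zero-byte bit trick; the sub.d/andn/and sequences in the assembly compute exactly this. A C restatement of the trick (not the glibc code itself):

    #include <stdint.h>

    #define ONES  0x0101010101010101ULL
    #define HIGHS 0x8080808080808080ULL

    /* Nonzero iff v contains a zero byte.  The lowest set bit marks the
       first zero byte; higher bits may include false positives, which is
       fine when the result is scanned with a count-trailing-zeros.  */
    static inline uint64_t
    haszero (uint64_t v)
    {
      return (v - ONES) & ~v & HIGHS;
    }

    /* Same test for bytes equal to c: XOR turns matching bytes into
       zero bytes, then reuse haszero.  */
    static inline uint64_t
    haschar (uint64_t v, unsigned char c)
    {
      return haszero (v ^ (ONES * c));
    }

strchr then needs only the position of the lowest set bit in either mask; the ctz.d/sltu/masknez tail at L(end) above turns that into a byte index and returns NULL when the terminating NUL comes before the match.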

README.md (new file, 11 lines)
@ -0,0 +1,11 @@
Anolis OS
=======================================
# Repository notes
## Branches
> When doing development work, make sure to choose the branch that matches the current release.
* The aX branches are the main branches for each major release; e.g. the a8 branch tracks the latest version.
* The aX.Y branches are maintenance branches for minor releases; e.g. the a8.2 branch corresponds to release 8.2.
## Development workflow
1. First, fork the target branch into your own namespace.
2. Make your changes on your fork branch.
3. Submit a merge request to the corresponding repository, with the fork branch as the source branch.

@ -0,0 +1,478 @@
From c0f3b0a8c71c26d5351e8ddabe3e8a323803e683 Mon Sep 17 00:00:00 2001
From: caiyinyu <caiyinyu@loongson.cn>
Date: Thu, 21 Sep 2023 09:10:11 +0800
Subject: [PATCH 26/29] Revert "LoongArch: Add glibc.cpu.hwcap support."
This reverts commit a53451559dc9cce765ea5bcbb92c4007e058e92b.
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
sysdeps/loongarch/Makefile | 4 -
sysdeps/loongarch/Versions | 5 --
sysdeps/loongarch/cpu-tunables.c | 89 -------------------
sysdeps/loongarch/dl-get-cpu-features.c | 25 ------
sysdeps/loongarch/dl-machine.h | 27 +-----
sysdeps/loongarch/dl-tunables.list | 25 ------
.../unix/sysv/linux/loongarch/cpu-features.c | 29 ------
.../unix/sysv/linux/loongarch/cpu-features.h | 18 +---
.../unix/sysv/linux/loongarch/dl-procinfo.c | 60 -------------
sysdeps/unix/sysv/linux/loongarch/dl-sysdep.c | 21 -----
.../unix/sysv/linux/loongarch/libc-start.c | 34 -------
11 files changed, 8 insertions(+), 329 deletions(-)
delete mode 100644 sysdeps/loongarch/Versions
delete mode 100644 sysdeps/loongarch/cpu-tunables.c
delete mode 100644 sysdeps/loongarch/dl-get-cpu-features.c
delete mode 100644 sysdeps/loongarch/dl-tunables.list
delete mode 100644 sysdeps/unix/sysv/linux/loongarch/cpu-features.c
delete mode 100644 sysdeps/unix/sysv/linux/loongarch/dl-procinfo.c
delete mode 100644 sysdeps/unix/sysv/linux/loongarch/dl-sysdep.c
delete mode 100644 sysdeps/unix/sysv/linux/loongarch/libc-start.c
diff --git a/sysdeps/loongarch/Makefile b/sysdeps/loongarch/Makefile
index 30a1f4a8..43d2f583 100644
--- a/sysdeps/loongarch/Makefile
+++ b/sysdeps/loongarch/Makefile
@@ -6,10 +6,6 @@ ifeq ($(subdir),elf)
gen-as-const-headers += dl-link.sym
endif
-ifeq ($(subdir),elf)
- sysdep-dl-routines += dl-get-cpu-features
-endif
-
# LoongArch's assembler also needs to know about PIC as it changes the
# definition of some assembler macros.
ASFLAGS-.os += $(pic-ccflag)
diff --git a/sysdeps/loongarch/Versions b/sysdeps/loongarch/Versions
deleted file mode 100644
index 33ae2cc0..00000000
--- a/sysdeps/loongarch/Versions
+++ /dev/null
@@ -1,5 +0,0 @@
-ld {
- GLIBC_PRIVATE {
- _dl_larch_get_cpu_features;
- }
-}
diff --git a/sysdeps/loongarch/cpu-tunables.c b/sysdeps/loongarch/cpu-tunables.c
deleted file mode 100644
index 8e9fab93..00000000
--- a/sysdeps/loongarch/cpu-tunables.c
+++ /dev/null
@@ -1,89 +0,0 @@
-/* LoongArch CPU feature tuning.
- This file is part of the GNU C Library.
- Copyright (C) 2023 Free Software Foundation, Inc.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-# include <stdbool.h>
-# include <stdint.h>
-# include <unistd.h> /* Get STDOUT_FILENO for _dl_printf. */
-# include <elf/dl-tunables.h>
-# include <string.h>
-# include <cpu-features.h>
-# include <ldsodefs.h>
-# include <sys/auxv.h>
-
-# define HWCAP_LOONGARCH_IFUNC \
- (HWCAP_LOONGARCH_UAL | HWCAP_LOONGARCH_LSX | HWCAP_LOONGARCH_LASX)
-
-# define CHECK_GLIBC_IFUNC_CPU_OFF(f, name, len) \
- _Static_assert (sizeof (#name) - 1 == len, #name " != " #len); \
- if (!memcmp (f, #name, len) && \
- (GLRO (dl_hwcap) & HWCAP_LOONGARCH_##name)) \
- { \
- hwcap |= (HWCAP_LOONGARCH_##name | (~HWCAP_LOONGARCH_IFUNC)); \
- break; \
- } \
-
-attribute_hidden
-void
-TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
-{
- const char *p = valp->strval;
- size_t len;
- unsigned long hwcap = 0;
- const char *c;
-
- do {
- for (c = p; *c != ','; c++)
- if (*c == '\0')
- break;
-
- len = c - p;
-
- switch(len)
- {
- default:
- _dl_fatal_printf (
- "The valid values of glibc.cpu.hwcaps is UAL, LASX, LSX!!\n"
- );
- break;
- case 3:
- {
- CHECK_GLIBC_IFUNC_CPU_OFF (p, LSX, 3);
- CHECK_GLIBC_IFUNC_CPU_OFF (p, UAL, 3);
- _dl_fatal_printf (
- "Some features are invalid or not supported on this machine!!\n"
- "The valid values of glibc.cpu.hwcaps is UAL, LASX, LSX!!\n"
- );
- }
- break;
- case 4:
- {
- CHECK_GLIBC_IFUNC_CPU_OFF (p, LASX, 4);
- _dl_fatal_printf (
- "Some features are invalid or not supported on this machine!!\n"
- "The valid values of glibc.cpu.hwcaps is UAL, LASX, LSX!!\n"
- );
- }
- break;
- }
-
- p += len + 1;
- }
- while (*c != '\0');
-
- GLRO (dl_larch_cpu_features).hwcap &= hwcap;
-}
diff --git a/sysdeps/loongarch/dl-get-cpu-features.c b/sysdeps/loongarch/dl-get-cpu-features.c
deleted file mode 100644
index 7cd9bc15..00000000
--- a/sysdeps/loongarch/dl-get-cpu-features.c
+++ /dev/null
@@ -1,25 +0,0 @@
-/* Define _dl_larch_get_cpu_features.
- Copyright (C) 2023 Free Software Foundation, Inc.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-
-#include <ldsodefs.h>
-
-const struct cpu_features *
-_dl_larch_get_cpu_features (void)
-{
- return &GLRO(dl_larch_cpu_features);
-}
diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h
index b395a928..57913cef 100644
--- a/sysdeps/loongarch/dl-machine.h
+++ b/sysdeps/loongarch/dl-machine.h
@@ -29,8 +29,6 @@
#include <dl-static-tls.h>
#include <dl-machine-rel.h>
-#include <cpu-features.c>
-
#ifndef _RTLD_PROLOGUE
# define _RTLD_PROLOGUE(entry) \
".globl\t" __STRING (entry) "\n\t" \
@@ -55,23 +53,6 @@
#define ELF_MACHINE_NO_REL 1
#define ELF_MACHINE_NO_RELA 0
-#define DL_PLATFORM_INIT dl_platform_init ()
-
-static inline void __attribute__ ((unused))
-dl_platform_init (void)
-{
- if (GLRO(dl_platform) != NULL && *GLRO(dl_platform) == '\0')
- /* Avoid an empty string which would disturb us. */
- GLRO(dl_platform) = NULL;
-
-#ifdef SHARED
- /* init_cpu_features has been called early from __libc_start_main in
- static executable. */
- init_cpu_features (&GLRO(dl_larch_cpu_features));
-#endif
-}
-
-
/* Return nonzero iff ELF header is compatible with the running host. */
static inline int
elf_machine_matches_host (const ElfW (Ehdr) *ehdr)
@@ -309,9 +290,9 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
if (profile != 0)
{
#if !defined __loongarch_soft_float
- if (RTLD_SUPPORT_LASX)
+ if (SUPPORT_LASX)
gotplt[0] = (ElfW(Addr)) &_dl_runtime_profile_lasx;
- else if (RTLD_SUPPORT_LSX)
+ else if (SUPPORT_LSX)
gotplt[0] = (ElfW(Addr)) &_dl_runtime_profile_lsx;
else
#endif
@@ -329,9 +310,9 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
indicated by the offset on the stack, and then jump to
the resolved address. */
#if !defined __loongarch_soft_float
- if (RTLD_SUPPORT_LASX)
+ if (SUPPORT_LASX)
gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve_lasx;
- else if (RTLD_SUPPORT_LSX)
+ else if (SUPPORT_LSX)
gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve_lsx;
else
#endif
diff --git a/sysdeps/loongarch/dl-tunables.list b/sysdeps/loongarch/dl-tunables.list
deleted file mode 100644
index 66b34275..00000000
--- a/sysdeps/loongarch/dl-tunables.list
+++ /dev/null
@@ -1,25 +0,0 @@
-# LoongArch specific tunables.
-# Copyright (C) 2023 Free Software Foundation, Inc.
-# This file is part of the GNU C Library.
-
-# The GNU C Library is free software; you can redistribute it and/or
-# modify it under the terms of the GNU Lesser General Public
-# License as published by the Free Software Foundation; either
-# version 2.1 of the License, or (at your option) any later version.
-
-# The GNU C Library is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# Lesser General Public License for more details.
-
-# You should have received a copy of the GNU Lesser General Public
-# License along with the GNU C Library; if not, see
-# <http://www.gnu.org/licenses/>.
-
-glibc {
- cpu {
- hwcaps {
- type: STRING
- }
- }
-}
diff --git a/sysdeps/unix/sysv/linux/loongarch/cpu-features.c b/sysdeps/unix/sysv/linux/loongarch/cpu-features.c
deleted file mode 100644
index 1290c4ce..00000000
--- a/sysdeps/unix/sysv/linux/loongarch/cpu-features.c
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Initialize CPU feature data. LoongArch64 version.
- This file is part of the GNU C Library.
- Copyright (C) 2023 Free Software Foundation, Inc.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <cpu-features.h>
-#include <elf/dl-hwcaps.h>
-#include <elf/dl-tunables.h>
-extern void TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *) attribute_hidden;
-
-static inline void
-init_cpu_features (struct cpu_features *cpu_features)
-{
- GLRO (dl_larch_cpu_features).hwcap = GLRO (dl_hwcap);
- TUNABLE_GET (glibc, cpu, hwcaps, tunable_val_t *, TUNABLE_CALLBACK (set_hwcaps));
-}
diff --git a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
index 450963ce..d1a280a5 100644
--- a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
+++ b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
@@ -19,23 +19,13 @@
#ifndef _CPU_FEATURES_LOONGARCH64_H
#define _CPU_FEATURES_LOONGARCH64_H
-#include <stdint.h>
#include <sys/auxv.h>
-struct cpu_features
- {
- uint64_t hwcap;
- };
+#define SUPPORT_UAL (GLRO (dl_hwcap) & HWCAP_LOONGARCH_UAL)
+#define SUPPORT_LSX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LSX)
+#define SUPPORT_LASX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LASX)
-/* Get a pointer to the CPU features structure. */
-extern const struct cpu_features *_dl_larch_get_cpu_features (void)
- __attribute__ ((pure));
-
-#define SUPPORT_UAL (GLRO (dl_larch_cpu_features).hwcap & HWCAP_LOONGARCH_UAL)
-#define SUPPORT_LSX (GLRO (dl_larch_cpu_features).hwcap & HWCAP_LOONGARCH_LSX)
-#define SUPPORT_LASX (GLRO (dl_larch_cpu_features).hwcap & HWCAP_LOONGARCH_LASX)
-#define RTLD_SUPPORT_LSX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LSX)
-#define RTLD_SUPPORT_LASX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LASX)
#define INIT_ARCH()
#endif /* _CPU_FEATURES_LOONGARCH64_H */
+
diff --git a/sysdeps/unix/sysv/linux/loongarch/dl-procinfo.c b/sysdeps/unix/sysv/linux/loongarch/dl-procinfo.c
deleted file mode 100644
index 6217fda9..00000000
--- a/sysdeps/unix/sysv/linux/loongarch/dl-procinfo.c
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Data for LoongArch64 version of processor capability information.
- Linux version.
- Copyright (C) 2023 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-/* If anything should be added here check whether the size of each string
- is still ok with the given array size.
-
- All the #ifdefs in the definitions are quite irritating but
- necessary if we want to avoid duplicating the information. There
- are three different modes:
-
- - PROCINFO_DECL is defined. This means we are only interested in
- declarations.
-
- - PROCINFO_DECL is not defined:
-
- + if SHARED is defined the file is included in an array
- initializer. The .element = { ... } syntax is needed.
-
- + if SHARED is not defined a normal array initialization is
- needed.
- */
-
-#ifndef PROCINFO_CLASS
-# define PROCINFO_CLASS
-#endif
-
-#if !IS_IN (ldconfig)
-# if !defined PROCINFO_DECL && defined SHARED
- ._dl_larch_cpu_features
-# else
-PROCINFO_CLASS struct cpu_features _dl_larch_cpu_features
-# endif
-# ifndef PROCINFO_DECL
-= { }
-# endif
-# if !defined SHARED || defined PROCINFO_DECL
-;
-# else
-,
-# endif
-#endif
-
-#undef PROCINFO_DECL
-#undef PROCINFO_CLASS
diff --git a/sysdeps/unix/sysv/linux/loongarch/dl-sysdep.c b/sysdeps/unix/sysv/linux/loongarch/dl-sysdep.c
deleted file mode 100644
index 455fd71a..00000000
--- a/sysdeps/unix/sysv/linux/loongarch/dl-sysdep.c
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Operating system support for run-time dynamic linker. LoongArch version.
- Copyright (C) 2023 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <config.h>
-#include <sysdeps/loongarch/cpu-tunables.c>
-#include <sysdeps/unix/sysv/linux/dl-sysdep.c>
diff --git a/sysdeps/unix/sysv/linux/loongarch/libc-start.c b/sysdeps/unix/sysv/linux/loongarch/libc-start.c
deleted file mode 100644
index f1346ece..00000000
--- a/sysdeps/unix/sysv/linux/loongarch/libc-start.c
+++ /dev/null
@@ -1,34 +0,0 @@
-/* Override csu/libc-start.c on LoongArch64.
- Copyright (C) 2023 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#ifndef SHARED
-
-/* Mark symbols hidden in static PIE for early self relocation to work. */
-# if BUILD_PIE_DEFAULT
-# pragma GCC visibility push(hidden)
-# endif
-
-# include <ldsodefs.h>
-# include <cpu-features.c>
-
-extern struct cpu_features _dl_larch_cpu_features;
-
-# define ARCH_INIT_CPU_FEATURES() init_cpu_features (&_dl_larch_cpu_features)
-
-#endif
-#include <csu/libc-start.c>
--
2.33.0
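
For context, the set_hwcaps tunable callback deleted by this revert walks the comma-separated glibc.cpu.hwcaps string and narrows the dynamic loader's hwcap mask. Below is a minimal standalone sketch of that parsing pattern; parse_hwcaps and the HW_* bit values are illustrative stand-ins, not glibc's own names.

#include <stdio.h>
#include <string.h>

/* Illustrative bits; the real HWCAP_LOONGARCH_* constants come from
   the kernel headers.  */
#define HW_UAL  (1u << 0)
#define HW_LSX  (1u << 1)
#define HW_LASX (1u << 2)

/* Accumulate a mask from a comma-separated list such as "LSX,UAL".  */
static unsigned int
parse_hwcaps (const char *p)
{
  unsigned int mask = 0;
  while (*p != '\0')
    {
      const char *c = p;
      while (*c != ',' && *c != '\0')
        c++;
      size_t len = c - p;
      if (len == 3 && memcmp (p, "UAL", 3) == 0)
        mask |= HW_UAL;
      else if (len == 3 && memcmp (p, "LSX", 3) == 0)
        mask |= HW_LSX;
      else if (len == 4 && memcmp (p, "LASX", 4) == 0)
        mask |= HW_LASX;
      else
        fprintf (stderr, "unknown feature: %.*s\n", (int) len, p);
      p = (*c == ',') ? c + 1 : c;
    }
  return mask;
}

int
main (void)
{
  /* Prints "mask = 0x3" (HW_UAL | HW_LSX).  */
  printf ("mask = %#x\n", parse_hwcaps ("LSX,UAL"));
  return 0;
}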

SUPPORTED

@ -1,496 +0,0 @@
# This file names the currently supported and somewhat tested locales.
# If you have any additions please file a glibc bug report.
SUPPORTED-LOCALES=\
C.UTF-8/UTF-8 \
aa_DJ.UTF-8/UTF-8 \
aa_DJ/ISO-8859-1 \
aa_ER/UTF-8 \
aa_ER@saaho/UTF-8 \
aa_ET/UTF-8 \
af_ZA.UTF-8/UTF-8 \
af_ZA/ISO-8859-1 \
agr_PE/UTF-8 \
ak_GH/UTF-8 \
am_ET/UTF-8 \
an_ES.UTF-8/UTF-8 \
an_ES/ISO-8859-15 \
anp_IN/UTF-8 \
ar_AE.UTF-8/UTF-8 \
ar_AE/ISO-8859-6 \
ar_BH.UTF-8/UTF-8 \
ar_BH/ISO-8859-6 \
ar_DZ.UTF-8/UTF-8 \
ar_DZ/ISO-8859-6 \
ar_EG.UTF-8/UTF-8 \
ar_EG/ISO-8859-6 \
ar_IN/UTF-8 \
ar_IQ.UTF-8/UTF-8 \
ar_IQ/ISO-8859-6 \
ar_JO.UTF-8/UTF-8 \
ar_JO/ISO-8859-6 \
ar_KW.UTF-8/UTF-8 \
ar_KW/ISO-8859-6 \
ar_LB.UTF-8/UTF-8 \
ar_LB/ISO-8859-6 \
ar_LY.UTF-8/UTF-8 \
ar_LY/ISO-8859-6 \
ar_MA.UTF-8/UTF-8 \
ar_MA/ISO-8859-6 \
ar_OM.UTF-8/UTF-8 \
ar_OM/ISO-8859-6 \
ar_QA.UTF-8/UTF-8 \
ar_QA/ISO-8859-6 \
ar_SA.UTF-8/UTF-8 \
ar_SA/ISO-8859-6 \
ar_SD.UTF-8/UTF-8 \
ar_SD/ISO-8859-6 \
ar_SS/UTF-8 \
ar_SY.UTF-8/UTF-8 \
ar_SY/ISO-8859-6 \
ar_TN.UTF-8/UTF-8 \
ar_TN/ISO-8859-6 \
ar_YE.UTF-8/UTF-8 \
ar_YE/ISO-8859-6 \
ayc_PE/UTF-8 \
az_AZ/UTF-8 \
az_IR/UTF-8 \
as_IN/UTF-8 \
ast_ES.UTF-8/UTF-8 \
ast_ES/ISO-8859-15 \
be_BY.UTF-8/UTF-8 \
be_BY/CP1251 \
be_BY@latin/UTF-8 \
bem_ZM/UTF-8 \
ber_DZ/UTF-8 \
ber_MA/UTF-8 \
bg_BG.UTF-8/UTF-8 \
bg_BG/CP1251 \
bhb_IN.UTF-8/UTF-8 \
bho_IN/UTF-8 \
bho_NP/UTF-8 \
bi_VU/UTF-8 \
bn_BD/UTF-8 \
bn_IN/UTF-8 \
bo_CN/UTF-8 \
bo_IN/UTF-8 \
br_FR.UTF-8/UTF-8 \
br_FR/ISO-8859-1 \
br_FR@euro/ISO-8859-15 \
brx_IN/UTF-8 \
bs_BA.UTF-8/UTF-8 \
bs_BA/ISO-8859-2 \
byn_ER/UTF-8 \
ca_AD.UTF-8/UTF-8 \
ca_AD/ISO-8859-15 \
ca_ES.UTF-8/UTF-8 \
ca_ES/ISO-8859-1 \
ca_ES@euro/ISO-8859-15 \
ca_ES@valencia/UTF-8 \
ca_FR.UTF-8/UTF-8 \
ca_FR/ISO-8859-15 \
ca_IT.UTF-8/UTF-8 \
ca_IT/ISO-8859-15 \
ce_RU/UTF-8 \
chr_US/UTF-8 \
cmn_TW/UTF-8 \
crh_UA/UTF-8 \
cs_CZ.UTF-8/UTF-8 \
cs_CZ/ISO-8859-2 \
csb_PL/UTF-8 \
cv_RU/UTF-8 \
cy_GB.UTF-8/UTF-8 \
cy_GB/ISO-8859-14 \
da_DK.UTF-8/UTF-8 \
da_DK/ISO-8859-1 \
da_DK.ISO-8859-15/ISO-8859-15 \
de_AT.UTF-8/UTF-8 \
de_AT/ISO-8859-1 \
de_AT@euro/ISO-8859-15 \
de_BE.UTF-8/UTF-8 \
de_BE/ISO-8859-1 \
de_BE@euro/ISO-8859-15 \
de_CH.UTF-8/UTF-8 \
de_CH/ISO-8859-1 \
de_DE.UTF-8/UTF-8 \
de_DE/ISO-8859-1 \
de_DE@euro/ISO-8859-15 \
de_IT.UTF-8/UTF-8 \
de_IT/ISO-8859-1 \
de_LI.UTF-8/UTF-8 \
de_LU.UTF-8/UTF-8 \
de_LU/ISO-8859-1 \
de_LU@euro/ISO-8859-15 \
doi_IN/UTF-8 \
dsb_DE/UTF-8 \
dv_MV/UTF-8 \
dz_BT/UTF-8 \
el_GR.UTF-8/UTF-8 \
el_GR/ISO-8859-7 \
el_GR@euro/ISO-8859-7 \
el_CY.UTF-8/UTF-8 \
el_CY/ISO-8859-7 \
en_AG/UTF-8 \
en_AU.UTF-8/UTF-8 \
en_AU/ISO-8859-1 \
en_BW.UTF-8/UTF-8 \
en_BW/ISO-8859-1 \
en_CA.UTF-8/UTF-8 \
en_CA/ISO-8859-1 \
en_DK.UTF-8/UTF-8 \
en_DK/ISO-8859-1 \
en_GB.UTF-8/UTF-8 \
en_GB/ISO-8859-1 \
en_GB.ISO-8859-15/ISO-8859-15 \
en_HK.UTF-8/UTF-8 \
en_HK/ISO-8859-1 \
en_IE.UTF-8/UTF-8 \
en_IE/ISO-8859-1 \
en_IE@euro/ISO-8859-15 \
en_IL/UTF-8 \
en_IN/UTF-8 \
en_NG/UTF-8 \
en_NZ.UTF-8/UTF-8 \
en_NZ/ISO-8859-1 \
en_PH.UTF-8/UTF-8 \
en_PH/ISO-8859-1 \
en_SC.UTF-8/UTF-8 \
en_SG.UTF-8/UTF-8 \
en_SG/ISO-8859-1 \
en_US.UTF-8/UTF-8 \
en_US/ISO-8859-1 \
en_US.ISO-8859-15/ISO-8859-15 \
en_US@ampm/UTF-8 \
en_US.UTF-8@ampm/UTF-8 \
en_ZA.UTF-8/UTF-8 \
en_ZA/ISO-8859-1 \
en_ZM/UTF-8 \
en_ZW.UTF-8/UTF-8 \
en_ZW/ISO-8859-1 \
eo/UTF-8 \
es_AR.UTF-8/UTF-8 \
es_AR/ISO-8859-1 \
es_BO.UTF-8/UTF-8 \
es_BO/ISO-8859-1 \
es_CL.UTF-8/UTF-8 \
es_CL/ISO-8859-1 \
es_CO.UTF-8/UTF-8 \
es_CO/ISO-8859-1 \
es_CR.UTF-8/UTF-8 \
es_CR/ISO-8859-1 \
es_CU/UTF-8 \
es_DO.UTF-8/UTF-8 \
es_DO/ISO-8859-1 \
es_EC.UTF-8/UTF-8 \
es_EC/ISO-8859-1 \
es_ES.UTF-8/UTF-8 \
es_ES/ISO-8859-1 \
es_ES@euro/ISO-8859-15 \
es_GT.UTF-8/UTF-8 \
es_GT/ISO-8859-1 \
es_HN.UTF-8/UTF-8 \
es_HN/ISO-8859-1 \
es_MX.UTF-8/UTF-8 \
es_MX/ISO-8859-1 \
es_NI.UTF-8/UTF-8 \
es_NI/ISO-8859-1 \
es_PA.UTF-8/UTF-8 \
es_PA/ISO-8859-1 \
es_PE.UTF-8/UTF-8 \
es_PE/ISO-8859-1 \
es_PR.UTF-8/UTF-8 \
es_PR/ISO-8859-1 \
es_PY.UTF-8/UTF-8 \
es_PY/ISO-8859-1 \
es_SV.UTF-8/UTF-8 \
es_SV/ISO-8859-1 \
es_US.UTF-8/UTF-8 \
es_US/ISO-8859-1 \
es_UY.UTF-8/UTF-8 \
es_UY/ISO-8859-1 \
es_VE.UTF-8/UTF-8 \
es_VE/ISO-8859-1 \
et_EE.UTF-8/UTF-8 \
et_EE/ISO-8859-1 \
et_EE.ISO-8859-15/ISO-8859-15 \
eu_ES.UTF-8/UTF-8 \
eu_ES/ISO-8859-1 \
eu_ES@euro/ISO-8859-15 \
fa_IR/UTF-8 \
ff_SN/UTF-8 \
fi_FI.UTF-8/UTF-8 \
fi_FI/ISO-8859-1 \
fi_FI@euro/ISO-8859-15 \
fil_PH/UTF-8 \
fo_FO.UTF-8/UTF-8 \
fo_FO/ISO-8859-1 \
fr_BE.UTF-8/UTF-8 \
fr_BE/ISO-8859-1 \
fr_BE@euro/ISO-8859-15 \
fr_CA.UTF-8/UTF-8 \
fr_CA/ISO-8859-1 \
fr_CH.UTF-8/UTF-8 \
fr_CH/ISO-8859-1 \
fr_FR.UTF-8/UTF-8 \
fr_FR/ISO-8859-1 \
fr_FR@euro/ISO-8859-15 \
fr_LU.UTF-8/UTF-8 \
fr_LU/ISO-8859-1 \
fr_LU@euro/ISO-8859-15 \
fur_IT/UTF-8 \
fy_NL/UTF-8 \
fy_DE/UTF-8 \
ga_IE.UTF-8/UTF-8 \
ga_IE/ISO-8859-1 \
ga_IE@euro/ISO-8859-15 \
gd_GB.UTF-8/UTF-8 \
gd_GB/ISO-8859-15 \
gez_ER/UTF-8 \
gez_ER@abegede/UTF-8 \
gez_ET/UTF-8 \
gez_ET@abegede/UTF-8 \
gl_ES.UTF-8/UTF-8 \
gl_ES/ISO-8859-1 \
gl_ES@euro/ISO-8859-15 \
gu_IN/UTF-8 \
gv_GB.UTF-8/UTF-8 \
gv_GB/ISO-8859-1 \
ha_NG/UTF-8 \
hak_TW/UTF-8 \
he_IL.UTF-8/UTF-8 \
he_IL/ISO-8859-8 \
hi_IN/UTF-8 \
hif_FJ/UTF-8 \
hne_IN/UTF-8 \
hr_HR.UTF-8/UTF-8 \
hr_HR/ISO-8859-2 \
hsb_DE/ISO-8859-2 \
hsb_DE.UTF-8/UTF-8 \
ht_HT/UTF-8 \
hu_HU.UTF-8/UTF-8 \
hu_HU/ISO-8859-2 \
hy_AM/UTF-8 \
hy_AM.ARMSCII-8/ARMSCII-8 \
ia_FR/UTF-8 \
id_ID.UTF-8/UTF-8 \
id_ID/ISO-8859-1 \
ig_NG/UTF-8 \
ik_CA/UTF-8 \
is_IS.UTF-8/UTF-8 \
is_IS/ISO-8859-1 \
it_CH.UTF-8/UTF-8 \
it_CH/ISO-8859-1 \
it_IT.UTF-8/UTF-8 \
it_IT/ISO-8859-1 \
it_IT@euro/ISO-8859-15 \
iu_CA/UTF-8 \
ja_JP.EUC-JP/EUC-JP \
ja_JP.UTF-8/UTF-8 \
ka_GE.UTF-8/UTF-8 \
ka_GE/GEORGIAN-PS \
kab_DZ/UTF-8 \
kk_KZ.UTF-8/UTF-8 \
kk_KZ/PT154 \
kl_GL.UTF-8/UTF-8 \
kl_GL/ISO-8859-1 \
km_KH/UTF-8 \
kn_IN/UTF-8 \
ko_KR.EUC-KR/EUC-KR \
ko_KR.UTF-8/UTF-8 \
kok_IN/UTF-8 \
ks_IN/UTF-8 \
ks_IN@devanagari/UTF-8 \
ku_TR.UTF-8/UTF-8 \
ku_TR/ISO-8859-9 \
kw_GB.UTF-8/UTF-8 \
kw_GB/ISO-8859-1 \
ky_KG/UTF-8 \
lb_LU/UTF-8 \
lg_UG.UTF-8/UTF-8 \
lg_UG/ISO-8859-10 \
li_BE/UTF-8 \
li_NL/UTF-8 \
lij_IT/UTF-8 \
ln_CD/UTF-8 \
lo_LA/UTF-8 \
lt_LT.UTF-8/UTF-8 \
lt_LT/ISO-8859-13 \
lv_LV.UTF-8/UTF-8 \
lv_LV/ISO-8859-13 \
lzh_TW/UTF-8 \
mag_IN/UTF-8 \
mai_IN/UTF-8 \
mai_NP/UTF-8 \
mfe_MU/UTF-8 \
mg_MG.UTF-8/UTF-8 \
mg_MG/ISO-8859-15 \
mhr_RU/UTF-8 \
mi_NZ.UTF-8/UTF-8 \
mi_NZ/ISO-8859-13 \
miq_NI/UTF-8 \
mjw_IN/UTF-8 \
mk_MK.UTF-8/UTF-8 \
mk_MK/ISO-8859-5 \
ml_IN/UTF-8 \
mn_MN/UTF-8 \
mni_IN/UTF-8 \
mr_IN/UTF-8 \
ms_MY.UTF-8/UTF-8 \
ms_MY/ISO-8859-1 \
mt_MT.UTF-8/UTF-8 \
mt_MT/ISO-8859-3 \
my_MM/UTF-8 \
nan_TW/UTF-8 \
nan_TW@latin/UTF-8 \
nb_NO.UTF-8/UTF-8 \
nb_NO/ISO-8859-1 \
nds_DE/UTF-8 \
nds_NL/UTF-8 \
ne_NP/UTF-8 \
nhn_MX/UTF-8 \
niu_NU/UTF-8 \
niu_NZ/UTF-8 \
nl_AW/UTF-8 \
nl_BE.UTF-8/UTF-8 \
nl_BE/ISO-8859-1 \
nl_BE@euro/ISO-8859-15 \
nl_NL.UTF-8/UTF-8 \
nl_NL/ISO-8859-1 \
nl_NL@euro/ISO-8859-15 \
nn_NO.UTF-8/UTF-8 \
nn_NO/ISO-8859-1 \
nr_ZA/UTF-8 \
nso_ZA/UTF-8 \
oc_FR.UTF-8/UTF-8 \
oc_FR/ISO-8859-1 \
om_ET/UTF-8 \
om_KE.UTF-8/UTF-8 \
om_KE/ISO-8859-1 \
or_IN/UTF-8 \
os_RU/UTF-8 \
pa_IN/UTF-8 \
pa_PK/UTF-8 \
pap_AW/UTF-8 \
pap_CW/UTF-8 \
pl_PL.UTF-8/UTF-8 \
pl_PL/ISO-8859-2 \
ps_AF/UTF-8 \
pt_BR.UTF-8/UTF-8 \
pt_BR/ISO-8859-1 \
pt_PT.UTF-8/UTF-8 \
pt_PT/ISO-8859-1 \
pt_PT@euro/ISO-8859-15 \
quz_PE/UTF-8 \
raj_IN/UTF-8 \
ro_RO.UTF-8/UTF-8 \
ro_RO/ISO-8859-2 \
ru_RU.KOI8-R/KOI8-R \
ru_RU.UTF-8/UTF-8 \
ru_RU/ISO-8859-5 \
ru_UA.UTF-8/UTF-8 \
ru_UA/KOI8-U \
rw_RW/UTF-8 \
sa_IN/UTF-8 \
sah_RU/UTF-8 \
sat_IN/UTF-8 \
sc_IT/UTF-8 \
sd_IN/UTF-8 \
sd_IN@devanagari/UTF-8 \
se_NO/UTF-8 \
sgs_LT/UTF-8 \
shn_MM/UTF-8 \
shs_CA/UTF-8 \
si_LK/UTF-8 \
sid_ET/UTF-8 \
sk_SK.UTF-8/UTF-8 \
sk_SK/ISO-8859-2 \
sl_SI.UTF-8/UTF-8 \
sl_SI/ISO-8859-2 \
sm_WS/UTF-8 \
so_DJ.UTF-8/UTF-8 \
so_DJ/ISO-8859-1 \
so_ET/UTF-8 \
so_KE.UTF-8/UTF-8 \
so_KE/ISO-8859-1 \
so_SO.UTF-8/UTF-8 \
so_SO/ISO-8859-1 \
sq_AL.UTF-8/UTF-8 \
sq_AL/ISO-8859-1 \
sq_MK/UTF-8 \
sr_ME/UTF-8 \
sr_RS/UTF-8 \
sr_RS@latin/UTF-8 \
ss_ZA/UTF-8 \
st_ZA.UTF-8/UTF-8 \
st_ZA/ISO-8859-1 \
sv_FI.UTF-8/UTF-8 \
sv_FI/ISO-8859-1 \
sv_FI@euro/ISO-8859-15 \
sv_SE.UTF-8/UTF-8 \
sv_SE/ISO-8859-1 \
sv_SE.ISO-8859-15/ISO-8859-15 \
sw_KE/UTF-8 \
sw_TZ/UTF-8 \
szl_PL/UTF-8 \
ta_IN/UTF-8 \
ta_LK/UTF-8 \
tcy_IN.UTF-8/UTF-8 \
te_IN/UTF-8 \
tg_TJ.UTF-8/UTF-8 \
tg_TJ/KOI8-T \
th_TH.UTF-8/UTF-8 \
th_TH/TIS-620 \
the_NP/UTF-8 \
ti_ER/UTF-8 \
ti_ET/UTF-8 \
tig_ER/UTF-8 \
tk_TM/UTF-8 \
tl_PH.UTF-8/UTF-8 \
tl_PH/ISO-8859-1 \
tn_ZA/UTF-8 \
to_TO/UTF-8 \
tpi_PG/UTF-8 \
tr_CY.UTF-8/UTF-8 \
tr_CY/ISO-8859-9 \
tr_TR.UTF-8/UTF-8 \
tr_TR/ISO-8859-9 \
ts_ZA/UTF-8 \
tt_RU/UTF-8 \
tt_RU@iqtelif/UTF-8 \
ug_CN/UTF-8 \
uk_UA.UTF-8/UTF-8 \
uk_UA/KOI8-U \
unm_US/UTF-8 \
ur_IN/UTF-8 \
ur_PK/UTF-8 \
uz_UZ.UTF-8/UTF-8 \
uz_UZ/ISO-8859-1 \
uz_UZ@cyrillic/UTF-8 \
ve_ZA/UTF-8 \
vi_VN/UTF-8 \
wa_BE/ISO-8859-1 \
wa_BE@euro/ISO-8859-15 \
wa_BE.UTF-8/UTF-8 \
wae_CH/UTF-8 \
wal_ET/UTF-8 \
wo_SN/UTF-8 \
xh_ZA.UTF-8/UTF-8 \
xh_ZA/ISO-8859-1 \
yi_US.UTF-8/UTF-8 \
yi_US/CP1255 \
yo_NG/UTF-8 \
yue_HK/UTF-8 \
yuw_PG/UTF-8 \
zh_CN.GB18030/GB18030 \
zh_CN.GBK/GBK \
zh_CN.UTF-8/UTF-8 \
zh_CN/GB2312 \
zh_HK.UTF-8/UTF-8 \
zh_HK/BIG5-HKSCS \
zh_SG.UTF-8/UTF-8 \
zh_SG.GBK/GBK \
zh_SG/GB2312 \
zh_TW.EUC-TW/EUC-TW \
zh_TW.UTF-8/UTF-8 \
zh_TW/BIG5 \
zu_ZA.UTF-8/UTF-8 \
zu_ZA/ISO-8859-1 \


@ -1,862 +0,0 @@
#define _GNU_SOURCE
#include <assert.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <locale.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <getopt.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include "../locale/hashval.h"
#define __LC_LAST 13
#include "../locale/locarchive.h"
#include "../crypt/md5.h"
const char *alias_file = DATADIR "/locale/locale.alias";
const char *locar_file = PREFIX "/lib/locale/locale-archive";
const char *tmpl_file = PREFIX "/lib/locale/locale-archive.tmpl";
const char *loc_path = PREFIX "/lib/locale/";
/* Flags set by `--verbose` option. */
int be_quiet = 1;
int verbose = 0;
int max_locarchive_open_retry = 10;
const char *output_prefix;
/* Endianness should have been taken care of by localedef. We don't need to do
additional swapping. We need this variable exported however, since
locarchive.c uses it to determine if it needs to swap endianness of a value
before writing to or reading from the archive. */
bool swap_endianness_p = false;
static const char *locnames[] =
{
#define DEFINE_CATEGORY(category, category_name, items, a) \
[category] = category_name,
#include "../locale/categories.def"
#undef DEFINE_CATEGORY
};
static int
is_prime (unsigned long candidate)
{
/* No even number and none less than 10 will be passed here. */
unsigned long int divn = 3;
unsigned long int sq = divn * divn;
while (sq < candidate && candidate % divn != 0)
{
++divn;
sq += 4 * divn;
++divn;
}
return candidate % divn != 0;
}
unsigned long
next_prime (unsigned long seed)
{
/* Make it definitely odd. */
seed |= 1;
while (!is_prime (seed))
seed += 2;
return seed;
}
void
error (int status, int errnum, const char *message, ...)
{
va_list args;
va_start (args, message);
fflush (stdout);
fprintf (stderr, "%s: ", program_invocation_name);
vfprintf (stderr, message, args);
va_end (args);
if (errnum)
fprintf (stderr, ": %s", strerror (errnum));
putc ('\n', stderr);
fflush (stderr);
if (status)
exit (errnum == EROFS ? 0 : status);
}
void *
xmalloc (size_t size)
{
void *p = malloc (size);
if (p == NULL)
error (EXIT_FAILURE, errno, "could not allocate %zd bytes of memory", size);
return p;
}
static void
open_tmpl_archive (struct locarhandle *ah)
{
struct stat64 st;
int fd;
struct locarhead head;
const char *archivefname = ah->fname == NULL ? tmpl_file : ah->fname;
/* Open the archive. We must have exclusive write access. */
fd = open64 (archivefname, O_RDONLY);
if (fd == -1)
error (EXIT_FAILURE, errno, "cannot open locale archive template file \"%s\"",
archivefname);
if (fstat64 (fd, &st) < 0)
error (EXIT_FAILURE, errno, "cannot stat locale archive template file \"%s\"",
archivefname);
/* Read the header. */
if (TEMP_FAILURE_RETRY (read (fd, &head, sizeof (head))) != sizeof (head))
error (EXIT_FAILURE, errno, "cannot read archive header");
ah->fd = fd;
ah->mmaped = (head.sumhash_offset
+ head.sumhash_size * sizeof (struct sumhashent));
if (ah->mmaped > (unsigned long) st.st_size)
error (EXIT_FAILURE, 0, "locale archive template file truncated");
ah->mmaped = st.st_size;
ah->reserved = st.st_size;
/* Now we know how large the administrative information part is.
Map all of it. */
ah->addr = mmap64 (NULL, ah->mmaped, PROT_READ, MAP_SHARED, fd, 0);
if (ah->addr == MAP_FAILED)
error (EXIT_FAILURE, errno, "cannot map archive header");
}
/* Open the locale archive. */
extern void open_archive (struct locarhandle *ah, bool readonly);
/* Close the locale archive. */
extern void close_archive (struct locarhandle *ah);
/* Add given locale data to the archive. */
extern int add_locale_to_archive (struct locarhandle *ah, const char *name,
locale_data_t data, bool replace);
extern void add_alias (struct locarhandle *ah, const char *alias,
bool replace, const char *oldname,
uint32_t *locrec_offset_p);
extern struct namehashent *
insert_name (struct locarhandle *ah,
const char *name, size_t name_len, bool replace);
struct nameent
{
char *name;
struct locrecent *locrec;
};
struct dataent
{
const unsigned char *sum;
uint32_t file_offset;
};
static int
nameentcmp (const void *a, const void *b)
{
struct locrecent *la = ((const struct nameent *) a)->locrec;
struct locrecent *lb = ((const struct nameent *) b)->locrec;
uint32_t start_a = -1, end_a = 0;
uint32_t start_b = -1, end_b = 0;
int cnt;
for (cnt = 0; cnt < __LC_LAST; ++cnt)
if (cnt != LC_ALL)
{
if (la->record[cnt].offset < start_a)
start_a = la->record[cnt].offset;
if (la->record[cnt].offset + la->record[cnt].len > end_a)
end_a = la->record[cnt].offset + la->record[cnt].len;
}
assert (start_a != (uint32_t)-1);
assert (end_a != 0);
for (cnt = 0; cnt < __LC_LAST; ++cnt)
if (cnt != LC_ALL)
{
if (lb->record[cnt].offset < start_b)
start_b = lb->record[cnt].offset;
if (lb->record[cnt].offset + lb->record[cnt].len > end_b)
end_b = lb->record[cnt].offset + lb->record[cnt].len;
}
assert (start_b != (uint32_t)-1);
assert (end_b != 0);
if (start_a != start_b)
return (int)start_a - (int)start_b;
return (int)end_a - (int)end_b;
}
static int
dataentcmp (const void *a, const void *b)
{
if (((const struct dataent *) a)->file_offset
< ((const struct dataent *) b)->file_offset)
return -1;
if (((const struct dataent *) a)->file_offset
> ((const struct dataent *) b)->file_offset)
return 1;
return 0;
}
static int
sumsearchfn (const void *key, const void *ent)
{
uint32_t keyn = *(uint32_t *)key;
uint32_t entn = ((struct dataent *)ent)->file_offset;
if (keyn < entn)
return -1;
if (keyn > entn)
return 1;
return 0;
}
static void
compute_data (struct locarhandle *ah, struct nameent *name, size_t sumused,
struct dataent *files, locale_data_t data)
{
int cnt;
struct locrecent *locrec = name->locrec;
struct dataent *file;
data[LC_ALL].addr = ((char *) ah->addr) + locrec->record[LC_ALL].offset;
data[LC_ALL].size = locrec->record[LC_ALL].len;
for (cnt = 0; cnt < __LC_LAST; ++cnt)
if (cnt != LC_ALL)
{
data[cnt].addr = ((char *) ah->addr) + locrec->record[cnt].offset;
data[cnt].size = locrec->record[cnt].len;
if (data[cnt].addr >= data[LC_ALL].addr
&& data[cnt].addr + data[cnt].size
<= data[LC_ALL].addr + data[LC_ALL].size)
__md5_buffer (data[cnt].addr, data[cnt].size, data[cnt].sum);
else
{
file = bsearch (&locrec->record[cnt].offset, files, sumused,
sizeof (*files), sumsearchfn);
if (file == NULL)
error (EXIT_FAILURE, 0, "inconsistent template file");
memcpy (data[cnt].sum, file->sum, sizeof (data[cnt].sum));
}
}
}
static int
fill_archive (struct locarhandle *tmpl_ah,
const char *fname,
size_t install_langs_count, char *install_langs_list[],
size_t nlist, char *list[],
const char *primary)
{
struct locarhandle ah;
struct locarhead *head;
int result = 0;
struct nameent *names;
struct namehashent *namehashtab;
size_t cnt, used;
struct dataent *files;
struct sumhashent *sumhashtab;
size_t sumused;
struct locrecent *primary_locrec = NULL;
struct nameent *primary_nameent = NULL;
head = tmpl_ah->addr;
names = (struct nameent *) malloc (head->namehash_used
* sizeof (struct nameent));
files = (struct dataent *) malloc (head->sumhash_used
* sizeof (struct dataent));
if (names == NULL || files == NULL)
error (EXIT_FAILURE, errno, "could not allocate tables");
namehashtab = (struct namehashent *) ((char *) tmpl_ah->addr
+ head->namehash_offset);
sumhashtab = (struct sumhashent *) ((char *) tmpl_ah->addr
+ head->sumhash_offset);
for (cnt = used = 0; cnt < head->namehash_size; ++cnt)
if (namehashtab[cnt].locrec_offset != 0)
{
char * name;
int i;
assert (used < head->namehash_used);
name = tmpl_ah->addr + namehashtab[cnt].name_offset;
if (install_langs_count == 0)
{
/* Always install the entry. */
names[used].name = name;
names[used++].locrec
= (struct locrecent *) ((char *) tmpl_ah->addr +
namehashtab[cnt].locrec_offset);
}
else
{
/* Only install the entry if the user asked for it via
--install-langs. */
for (i = 0; i < install_langs_count; i++)
{
/* Add one for "_" and one for the null terminator. */
size_t len = strlen (install_langs_list[i]) + 2;
char *install_lang = (char *)xmalloc (len);
strcpy (install_lang, install_langs_list[i]);
if (strchr (install_lang, '_') == NULL)
strcat (install_lang, "_");
if (strncmp (name, install_lang, strlen (install_lang)) == 0)
{
names[used].name = name;
names[used++].locrec
= (struct locrecent *) ((char *)tmpl_ah->addr
+ namehashtab[cnt].locrec_offset);
}
free (install_lang);
}
}
}
/* Sort the names. */
qsort (names, used, sizeof (struct nameent), nameentcmp);
for (cnt = sumused = 0; cnt < head->sumhash_size; ++cnt)
if (sumhashtab[cnt].file_offset != 0)
{
assert (sumused < head->sumhash_used);
files[sumused].sum = (const unsigned char *) sumhashtab[cnt].sum;
files[sumused++].file_offset = sumhashtab[cnt].file_offset;
}
/* Sort by file locations. */
qsort (files, sumused, sizeof (struct dataent), dataentcmp);
/* Open the archive. This call never returns if we cannot
successfully open the archive. */
ah.fname = NULL;
if (fname != NULL)
ah.fname = fname;
open_archive (&ah, false);
if (primary != NULL)
{
for (cnt = 0; cnt < used; ++cnt)
if (strcmp (names[cnt].name, primary) == 0)
break;
if (cnt < used)
{
locale_data_t data;
compute_data (tmpl_ah, &names[cnt], sumused, files, data);
result |= add_locale_to_archive (&ah, primary, data, 0);
primary_locrec = names[cnt].locrec;
primary_nameent = &names[cnt];
}
}
for (cnt = 0; cnt < used; ++cnt)
if (&names[cnt] == primary_nameent)
continue;
else if ((cnt > 0 && names[cnt - 1].locrec == names[cnt].locrec)
|| names[cnt].locrec == primary_locrec)
{
const char *oldname;
struct namehashent *namehashent;
uint32_t locrec_offset;
if (names[cnt].locrec == primary_locrec)
oldname = primary;
else
oldname = names[cnt - 1].name;
namehashent = insert_name (&ah, oldname, strlen (oldname), true);
assert (namehashent->name_offset != 0);
assert (namehashent->locrec_offset != 0);
locrec_offset = namehashent->locrec_offset;
add_alias (&ah, names[cnt].name, 0, oldname, &locrec_offset);
}
else
{
locale_data_t data;
compute_data (tmpl_ah, &names[cnt], sumused, files, data);
result |= add_locale_to_archive (&ah, names[cnt].name, data, 0);
}
while (nlist-- > 0)
{
const char *fname = *list++;
size_t fnamelen = strlen (fname);
struct stat64 st;
DIR *dirp;
struct dirent64 *d;
int seen;
locale_data_t data;
int cnt;
/* First see whether this really is a directory and whether it
contains all the required locale category files. */
if (stat64 (fname, &st) < 0)
{
error (0, 0, "stat of \"%s\" failed: %s: ignored", fname,
strerror (errno));
continue;
}
if (!S_ISDIR (st.st_mode))
{
error (0, 0, "\"%s\" is no directory; ignored", fname);
continue;
}
dirp = opendir (fname);
if (dirp == NULL)
{
error (0, 0, "cannot open directory \"%s\": %s: ignored",
fname, strerror (errno));
continue;
}
seen = 0;
while ((d = readdir64 (dirp)) != NULL)
{
for (cnt = 0; cnt < __LC_LAST; ++cnt)
if (cnt != LC_ALL)
if (strcmp (d->d_name, locnames[cnt]) == 0)
{
unsigned char d_type;
/* We have an object of the required name. If it's
a directory we have to look at a file with the
prefix "SYS_". Otherwise we have found what we
are looking for. */
#ifdef _DIRENT_HAVE_D_TYPE
d_type = d->d_type;
if (d_type != DT_REG)
#endif
{
char fullname[fnamelen + 2 * strlen (d->d_name) + 7];
#ifdef _DIRENT_HAVE_D_TYPE
if (d_type == DT_UNKNOWN || d_type == DT_LNK)
#endif
{
strcpy (stpcpy (stpcpy (fullname, fname), "/"),
d->d_name);
if (stat64 (fullname, &st) == -1)
/* We cannot stat the file, ignore it. */
break;
d_type = IFTODT (st.st_mode);
}
if (d_type == DT_DIR)
{
/* We have to do more tests. The file is a
directory and it therefore must contain a
regular file with the same name except a
"SYS_" prefix. */
char *t = stpcpy (stpcpy (fullname, fname), "/");
strcpy (stpcpy (stpcpy (t, d->d_name), "/SYS_"),
d->d_name);
if (stat64 (fullname, &st) == -1)
/* There is no SYS_* file or we cannot
access it. */
break;
d_type = IFTODT (st.st_mode);
}
}
/* If we found a regular file (eventually after
following a symlink) we are successful. */
if (d_type == DT_REG)
++seen;
break;
}
}
closedir (dirp);
if (seen != __LC_LAST - 1)
{
/* We don't have all locale category files. Ignore the name. */
error (0, 0, "incomplete set of locale files in \"%s\"",
fname);
continue;
}
/* Add the files to the archive. To do this we first compute
sizes and the MD5 sums of all the files. */
for (cnt = 0; cnt < __LC_LAST; ++cnt)
if (cnt != LC_ALL)
{
char fullname[fnamelen + 2 * strlen (locnames[cnt]) + 7];
int fd;
strcpy (stpcpy (stpcpy (fullname, fname), "/"), locnames[cnt]);
fd = open64 (fullname, O_RDONLY);
if (fd == -1 || fstat64 (fd, &st) == -1)
{
/* Cannot read the file. */
if (fd != -1)
close (fd);
break;
}
if (S_ISDIR (st.st_mode))
{
char *t;
close (fd);
t = stpcpy (stpcpy (fullname, fname), "/");
strcpy (stpcpy (stpcpy (t, locnames[cnt]), "/SYS_"),
locnames[cnt]);
fd = open64 (fullname, O_RDONLY);
if (fd == -1 || fstat64 (fd, &st) == -1
|| !S_ISREG (st.st_mode))
{
if (fd != -1)
close (fd);
break;
}
}
/* Map the file. */
data[cnt].addr = mmap64 (NULL, st.st_size, PROT_READ, MAP_SHARED,
fd, 0);
if (data[cnt].addr == MAP_FAILED)
{
/* Cannot map it. */
close (fd);
break;
}
data[cnt].size = st.st_size;
__md5_buffer (data[cnt].addr, st.st_size, data[cnt].sum);
/* We don't need the file descriptor anymore. */
close (fd);
}
if (cnt != __LC_LAST)
{
while (cnt-- > 0)
if (cnt != LC_ALL)
munmap (data[cnt].addr, data[cnt].size);
error (0, 0, "cannot read all files in \"%s\": ignored", fname);
continue;
}
result |= add_locale_to_archive (&ah, basename (fname), data, 0);
for (cnt = 0; cnt < __LC_LAST; ++cnt)
if (cnt != LC_ALL)
munmap (data[cnt].addr, data[cnt].size);
}
/* We are done. */
close_archive (&ah);
return result;
}
void usage()
{
printf ("\
Usage: build-locale-archive [OPTION]... [TEMPLATE-FILE] [ARCHIVE-FILE]\n\
Builds a locale archive from a template file.\n\
Options:\n\
-h, --help Print this usage message.\n\
-v, --verbose Verbose execution.\n\
-l, --install-langs=LIST Only include locales given in LIST into the \n\
locale archive. LIST is a colon separated list\n\
of locale prefixes, for example \"de:en:ja\".\n\
The special argument \"all\" means to install\n\
all languages and it must be present by itself.\n\
If \"all\" is present with any other language it\n\
will be treated as the name of a locale.\n\
If the --install-langs option is missing, all\n\
locales are installed. The colon separated list\n\
can contain any strings matching the beginning of\n\
locale names.\n\
If a string does not contain a \"_\", it is added.\n\
Examples:\n\
--install-langs=\"en\"\n\
installs en_US, en_US.iso88591,\n\
en_US.iso885915, en_US.utf8,\n\
en_GB ...\n\
--install-langs=\"en_US.utf8\"\n\
installs only en_US.utf8.\n\
--install-langs=\"ko\"\n\
installs ko_KR, ko_KR.euckr,\n\
ko_KR.utf8 but *not* kok_IN\n\
because \"ko\" does not contain\n\
\"_\" and it is silently added\n\
--install-langs\"ko:kok\"\n\
installs ko_KR, ko_KR.euckr,\n\
ko_KR.utf8, kok_IN, and\n\
kok_IN.utf8.\n\
--install-langs=\"POSIX\" will\n\
install *no* locales at all\n\
because POSIX matches none of\n\
the locales. Actually, any string\n\
matching nothing will do that.\n\
POSIX and C will always be\n\
available because they are\n\
builtin.\n\
Aliases are installed as well,\n\
i.e. --install-langs=\"de\"\n\
will install not only every locale starting with\n\
\"de\" but also the aliases \"deutsch\"\n\
and \"german\", although the latter does not\n\
start with \"de\".\n\
\n\
If the arguments TEMPLATE-FILE and ARCHIVE-FILE are not given, the default\n\
locations where the glibc in use expects these files are used.\n\
");
}
int main (int argc, char *argv[])
{
char path[4096];
DIR *dirp;
struct dirent64 *d;
struct stat64 st;
char *list[16384], *primary;
char *lang;
int install_langs_count = 0;
int i;
char *install_langs_arg, *ila_start;
char **install_langs_list = NULL;
unsigned int cnt = 0;
struct locarhandle tmpl_ah;
char *new_locar_fname = NULL;
size_t loc_path_len = strlen (loc_path);
while (1)
{
int c;
static struct option long_options[] =
{
{"help", no_argument, 0, 'h'},
{"verbose", no_argument, 0, 'v'},
{"install-langs", required_argument, 0, 'l'},
{0, 0, 0, 0}
};
/* getopt_long stores the option index here. */
int option_index = 0;
c = getopt_long (argc, argv, "vhl:",
long_options, &option_index);
/* Detect the end of the options. */
if (c == -1)
break;
switch (c)
{
case 0:
printf ("unknown option %s", long_options[option_index].name);
if (optarg)
printf (" with arg %s", optarg);
printf ("\n");
usage ();
exit (1);
case 'v':
verbose = 1;
be_quiet = 0;
break;
case 'h':
usage ();
exit (0);
case 'l':
install_langs_arg = ila_start = strdup (optarg);
/* If the argument to --install-langs is "all", do
not limit the list of languages to install and install
them all. We do not support installing a single locale
called "all". */
#define MAGIC_INSTALL_ALL "all"
if (install_langs_arg != NULL
&& install_langs_arg[0] != '\0'
&& !(strncmp(install_langs_arg, MAGIC_INSTALL_ALL,
strlen(MAGIC_INSTALL_ALL)) == 0
&& strlen (install_langs_arg) == 3))
{
/* Count the number of languages we will install. */
while (true)
{
lang = strtok(install_langs_arg, ":;,");
if (lang == NULL)
break;
install_langs_count++;
install_langs_arg = NULL;
}
free (ila_start);
/* Reject an entire string made up of delimiters. */
if (install_langs_count == 0)
break;
/* Copy the list. */
install_langs_list = (char **)xmalloc (sizeof(char *) * install_langs_count);
install_langs_arg = ila_start = strdup (optarg);
install_langs_count = 0;
while (true)
{
lang = strtok(install_langs_arg, ":;,");
if (lang == NULL)
break;
install_langs_list[install_langs_count] = lang;
install_langs_count++;
install_langs_arg = NULL;
}
}
break;
case '?':
/* getopt_long already printed an error message. */
usage ();
exit (0);
default:
abort ();
}
}
tmpl_ah.fname = NULL;
if (optind < argc)
tmpl_ah.fname = argv[optind];
if (optind + 1 < argc)
new_locar_fname = argv[optind + 1];
if (verbose)
{
if (tmpl_ah.fname)
printf("input archive file specified on command line: %s\n",
tmpl_ah.fname);
else
printf("using default input archive file.\n");
if (new_locar_fname)
printf("output archive file specified on command line: %s\n",
new_locar_fname);
else
printf("using default output archive file.\n");
}
dirp = opendir (loc_path);
if (dirp == NULL)
error (EXIT_FAILURE, errno, "cannot open directory \"%s\"", loc_path);
open_tmpl_archive (&tmpl_ah);
if (new_locar_fname)
unlink (new_locar_fname);
else
unlink (locar_file);
primary = getenv ("LC_ALL");
if (primary == NULL)
primary = getenv ("LANG");
if (primary != NULL)
{
if (strncmp (primary, "ja", 2) != 0
&& strncmp (primary, "ko", 2) != 0
&& strncmp (primary, "zh", 2) != 0)
{
char *ptr = malloc (strlen (primary) + strlen (".utf8") + 1), *p, *q;
/* This leads to invalid locales sometimes:
de_DE.iso885915@euro -> de_DE.utf8@euro */
if (ptr != NULL)
{
p = ptr;
q = primary;
while (*q && *q != '.' && *q != '@')
*p++ = *q++;
if (*q == '.')
while (*q && *q != '@')
q++;
p = stpcpy (p, ".utf8");
strcpy (p, q);
primary = ptr;
}
else
primary = NULL;
}
}
memcpy (path, loc_path, loc_path_len);
while ((d = readdir64 (dirp)) != NULL)
{
if (strcmp (d->d_name, ".") == 0 || strcmp (d->d_name, "..") == 0)
continue;
if (strchr (d->d_name, '_') == NULL)
continue;
size_t d_name_len = strlen (d->d_name);
if (loc_path_len + d_name_len + 1 > sizeof (path))
{
error (0, 0, "too long filename \"%s\"", d->d_name);
continue;
}
memcpy (path + loc_path_len, d->d_name, d_name_len + 1);
if (stat64 (path, &st) < 0)
{
error (0, errno, "cannot stat \"%s\"", path);
continue;
}
if (! S_ISDIR (st.st_mode))
continue;
if (cnt == 16384)
{
error (0, 0, "too many directories in \"%s\"", loc_path);
break;
}
list[cnt] = strdup (path);
if (list[cnt] == NULL)
{
error (0, errno, "cannot add file to list \"%s\"", path);
continue;
}
if (primary != NULL && cnt > 0 && strcmp (primary, d->d_name) == 0)
{
char *p = list[0];
list[0] = list[cnt];
list[cnt] = p;
}
cnt++;
}
closedir (dirp);
/* Store the archive to the file specified as the second argument on the
command line or the default locale archive. */
fill_archive (&tmpl_ah, new_locar_fname,
install_langs_count, install_langs_list,
cnt, list, primary);
close_archive (&tmpl_ah);
truncate (tmpl_file, 0);
if (install_langs_count > 0)
{
free (ila_start);
free (install_langs_list);
}
char *tz_argv[] = { "/usr/sbin/tzdata-update", NULL };
execve (tz_argv[0], (char *const *)tz_argv, (char *const *)&tz_argv[1]);
exit (0);
}
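
The prefix-matching rule spelled out in the usage text ("if a string does not contain a \"_\", it is added") is easy to check in isolation. Below is a small sketch of that normalization; lang_matches is an illustrative helper name, not part of the original program.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Return 1 if LOCALE is selected by the --install-langs entry LANG.  */
static int
lang_matches (const char *locale, const char *lang)
{
  /* Room for a trailing '_' and the NUL terminator.  */
  char *pattern = malloc (strlen (lang) + 2);
  if (pattern == NULL)
    return 0;
  strcpy (pattern, lang);
  if (strchr (pattern, '_') == NULL)
    strcat (pattern, "_");
  int match = strncmp (locale, pattern, strlen (pattern)) == 0;
  free (pattern);
  return match;
}

int
main (void)
{
  printf ("%d\n", lang_matches ("ko_KR.utf8", "ko"));      /* 1 */
  printf ("%d\n", lang_matches ("kok_IN", "ko"));          /* 0: "ko_" is used */
  printf ("%d\n", lang_matches ("en_US.utf8", "en_US"));   /* 1 */
  return 0;
}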

dist

@ -1 +0,0 @@
an8_10


@ -1 +0,0 @@
c81d2388896379997bc359d4f2084239 glibc-2.28.tar.xz


@ -0,0 +1,39 @@
From fc60db3cf29ba157d09ba4f4b92e3ab382b0339d Mon Sep 17 00:00:00 2001
From: Xi Ruoyao <xry111@xry111.site>
Date: Wed, 9 Aug 2023 19:12:54 +0800
Subject: [PATCH 04/29] elf: Add new LoongArch reloc types (101 to 108) into
elf.h
These reloc types are generated by GNU assembler >= 2.41 for relaxation
support.
Link: https://sourceware.org/git/?p=binutils-gdb.git;a=commitdiff;h=57a930e3
Signed-off-by: Xi Ruoyao <xry111@xry111.site>
Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
elf/elf.h | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/elf/elf.h b/elf/elf.h
index 89fc8021..d623bdeb 100644
--- a/elf/elf.h
+++ b/elf/elf.h
@@ -4205,6 +4205,14 @@ enum
#define R_LARCH_TLS_GD_HI20 98
#define R_LARCH_32_PCREL 99
#define R_LARCH_RELAX 100
+#define R_LARCH_DELETE 101
+#define R_LARCH_ALIGN 102
+#define R_LARCH_PCREL20_S2 103
+#define R_LARCH_CFA 104
+#define R_LARCH_ADD6 105
+#define R_LARCH_SUB6 106
+#define R_LARCH_ADD_ULEB128 107
+#define R_LARCH_SUB_ULEB128 108
/* ARC specific declarations. */
--
2.33.0
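
The new constants are plain numbers, so they can be sanity-checked outside glibc. Here is a small pretty-printer sketch; the values are copied from the patch above, and in a patched tree they would come from <elf.h> instead of local defines.

#include <stdio.h>

#define R_LARCH_DELETE      101
#define R_LARCH_ALIGN       102
#define R_LARCH_PCREL20_S2  103
#define R_LARCH_CFA         104
#define R_LARCH_ADD6        105
#define R_LARCH_SUB6        106
#define R_LARCH_ADD_ULEB128 107
#define R_LARCH_SUB_ULEB128 108

/* Map a relocation number added by the patch to its symbolic name.  */
static const char *
larch_reloc_name (unsigned int type)
{
  switch (type)
    {
    case R_LARCH_DELETE:      return "R_LARCH_DELETE";
    case R_LARCH_ALIGN:       return "R_LARCH_ALIGN";
    case R_LARCH_PCREL20_S2:  return "R_LARCH_PCREL20_S2";
    case R_LARCH_CFA:         return "R_LARCH_CFA";
    case R_LARCH_ADD6:        return "R_LARCH_ADD6";
    case R_LARCH_SUB6:        return "R_LARCH_SUB6";
    case R_LARCH_ADD_ULEB128: return "R_LARCH_ADD_ULEB128";
    case R_LARCH_SUB_ULEB128: return "R_LARCH_SUB_ULEB128";
    default:                  return "unknown";
    }
}

int
main (void)
{
  for (unsigned int t = 101; t <= 108; t++)
    printf ("%u -> %s\n", t, larch_reloc_name (t));
  return 0;
}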

File diff suppressed because it is too large.


@ -1,29 +0,0 @@
From dc2d26d52c129c47fa1f16bd0157cd20c6d9a958 Mon Sep 17 00:00:00 2001
From: caiyinyu <caiyinyu@loongson.cn>
Date: Wed, 21 Jun 2023 11:55:02 +0800
Subject: [PATCH 08/14] glibc-2.28: Add new struct user_fp_state in user.h
Change-Id: Idc233cc11c8f76b624dc2891b432f4d02a53cebc
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
sysdeps/unix/sysv/linux/loongarch/sys/user.h | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/sysdeps/unix/sysv/linux/loongarch/sys/user.h b/sysdeps/unix/sysv/linux/loongarch/sys/user.h
index f9108350..21e340f6 100644
--- a/sysdeps/unix/sysv/linux/loongarch/sys/user.h
+++ b/sysdeps/unix/sysv/linux/loongarch/sys/user.h
@@ -28,4 +28,10 @@ struct user_regs_struct
uint64_t reserved[11];
};
+struct user_fp_struct {
+ uint64_t fpr[32];
+ uint64_t fcc;
+ uint32_t fcsr;
+};
+
#endif /* _SYS_USER_H */
--
2.33.0
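
A quick layout check for the structure added above. This is a sketch, not part of the patch: the expected size assumes a typical LP64 ABI where uint64_t is 8-byte aligned, so the uint32_t fcsr is followed by 4 bytes of tail padding.

#include <assert.h>
#include <stdint.h>

/* Mirror of the definition added to sys/user.h above.  */
struct user_fp_struct {
  uint64_t fpr[32];
  uint64_t fcc;
  uint32_t fcsr;
};

/* 32 FP registers + fcc + fcsr, tail-padded to 8 bytes: 272 in total.  */
static_assert (sizeof (struct user_fp_struct) == 32 * 8 + 8 + 8,
               "unexpected user_fp_struct size");

int
main (void)
{
  return 0;
}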


@ -1,162 +0,0 @@
From 647a0a28e5c9aed2f1fa59bbb7595133e7a4e62f Mon Sep 17 00:00:00 2001
From: caiyinyu <caiyinyu@loongson.cn>
Date: Mon, 24 Apr 2023 18:09:55 +0800
Subject: [PATCH 03/14] glibc-2.28: Fix ifunc str/mem functions xfail problems.
Change-Id: Ibff4229fcfef23c0b19fb94b21a4d17b49eceec6
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
.../lp64/multiarch/ifunc-impl-list.c | 76 +++++++++----------
1 file changed, 38 insertions(+), 38 deletions(-)
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
index c2b6bbf7..fdeae797 100644
--- a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
@@ -36,105 +36,105 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
size_t i = 0;
IFUNC_IMPL (i, name, memcpy,
- IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_lasx)
- IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_lsx)
+ IFUNC_IMPL_ADD (array, i, memcpy, SUPPORT_LASX, __memcpy_lasx)
+ IFUNC_IMPL_ADD (array, i, memcpy, SUPPORT_LSX, __memcpy_lsx)
+ IFUNC_IMPL_ADD (array, i, memcpy, SUPPORT_UAL, __memcpy_unaligned)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_aligned)
- IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_unaligned)
)
IFUNC_IMPL (i, name, memmove,
- IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_lasx)
- IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_lsx)
+ IFUNC_IMPL_ADD (array, i, memmove, SUPPORT_LASX, __memmove_lasx)
+ IFUNC_IMPL_ADD (array, i, memmove, SUPPORT_LSX, __memmove_lsx)
+ IFUNC_IMPL_ADD (array, i, memmove, SUPPORT_UAL, __memmove_unaligned)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_aligned)
- IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_unaligned)
)
IFUNC_IMPL (i, name, memset,
- IFUNC_IMPL_ADD (array, i, memset, 1, __memset_lasx)
- IFUNC_IMPL_ADD (array, i, memset, 1, __memset_lsx)
+ IFUNC_IMPL_ADD (array, i, memset, SUPPORT_LASX, __memset_lasx)
+ IFUNC_IMPL_ADD (array, i, memset, SUPPORT_LSX, __memset_lsx)
+ IFUNC_IMPL_ADD (array, i, memset, SUPPORT_UAL, __memset_unaligned)
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_aligned)
- IFUNC_IMPL_ADD (array, i, memset, 1, __memset_unaligned)
)
IFUNC_IMPL (i, name, memchr,
- IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_lasx)
- IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_lsx)
+ IFUNC_IMPL_ADD (array, i, memchr, SUPPORT_LASX, __memchr_lasx)
+ IFUNC_IMPL_ADD (array, i, memchr, SUPPORT_LSX, __memchr_lsx)
IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_aligned)
)
IFUNC_IMPL (i, name, memrchr,
- IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_lasx)
- IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_lsx)
+ IFUNC_IMPL_ADD (array, i, memrchr, SUPPORT_LASX, __memrchr_lasx)
+ IFUNC_IMPL_ADD (array, i, memrchr, SUPPORT_LSX, __memrchr_lsx)
IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_generic)
)
IFUNC_IMPL (i, name, memcmp,
- IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_lasx)
- IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_lsx)
+ IFUNC_IMPL_ADD (array, i, memcmp, SUPPORT_LASX, __memcmp_lasx)
+ IFUNC_IMPL_ADD (array, i, memcmp, SUPPORT_LSX, __memcmp_lsx)
IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_aligned)
)
IFUNC_IMPL (i, name, rawmemchr,
- IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_lasx)
- IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_lsx)
+ IFUNC_IMPL_ADD (array, i, rawmemchr, SUPPORT_LASX, __rawmemchr_lasx)
+ IFUNC_IMPL_ADD (array, i, rawmemchr, SUPPORT_LSX, __rawmemchr_lsx)
IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_aligned)
)
IFUNC_IMPL (i, name, strchr,
- IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_lasx)
- IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_lsx)
+ IFUNC_IMPL_ADD (array, i, strchr, SUPPORT_LASX, __strchr_lasx)
+ IFUNC_IMPL_ADD (array, i, strchr, SUPPORT_LSX, __strchr_lsx)
+ IFUNC_IMPL_ADD (array, i, strchr, SUPPORT_UAL, __strchr_unaligned)
IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_aligned)
- IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_unaligned)
)
IFUNC_IMPL (i, name, strrchr,
- IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_lasx)
- IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_lsx)
+ IFUNC_IMPL_ADD (array, i, strrchr, SUPPORT_LASX, __strrchr_lasx)
+ IFUNC_IMPL_ADD (array, i, strrchr, SUPPORT_LSX, __strrchr_lsx)
IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_aligned)
)
IFUNC_IMPL (i, name, strlen,
- IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_lasx)
- IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_lsx)
+ IFUNC_IMPL_ADD (array, i, strlen, SUPPORT_LASX, __strlen_lasx)
+ IFUNC_IMPL_ADD (array, i, strlen, SUPPORT_LSX, __strlen_lsx)
+ IFUNC_IMPL_ADD (array, i, strlen, SUPPORT_UAL, __strlen_unaligned)
IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_aligned)
- IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_unaligned)
)
IFUNC_IMPL (i, name, strnlen,
- IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_lasx)
- IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_lsx)
+ IFUNC_IMPL_ADD (array, i, strnlen, SUPPORT_LASX, __strnlen_lasx)
+ IFUNC_IMPL_ADD (array, i, strnlen, SUPPORT_LSX, __strnlen_lsx)
+ IFUNC_IMPL_ADD (array, i, strnlen, SUPPORT_UAL, __strnlen_unaligned)
IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_aligned)
- IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_unaligned)
)
IFUNC_IMPL (i, name, strchrnul,
- IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_lasx)
- IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_lsx)
+ IFUNC_IMPL_ADD (array, i, strchrnul, SUPPORT_LASX, __strchrnul_lasx)
+ IFUNC_IMPL_ADD (array, i, strchrnul, SUPPORT_LSX, __strchrnul_lsx)
+ IFUNC_IMPL_ADD (array, i, strchrnul, SUPPORT_UAL, __strchrnul_unaligned)
IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_aligned)
- IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_unaligned)
)
IFUNC_IMPL (i, name, strncmp,
- IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_lsx)
+ IFUNC_IMPL_ADD (array, i, strncmp, SUPPORT_LSX, __strncmp_lsx)
+ IFUNC_IMPL_ADD (array, i, strncmp, SUPPORT_UAL, __strncmp_unaligned)
IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_aligned)
- IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_unaligned)
)
IFUNC_IMPL (i, name, strcpy,
- IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_lsx)
+ IFUNC_IMPL_ADD (array, i, strcpy, SUPPORT_LSX, __strcpy_lsx)
+ IFUNC_IMPL_ADD (array, i, strcpy, SUPPORT_UAL, __strcpy_unaligned)
IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_aligned)
- IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_unaligned)
)
IFUNC_IMPL (i, name, stpcpy,
- IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_lsx)
+ IFUNC_IMPL_ADD (array, i, stpcpy, SUPPORT_LSX, __stpcpy_lsx)
IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_aligned)
)
IFUNC_IMPL (i, name, strcmp,
- IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_lsx)
+ IFUNC_IMPL_ADD (array, i, strcmp, SUPPORT_LSX, __strcmp_lsx)
+ IFUNC_IMPL_ADD (array, i, strcmp, SUPPORT_UAL, __strcmp_unaligned)
IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_aligned)
- IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_unaligned)
)
return i;
--
2.33.0
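
The point of the 1 -> SUPPORT_* change is that the third argument of IFUNC_IMPL_ADD tells the test harness whether a variant is actually usable on the running CPU; advertising the LSX/LASX/unaligned variants unconditionally is what made the str/mem tests xfail on hardware without those extensions. For readers unfamiliar with the mechanism, below is a minimal standalone sketch of a GNU ifunc whose resolver selects an implementation from the AT_HWCAP bits, much as SUPPORT_LSX/SUPPORT_LASX gate the variants above. It assumes a GCC/binutils toolchain with ifunc support; the function names and the tested bit are illustrative.

#include <stdio.h>
#include <sys/auxv.h>

static int impl_generic (void) { return 0; }
static int impl_fancy (void) { return 1; }

/* The resolver runs once, at relocation time, and returns the
   implementation to bind.  */
static int (*resolve_which (void)) (void)
{
  unsigned long hwcap = getauxval (AT_HWCAP);
  /* Bit 0 is an illustrative stand-in for a real feature bit such as
     HWCAP_LOONGARCH_LSX.  */
  return (hwcap & 1UL) ? impl_fancy : impl_generic;
}

int which (void) __attribute__ ((ifunc ("resolve_which")));

int
main (void)
{
  printf ("selected implementation: %d\n", which ());
  return 0;
}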


@ -1,57 +0,0 @@
From 00537d6945e71af8c9b0b1e7c2695f6a9a1ef1f5 Mon Sep 17 00:00:00 2001
From: caiyinyu <caiyinyu@loongson.cn>
Date: Sun, 25 Jun 2023 16:23:25 +0800
Subject: [PATCH 09/14] glibc-2.28: Redefine macro LEAF/ENTRY.
Both of the following usages of the LEAF/ENTRY macros are valid (a
standalone sketch of the default-argument trick follows this patch):
1. LEAF(fcn) -- fcn gets the default alignment, .align 3
2. LEAF(fcn, 6) -- fcn gets .align 6
Change-Id: Ie3df4df8dba5259b665bd0e4702aaab0a09a5f65
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
sysdeps/loongarch/sys/asm.h | 15 ++++++++++-----
1 file changed, 10 insertions(+), 5 deletions(-)
diff --git a/sysdeps/loongarch/sys/asm.h b/sysdeps/loongarch/sys/asm.h
index 357a5ba3..734e45ae 100644
--- a/sysdeps/loongarch/sys/asm.h
+++ b/sysdeps/loongarch/sys/asm.h
@@ -26,16 +26,21 @@
#endif
-/* Declare leaf routine. */
-#define LEAF(symbol, aln) \
+/* Declare leaf routine.
+ The usage of macro LEAF/ENTRY is as follows:
+ 1. LEAF(fcn) -- the align value of fcn is .align 3 (default value)
+ 2. LEAF(fcn, 6) -- the align value of fcn is .align 6
+*/
+#define LEAF_IMPL(symbol, aln, ...) \
.text; \
.globl symbol; \
.align aln; \
.type symbol, @function; \
symbol: \
- cfi_startproc; \
+ cfi_startproc;
-# define ENTRY(symbol, aln) LEAF(symbol, aln)
+#define LEAF(...) LEAF_IMPL(__VA_ARGS__, 3)
+#define ENTRY(...) LEAF(__VA_ARGS__)
#define LEAF_NO_ALIGN(symbol) \
.text; \
@@ -44,7 +49,7 @@ symbol: \
symbol: \
cfi_startproc;
-# define ENTRY_NO_ALIGN(symbol) LEAF_NO_ALIGN(symbol)
+#define ENTRY_NO_ALIGN(symbol) LEAF_NO_ALIGN(symbol)
/* Mark end of function. */
#undef END
--
2.33.0
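
The default-argument trick behind the new LEAF/ENTRY definition also works as an ordinary C macro, which makes it easy to study in isolation. In the sketch below (ALIGN/ALIGN_IMPL are illustrative names), the trailing 3 in the wrapper supplies the default when the caller passes no alignment; note that an empty variadic argument list is a GNU extension accepted by GCC and only standardized in C23.

#include <stdio.h>

#define ALIGN_IMPL(name, aln, ...) printf ("%s: .align %d\n", name, aln)
#define ALIGN(...) ALIGN_IMPL (__VA_ARGS__, 3) /* trailing 3 is the default */

int
main (void)
{
  ALIGN ("fcn");    /* one argument:  prints "fcn: .align 3" */
  ALIGN ("fcn", 6); /* two arguments: prints "fcn: .align 6" */
  return 0;
}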


@ -1,306 +0,0 @@
From 27a004c9777340afd86fc0d129f6ffad508bf090 Mon Sep 17 00:00:00 2001
From: caiyinyu <caiyinyu@loongson.cn>
Date: Tue, 11 Jul 2023 16:09:55 +0800
Subject: [PATCH 12/14] glibc-2.28: Refactor code and fix bug in
_dl_runtime_resolve.
Change-Id: I4907e6643ef25b87d7862e957ce9bf6d201da816
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
sysdeps/loongarch/dl-machine.h | 8 +-
sysdeps/loongarch/dl-trampoline.S | 7 ++
sysdeps/loongarch/dl-trampoline.h | 159 +++++++++++++-----------------
sysdeps/loongarch/sys/asm.h | 9 ++
4 files changed, 90 insertions(+), 93 deletions(-)
diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h
index 6e9c6258..ff520a07 100644
--- a/sysdeps/loongarch/dl-machine.h
+++ b/sysdeps/loongarch/dl-machine.h
@@ -381,9 +381,13 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
/* If using PLTs, fill in the first two entries of .got.plt. */
if (l->l_info[DT_JMPREL])
{
- extern void _dl_runtime_resolve (void) __attribute__ ((visibility ("hidden")));
+
+#if HAVE_LOONGARCH_VEC_ASM && !defined __loongarch_soft_float
extern void _dl_runtime_resolve_lasx (void) __attribute__ ((visibility ("hidden")));
extern void _dl_runtime_resolve_lsx (void) __attribute__ ((visibility ("hidden")));
+#endif
+ extern void _dl_runtime_resolve (void) __attribute__ ((visibility ("hidden")));
+
ElfW(Addr) *gotplt = (ElfW(Addr) *) D_PTR (l, l_info[DT_PLTGOT]);
/* If a library is prelinked but we have to relocate anyway,
we have to be able to undo the prelinking of .got.plt.
@@ -391,11 +395,13 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
if (gotplt[1])
l->l_mach.plt = gotplt[1] + l->l_addr;
+#if HAVE_LOONGARCH_VEC_ASM && !defined __loongarch_soft_float
if (SUPPORT_LASX)
gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve_lasx;
else if (SUPPORT_LSX)
gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve_lsx;
else
+#endif
gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve;
gotplt[1] = (ElfW(Addr)) l;
diff --git a/sysdeps/loongarch/dl-trampoline.S b/sysdeps/loongarch/dl-trampoline.S
index 5f627a63..78d741f3 100644
--- a/sysdeps/loongarch/dl-trampoline.S
+++ b/sysdeps/loongarch/dl-trampoline.S
@@ -16,16 +16,23 @@
License along with the GNU C Library. If not, see
<http://www.gnu.org/licenses/>. */
+#include <sysdep.h>
+#include <sys/asm.h>
+
+#if HAVE_LOONGARCH_VEC_ASM && !defined __loongarch_soft_float
#define USE_LASX
#define _dl_runtime_resolve _dl_runtime_resolve_lasx
#include "dl-trampoline.h"
+#undef FRAME_SIZE
#undef USE_LASX
#undef _dl_runtime_resolve
#define USE_LSX
#define _dl_runtime_resolve _dl_runtime_resolve_lsx
#include "dl-trampoline.h"
+#undef FRAME_SIZE
#undef USE_LSX
#undef _dl_runtime_resolve
+#endif
#include "dl-trampoline.h"
diff --git a/sysdeps/loongarch/dl-trampoline.h b/sysdeps/loongarch/dl-trampoline.h
index 96f41f1d..9a6d9b6c 100644
--- a/sysdeps/loongarch/dl-trampoline.h
+++ b/sysdeps/loongarch/dl-trampoline.h
@@ -17,31 +17,24 @@
License along with the GNU C Library. If not, see
<http://www.gnu.org/licenses/>. */
-#include <sysdep.h>
-#include <sys/asm.h>
-
/* Assembler veneer called from the PLT header code for lazy loading.
The PLT header passes its own args in t0-t2. */
-
-#ifdef __loongarch_soft_float
-# define FRAME_SIZE (-((-10 * SZREG) & ALMASK))
+#ifdef USE_LASX
+# define FRAME_SIZE (-((-9 * SZREG - 8 * SZFREG - 8 * SZXREG) & ALMASK))
+#elif defined USE_LSX
+# define FRAME_SIZE (-((-9 * SZREG - 8 * SZFREG - 8 * SZVREG) & ALMASK))
+#elif !defined __loongarch_soft_float
+# define FRAME_SIZE (-((-9 * SZREG - 8 * SZFREG) & ALMASK))
#else
-# define FRAME_SIZE (-((-10 * SZREG - 8 * 256) & ALMASK))
+# define FRAME_SIZE (-((-9 * SZREG) & ALMASK))
#endif
ENTRY (_dl_runtime_resolve, 3)
- # Save arguments to stack.
-
-#ifdef __loongarch64
- li.d t3, -FRAME_SIZE
- add.d sp, sp, t3
-#elif defined __loongarch32
- li.w t3, -FRAME_SIZE
- add.w sp, sp, t3
-#endif
+ /* Save arguments to stack. */
+ ADDI sp, sp, -FRAME_SIZE
- REG_S ra, sp, 9*SZREG
+ REG_S ra, sp, 0*SZREG
REG_S a0, sp, 1*SZREG
REG_S a1, sp, 2*SZREG
REG_S a2, sp, 3*SZREG
@@ -51,55 +44,45 @@ ENTRY (_dl_runtime_resolve, 3)
REG_S a6, sp, 7*SZREG
REG_S a7, sp, 8*SZREG
-#ifndef __loongarch_soft_float
- FREG_S fa0, sp, 10*SZREG + 0*SZFREG
- FREG_S fa1, sp, 10*SZREG + 1*SZFREG
- FREG_S fa2, sp, 10*SZREG + 2*SZFREG
- FREG_S fa3, sp, 10*SZREG + 3*SZFREG
- FREG_S fa4, sp, 10*SZREG + 4*SZFREG
- FREG_S fa5, sp, 10*SZREG + 5*SZFREG
- FREG_S fa6, sp, 10*SZREG + 6*SZFREG
- FREG_S fa7, sp, 10*SZREG + 7*SZFREG
#ifdef USE_LASX
- xvst xr0, sp, 10*SZREG + 0*256
- xvst xr1, sp, 10*SZREG + 1*256
- xvst xr2, sp, 10*SZREG + 2*256
- xvst xr3, sp, 10*SZREG + 3*256
- xvst xr4, sp, 10*SZREG + 4*256
- xvst xr5, sp, 10*SZREG + 5*256
- xvst xr6, sp, 10*SZREG + 6*256
- xvst xr7, sp, 10*SZREG + 7*256
+ xvst xr0, sp, 9*SZREG + 8*SZFREG + 0*SZXREG
+ xvst xr1, sp, 9*SZREG + 8*SZFREG + 1*SZXREG
+ xvst xr2, sp, 9*SZREG + 8*SZFREG + 2*SZXREG
+ xvst xr3, sp, 9*SZREG + 8*SZFREG + 3*SZXREG
+ xvst xr4, sp, 9*SZREG + 8*SZFREG + 4*SZXREG
+ xvst xr5, sp, 9*SZREG + 8*SZFREG + 5*SZXREG
+ xvst xr6, sp, 9*SZREG + 8*SZFREG + 6*SZXREG
+ xvst xr7, sp, 9*SZREG + 8*SZFREG + 7*SZXREG
#elif defined USE_LSX
- vst vr0, sp, 10*SZREG + 0*128
- vst vr1, sp, 10*SZREG + 1*128
- vst vr2, sp, 10*SZREG + 2*128
- vst vr3, sp, 10*SZREG + 3*128
- vst vr4, sp, 10*SZREG + 4*128
- vst vr5, sp, 10*SZREG + 5*128
- vst vr6, sp, 10*SZREG + 6*128
- vst vr7, sp, 10*SZREG + 7*128
-#endif
+ vst vr0, sp, 9*SZREG + 8*SZFREG + 0*SZVREG
+ vst vr1, sp, 9*SZREG + 8*SZFREG + 1*SZVREG
+ vst vr2, sp, 9*SZREG + 8*SZFREG + 2*SZVREG
+ vst vr3, sp, 9*SZREG + 8*SZFREG + 3*SZVREG
+ vst vr4, sp, 9*SZREG + 8*SZFREG + 4*SZVREG
+ vst vr5, sp, 9*SZREG + 8*SZFREG + 5*SZVREG
+ vst vr6, sp, 9*SZREG + 8*SZFREG + 6*SZVREG
+ vst vr7, sp, 9*SZREG + 8*SZFREG + 7*SZVREG
+#elif !defined __loongarch_soft_float
+ FREG_S fa0, sp, 9*SZREG + 0*SZFREG
+ FREG_S fa1, sp, 9*SZREG + 1*SZFREG
+ FREG_S fa2, sp, 9*SZREG + 2*SZFREG
+ FREG_S fa3, sp, 9*SZREG + 3*SZFREG
+ FREG_S fa4, sp, 9*SZREG + 4*SZFREG
+ FREG_S fa5, sp, 9*SZREG + 5*SZFREG
+ FREG_S fa6, sp, 9*SZREG + 6*SZFREG
+ FREG_S fa7, sp, 9*SZREG + 7*SZFREG
#endif
- # Update .got.plt and obtain runtime address of callee.
-#ifdef __loongarch64
- slli.d a1, t1, 1
+ /* Update .got.plt and obtain runtime address of callee */
+ SLLI a1, t1, 1
or a0, t0, zero
- add.d a1, a1, t1
+ ADD a1, a1, t1
la a2, _dl_fixup
jirl ra, a2, 0
or t1, v0, zero
-#elif defined __loongarch32
- slli.w a1, t1, 1
- or a0, t0, zero
- add.w a1, a1, t1
- la a2, _dl_fixup
- jirl ra, a2, 0
- or t1, v0, zero
-#endif
- # Restore arguments from stack.
- REG_L ra, sp, 9*SZREG
+ /* Restore arguments from stack. */
+ REG_L ra, sp, 0*SZREG
REG_L a0, sp, 1*SZREG
REG_L a1, sp, 2*SZREG
REG_L a2, sp, 3*SZREG
@@ -109,45 +92,37 @@ ENTRY (_dl_runtime_resolve, 3)
REG_L a6, sp, 7*SZREG
REG_L a7, sp, 8*SZREG
-#ifndef __loongarch_soft_float
- FREG_L fa0, sp, 10*SZREG + 0*SZFREG
- FREG_L fa1, sp, 10*SZREG + 1*SZFREG
- FREG_L fa2, sp, 10*SZREG + 2*SZFREG
- FREG_L fa3, sp, 10*SZREG + 3*SZFREG
- FREG_L fa4, sp, 10*SZREG + 4*SZFREG
- FREG_L fa5, sp, 10*SZREG + 5*SZFREG
- FREG_L fa6, sp, 10*SZREG + 6*SZFREG
- FREG_L fa7, sp, 10*SZREG + 7*SZFREG
#ifdef USE_LASX
- xvld xr0, sp, 10*SZREG + 0*256
- xvld xr1, sp, 10*SZREG + 1*256
- xvld xr2, sp, 10*SZREG + 2*256
- xvld xr3, sp, 10*SZREG + 3*256
- xvld xr4, sp, 10*SZREG + 4*256
- xvld xr5, sp, 10*SZREG + 5*256
- xvld xr6, sp, 10*SZREG + 6*256
- xvld xr7, sp, 10*SZREG + 7*256
+ xvld xr0, sp, 9*SZREG + 8*SZFREG + 0*SZXREG
+ xvld xr1, sp, 9*SZREG + 8*SZFREG + 1*SZXREG
+ xvld xr2, sp, 9*SZREG + 8*SZFREG + 2*SZXREG
+ xvld xr3, sp, 9*SZREG + 8*SZFREG + 3*SZXREG
+ xvld xr4, sp, 9*SZREG + 8*SZFREG + 4*SZXREG
+ xvld xr5, sp, 9*SZREG + 8*SZFREG + 5*SZXREG
+ xvld xr6, sp, 9*SZREG + 8*SZFREG + 6*SZXREG
+ xvld xr7, sp, 9*SZREG + 8*SZFREG + 7*SZXREG
#elif defined USE_LSX
- vld vr0, sp, 10*SZREG + 0*128
- vld vr1, sp, 10*SZREG + 1*128
- vld vr2, sp, 10*SZREG + 2*128
- vld vr3, sp, 10*SZREG + 3*128
- vld vr4, sp, 10*SZREG + 4*128
- vld vr5, sp, 10*SZREG + 5*128
- vld vr6, sp, 10*SZREG + 6*128
- vld vr7, sp, 10*SZREG + 7*128
-#endif
-#endif
-
-#ifdef __loongarch64
- li.d t3, FRAME_SIZE
- add.d sp, sp, t3
-#elif defined __loongarch32
- li.w t3, FRAME_SIZE
- addi.w sp, sp, FRAME_SIZE
+ vld vr0, sp, 9*SZREG + 8*SZFREG + 0*SZVREG
+ vld vr1, sp, 9*SZREG + 8*SZFREG + 1*SZVREG
+ vld vr2, sp, 9*SZREG + 8*SZFREG + 2*SZVREG
+ vld vr3, sp, 9*SZREG + 8*SZFREG + 3*SZVREG
+ vld vr4, sp, 9*SZREG + 8*SZFREG + 4*SZVREG
+ vld vr5, sp, 9*SZREG + 8*SZFREG + 5*SZVREG
+ vld vr6, sp, 9*SZREG + 8*SZFREG + 6*SZVREG
+ vld vr7, sp, 9*SZREG + 8*SZFREG + 7*SZVREG
+#elif !defined __loongarch_soft_float
+ FREG_L fa0, sp, 9*SZREG + 0*SZFREG
+ FREG_L fa1, sp, 9*SZREG + 1*SZFREG
+ FREG_L fa2, sp, 9*SZREG + 2*SZFREG
+ FREG_L fa3, sp, 9*SZREG + 3*SZFREG
+ FREG_L fa4, sp, 9*SZREG + 4*SZFREG
+ FREG_L fa5, sp, 9*SZREG + 5*SZFREG
+ FREG_L fa6, sp, 9*SZREG + 6*SZFREG
+ FREG_L fa7, sp, 9*SZREG + 7*SZFREG
#endif
+ ADDI sp, sp, FRAME_SIZE
- # Invoke the callee.
+ /* Invoke the callee. */
jirl zero, t1, 0
END (_dl_runtime_resolve)
diff --git a/sysdeps/loongarch/sys/asm.h b/sysdeps/loongarch/sys/asm.h
index 734e45ae..e80c6245 100644
--- a/sysdeps/loongarch/sys/asm.h
+++ b/sysdeps/loongarch/sys/asm.h
@@ -9,8 +9,17 @@
# define PTRLOG 3
# define SZREG 8
# define SZFREG 8
+# define SZVREG 16
+# define SZXREG 32
# define REG_L ld.d
# define REG_S st.d
+# define SRLI srli.d
+# define SLLI slli.d
+# define ADDI addi.d
+# define ADD add.d
+# define SUB sub.d
+# define BSTRINS bstrins.d
+# define LI li.d
# define FREG_L fld.d
# define FREG_S fst.d
#elif defined __loongarch32
--
2.33.0
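
The FRAME_SIZE definitions above all use the idiom -((-x) & ALMASK) to round the register save area up to the stack-alignment granule. A quick check of the arithmetic, assuming the LP64 register sizes from sys/asm.h and 16-byte alignment (the ALMASK value here is an assumption, not quoted from the tree):

#include <stdio.h>

#define SZREG  8        /* general register, LP64 */
#define SZFREG 8        /* floating-point register */
#define SZXREG 32       /* LASX register */
#define ALMASK (~15L)   /* assumption: 16-byte stack alignment */

int
main (void)
{
  long lasx = -((-9 * SZREG - 8 * SZFREG - 8 * SZXREG) & ALMASK);
  long soft = -((-9 * SZREG) & ALMASK);
  printf ("LASX frame: %ld bytes\n", lasx);       /* 392 rounds up to 400 */
  printf ("soft-float frame: %ld bytes\n", soft); /* 72 rounds up to 80 */
  return 0;
}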

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -1,292 +0,0 @@
From e2dd1f13592fa3b99b70eb54cc61e9f98cdcb123 Mon Sep 17 00:00:00 2001
From: caiyinyu <caiyinyu@loongson.cn>
Date: Mon, 17 Apr 2023 17:20:04 +0800
Subject: [PATCH 01/14] glibc-2.28: Remove useless ANDROID_CHANGES and related
code.
Change-Id: Ib08e92d435126c7b56096ff6f24f1c6b5ea57f46
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
sysdeps/loongarch/lp64/memchr.S | 6 ------
sysdeps/loongarch/lp64/memcpy.S | 13 -------------
sysdeps/loongarch/lp64/memset.S | 6 ------
sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S | 6 ------
.../loongarch/lp64/multiarch/memmove-unaligned.S | 6 ------
sysdeps/loongarch/lp64/multiarch/memset-unaligned.S | 7 -------
sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S | 2 --
.../loongarch/lp64/multiarch/strchrnul-unaligned.S | 2 --
sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S | 2 --
sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S | 2 --
.../loongarch/lp64/multiarch/strncmp-unaligned.S | 2 --
.../loongarch/lp64/multiarch/strnlen-unaligned.S | 2 --
12 files changed, 56 deletions(-)
diff --git a/sysdeps/loongarch/lp64/memchr.S b/sysdeps/loongarch/lp64/memchr.S
index ec34b1af..75c4e15c 100644
--- a/sysdeps/loongarch/lp64/memchr.S
+++ b/sysdeps/loongarch/lp64/memchr.S
@@ -11,11 +11,7 @@
#define MEMCHR_NAME memchr
#endif
-#ifdef ANDROID_CHANGES
-LEAF(MEMCHR_NAME, 0)
-#else
LEAF(MEMCHR_NAME)
-#endif
.align 6
beqz a2, L(out)
andi t1, a0, 0x7
@@ -92,8 +88,6 @@ L(out):
jr ra
END(MEMCHR_NAME)
-#ifndef ANDROID_CHANGES
#ifdef _LIBC
libc_hidden_builtin_def (MEMCHR_NAME)
#endif
-#endif
diff --git a/sysdeps/loongarch/lp64/memcpy.S b/sysdeps/loongarch/lp64/memcpy.S
index 1076e678..b6ca60a1 100644
--- a/sysdeps/loongarch/lp64/memcpy.S
+++ b/sysdeps/loongarch/lp64/memcpy.S
@@ -35,29 +35,18 @@
st.d t6, reg, n+48; \
st.d t7, reg, n+56;
-#ifdef ANDROID_CHANGES
-LEAF(MEMMOVE_NAME, 0)
-#else
LEAF(MEMMOVE_NAME)
-#endif
-
.align 6
sub.d t0, a0, a1
bltu t0, a2, L(copy_back)
END(MEMMOVE_NAME)
-#ifndef ANDROID_CHANGES
#ifdef _LIBC
libc_hidden_builtin_def (MEMMOVE_NAME)
#endif
-#endif
-#ifdef ANDROID_CHANGES
-LEAF(MEMCPY_NAME, 0)
-#else
LEAF(MEMCPY_NAME)
-#endif
srai.d a3, a2, 4
beqz a3, L(short_data) # less than 16 bytes
@@ -811,8 +800,6 @@ L(back_end):
END(MEMCPY_NAME)
-#ifndef ANDROID_CHANGES
#ifdef _LIBC
libc_hidden_builtin_def (MEMCPY_NAME)
#endif
-#endif
diff --git a/sysdeps/loongarch/lp64/memset.S b/sysdeps/loongarch/lp64/memset.S
index 9fe42b24..41629e7e 100644
--- a/sysdeps/loongarch/lp64/memset.S
+++ b/sysdeps/loongarch/lp64/memset.S
@@ -21,11 +21,7 @@
st.d a1, a0, n+48; \
st.d a1, a0, n+56;
-#ifdef ANDROID_CHANGES
-LEAF(MEMSET_NAME, 0)
-#else
LEAF(MEMSET_NAME)
-#endif
.align 6
move t0, a0
andi a3, a0, 0x7
@@ -166,8 +162,6 @@ L(short_0):
END(MEMSET_NAME)
-#ifndef ANDROID_CHANGES
#ifdef _LIBC
libc_hidden_builtin_def (MEMSET_NAME)
#endif
-#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S
index 5e38df0d..64b60244 100644
--- a/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S
+++ b/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S
@@ -31,11 +31,7 @@
st.d t6, reg, n+48; \
st.d t7, reg, n+56;
-#ifdef ANDROID_CHANGES
-LEAF(MEMCPY_NAME, 0)
-#else
LEAF(MEMCPY_NAME)
-#endif
//1st var: dst ptr: void *a1 $r4 a0
//2nd var: src ptr: void *a2 $r5 a1
@@ -250,10 +246,8 @@ end_0_8_unalign:
END(MEMCPY_NAME)
-#ifndef ANDROID_CHANGES
#ifdef _LIBC
libc_hidden_builtin_def (MEMCPY_NAME)
#endif
-#endif
#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S
index 27ed0c9c..42920a1a 100644
--- a/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S
+++ b/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S
@@ -100,11 +100,7 @@
LD_64(a4, -1024); \
ST_64(a3, -1024);
-#ifdef ANDROID_CHANGES
-LEAF(MEMMOVE_NAME, 0)
-#else
LEAF(MEMMOVE_NAME)
-#endif
//1st var: dest ptr: void *str1 $r4 a0
//2nd var: src ptr: void *str2 $r5 a1
@@ -469,10 +465,8 @@ end_unalign_proc_back:
END(MEMMOVE_NAME)
-#ifndef ANDROID_CHANGES
#ifdef _LIBC
libc_hidden_builtin_def (MEMMOVE_NAME)
#endif
-#endif
#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S
index 16ff2ef7..54e51546 100644
--- a/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S
+++ b/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S
@@ -33,12 +33,7 @@
//2nd var: int val $5 a1
//3rd var: size_t num $6 a2
-#ifdef ANDROID_CHANGES
-LEAF(MEMSET_NAME, 0)
-#else
LEAF(MEMSET_NAME)
-#endif
-
.align 6
bstrins.d a1, a1, 15, 8
add.d t7, a0, a2
@@ -168,10 +163,8 @@ end_0_8_unalign:
END(MEMSET_NAME)
-#ifndef ANDROID_CHANGES
#ifdef _LIBC
libc_hidden_builtin_def (MEMSET_NAME)
#endif
-#endif
#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S
index 1d5e56c5..de6c7f4f 100644
--- a/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S
+++ b/sysdeps/loongarch/lp64/multiarch/strchr-unaligned.S
@@ -123,10 +123,8 @@ L(_mc8_a):
jr ra
END(STRCHR_NAME)
-#ifndef ANDROID_CHANGES
#ifdef _LIBC
libc_hidden_builtin_def (STRCHR_NAME)
#endif
-#endif
#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S
index 6338d005..abc246ca 100644
--- a/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S
+++ b/sysdeps/loongarch/lp64/multiarch/strchrnul-unaligned.S
@@ -136,11 +136,9 @@ L(_mc8_a):
jr ra
END(STRCHRNUL_NAME)
-#ifndef ANDROID_CHANGES
#ifdef _LIBC
weak_alias(STRCHRNUL_NAME, strchrnul)
libc_hidden_builtin_def (STRCHRNUL_NAME)
#endif
-#endif
#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S
index 449733cb..c77dc1a9 100644
--- a/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S
+++ b/sysdeps/loongarch/lp64/multiarch/strcpy-unaligned.S
@@ -190,10 +190,8 @@ strcpy_page_cross:
beqz has_nul, strcpy_page_cross_ok
b strcpy_end
END(STRCPY)
-#ifndef ANDROID_CHANGES
#ifdef _LIBC
libc_hidden_builtin_def (STRCPY)
#endif
-#endif
#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S
index e9b7cf67..2fe0fb34 100644
--- a/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S
+++ b/sysdeps/loongarch/lp64/multiarch/strlen-unaligned.S
@@ -107,10 +107,8 @@ strlen_loop_noascii:
jr ra
END(STRLEN)
-#ifndef ANDROID_CHANGES
#ifdef _LIBC
libc_hidden_builtin_def (STRLEN)
#endif
-#endif
#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S
index 558df29b..6ec107ca 100644
--- a/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S
+++ b/sysdeps/loongarch/lp64/multiarch/strncmp-unaligned.S
@@ -248,10 +248,8 @@ strncmp_ret0:
then exchange(src1,src2). */
END(STRNCMP)
-#ifndef ANDROID_CHANGES
#ifdef _LIBC
libc_hidden_builtin_def (STRNCMP)
#endif
-#endif
#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S b/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S
index 60eccf00..4a195b7c 100644
--- a/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S
+++ b/sysdeps/loongarch/lp64/multiarch/strnlen-unaligned.S
@@ -136,10 +136,8 @@ L(_hit_limit):
move len, limit
jr ra
END(STRNLEN)
-#ifndef ANDROID_CHANGES
#ifdef _LIBC
libc_hidden_builtin_def (STRNLEN)
#endif
-#endif
#endif
--
2.33.0


@ -1,40 +0,0 @@
From f4041e5da609a9f5da966fa000c00b150788a948 Mon Sep 17 00:00:00 2001
From: caiyinyu <caiyinyu@loongson.cn>
Date: Sun, 23 Jul 2023 14:32:08 +0800
Subject: [PATCH 13/14] glibc-2.28: Remove useless LA{264,364,464} and
 IS_LA{264,364,464} macros.
Change-Id: Id9a573510e2a493151191372d651f381ec2aefe7
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
sysdeps/unix/sysv/linux/loongarch/cpu-features.h | 7 -------
1 file changed, 7 deletions(-)
diff --git a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
index b46a8489..2703d4f7 100644
--- a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
+++ b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
@@ -22,10 +22,6 @@
#include <stdint.h>
#include <sys/auxv.h>
-#define LA264 0x14a000
-#define LA364 0x14b000
-#define LA464 0x14c011
-
struct cpu_features
{
uint64_t cpucfg_prid;
@@ -42,9 +38,6 @@ extern const struct cpu_features *_dl_larch_get_cpu_features (void)
:"=r"(ret) \
:"r"(index));
-#define IS_LA264(prid) (prid == LA264)
-#define IS_LA364(prid) (prid == LA364)
-#define IS_LA464(prid) (prid == LA464)
#define SUPPORT_UAL (GLRO (dl_hwcap) & HWCAP_LOONGARCH_UAL)
#define SUPPORT_LSX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LSX)
#define SUPPORT_LASX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LASX)
--
2.33.0


@ -1,123 +0,0 @@
From c94d9376e241dc52eb9f2a2107313b7836e0e9ad Mon Sep 17 00:00:00 2001
From: caiyinyu <caiyinyu@loongson.cn>
Date: Wed, 6 Sep 2023 16:41:09 +0800
Subject: [PATCH 14/14] glibc-2.28: Use RTLD_SUPPORT_{LSX, LASX} to choose
_dl_runtime_resolve.
Key Points:
1. On lasx & lsx platforms, use _dl_runtime_resolve_{lsx, lasx} to save vector registers.
2. Via "tunables", users can choose str/mem functions with
`export GLIBC_TUNABLES=glibc.cpu.hwcaps=LASX`.
Note: glibc.cpu.hwcaps doesn't affect _dl_runtime_resolve_{lsx, lasx} selection.
Usage Notes:
1. Only valid inputs: LASX, LSX, UAL. Case-sensitive, comma-separated, no spaces.
2. Example: `export GLIBC_TUNABLES=glibc.cpu.hwcaps=LASX,UAL` turns on LASX & UAL.
Unmentioned features are turned off. With the default ifunc order lasx > lsx >
unaligned > aligned > generic, the effective order becomes lasx > unaligned >
aligned > generic (lsx is disabled).
3. Incorrect GLIBC_TUNABLES settings will show error messages.
4. Valid input examples:
- GLIBC_TUNABLES=glibc.cpu.hwcaps=LASX: lasx > aligned > generic.
- GLIBC_TUNABLES=glibc.cpu.hwcaps=LSX,UAL: lsx > unaligned > aligned > generic.
- GLIBC_TUNABLES=glibc.cpu.hwcaps=LASX,UAL,LASX,UAL,LSX,LASX,UAL: Repetitions
allowed but not recommended. Results in: lasx > lsx > unaligned > aligned >
generic.
Change-Id: I555ce2039bc36bf071fc9265d7b0bb7b93b96ae7
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
sysdeps/loongarch/cpu-tunables.c | 2 +-
sysdeps/loongarch/dl-machine.h | 11 ++++++-----
sysdeps/unix/sysv/linux/loongarch/cpu-features.c | 2 ++
sysdeps/unix/sysv/linux/loongarch/cpu-features.h | 10 +++++++---
4 files changed, 16 insertions(+), 9 deletions(-)
diff --git a/sysdeps/loongarch/cpu-tunables.c b/sysdeps/loongarch/cpu-tunables.c
index 840c1b8c..e0799ca9 100644
--- a/sysdeps/loongarch/cpu-tunables.c
+++ b/sysdeps/loongarch/cpu-tunables.c
@@ -88,7 +88,7 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
}
while (*c != '\0');
- GLRO (dl_hwcap) &= hwcap;
+ GLRO (dl_larch_cpu_features).hwcap &= hwcap;
}
#endif
diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h
index ff520a07..b5f43c84 100644
--- a/sysdeps/loongarch/dl-machine.h
+++ b/sysdeps/loongarch/dl-machine.h
@@ -75,13 +75,14 @@ dl_platform_init (void)
GLRO(dl_platform) = NULL;
#ifdef SHARED
+ /* init_cpu_features has been called early from __libc_start_main in
+ static executable. */
+ init_cpu_features (&GLRO(dl_larch_cpu_features));
#if HAVE_TUNABLES
TUNABLE_GET (hwcaps, tunable_val_t *, TUNABLE_CALLBACK (set_hwcaps));
#endif
- /* init_cpu_features has been called early from __libc_start_main in
- static executable. */
- init_cpu_features (&GLRO(dl_larch_cpu_features));
+
#endif
}
@@ -396,9 +397,9 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
l->l_mach.plt = gotplt[1] + l->l_addr;
#if HAVE_LOONGARCH_VEC_ASM && !defined __loongarch_soft_float
- if (SUPPORT_LASX)
+ if (RTLD_SUPPORT_LASX)
gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve_lasx;
- else if (SUPPORT_LSX)
+ else if (RTLD_SUPPORT_LSX)
gotplt[0] = (ElfW(Addr)) &_dl_runtime_resolve_lsx;
else
#endif
diff --git a/sysdeps/unix/sysv/linux/loongarch/cpu-features.c b/sysdeps/unix/sysv/linux/loongarch/cpu-features.c
index 80870f3c..cf015011 100644
--- a/sysdeps/unix/sysv/linux/loongarch/cpu-features.c
+++ b/sysdeps/unix/sysv/linux/loongarch/cpu-features.c
@@ -29,4 +29,6 @@ init_cpu_features (struct cpu_features *cpu_features)
__cpucfg(cpucfg_word, 2);
cpu_features->cpucfg_word_idx2 = cpucfg_word;
+
+ GLRO (dl_larch_cpu_features).hwcap = GLRO (dl_hwcap);
}
diff --git a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
index 2703d4f7..17c9f5a7 100644
--- a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
+++ b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
@@ -26,6 +26,7 @@ struct cpu_features
{
uint64_t cpucfg_prid;
uint64_t cpucfg_word_idx2;
+ uint64_t hwcap;
};
/* Get a pointer to the CPU features structure. */
@@ -38,9 +39,12 @@ extern const struct cpu_features *_dl_larch_get_cpu_features (void)
:"=r"(ret) \
:"r"(index));
-#define SUPPORT_UAL (GLRO (dl_hwcap) & HWCAP_LOONGARCH_UAL)
-#define SUPPORT_LSX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LSX)
-#define SUPPORT_LASX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LASX)
+#define SUPPORT_UAL (GLRO (dl_larch_cpu_features).hwcap & HWCAP_LOONGARCH_UAL)
+#define SUPPORT_LSX (GLRO (dl_larch_cpu_features).hwcap & HWCAP_LOONGARCH_LSX)
+#define SUPPORT_LASX (GLRO (dl_larch_cpu_features).hwcap & HWCAP_LOONGARCH_LASX)
+
+#define RTLD_SUPPORT_LSX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LSX)
+#define RTLD_SUPPORT_LASX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LASX)
#endif /* _CPU_FEATURES_LOONGARCH64_H */
--
2.33.0
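
The tunable callback above works by ANDing a mask built from the GLIBC_TUNABLES string into the saved hwcap, so unmentioned features drop out. A hedged sketch of that select-by-mask idea; the parsing loop and the HWCAP_* bit positions are placeholders, not glibc's actual parser or the kernel's bit assignments:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define HWCAP_LOONGARCH_UAL  (1 << 0)   /* placeholder bit values */
#define HWCAP_LOONGARCH_LSX  (1 << 1)
#define HWCAP_LOONGARCH_LASX (1 << 2)

static uint64_t
parse_hwcaps (const char *s)
{
  uint64_t mask = 0;
  char buf[64];
  strncpy (buf, s, sizeof buf - 1);
  buf[sizeof buf - 1] = '\0';
  for (char *tok = strtok (buf, ","); tok != NULL; tok = strtok (NULL, ","))
    {
      if (strcmp (tok, "UAL") == 0)  mask |= HWCAP_LOONGARCH_UAL;
      if (strcmp (tok, "LSX") == 0)  mask |= HWCAP_LOONGARCH_LSX;
      if (strcmp (tok, "LASX") == 0) mask |= HWCAP_LOONGARCH_LASX;
    }
  return mask;
}

int
main (void)
{
  uint64_t hwcap = HWCAP_LOONGARCH_UAL | HWCAP_LOONGARCH_LSX
                   | HWCAP_LOONGARCH_LASX;  /* as reported by the kernel */
  hwcap &= parse_hwcaps ("LASX,UAL");       /* tunable filter */
  printf ("LSX still enabled: %d\n", (int) !!(hwcap & HWCAP_LOONGARCH_LSX)); /* 0 */
  return 0;
}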


@ -1,91 +0,0 @@
From 58b1f882644f839259505dde3205e226a1c649f1 Mon Sep 17 00:00:00 2001
From: caiyinyu <caiyinyu@loongson.cn>
Date: Tue, 11 Jul 2023 15:42:26 +0800
Subject: [PATCH 10/14] glibc-2.28: config: Added HAVE_LOONGARCH_VEC_ASM.
Change-Id: Iea464ea0c975a351682a60f66251167f6c79385b
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
config.h.in | 5 +++++
sysdeps/loongarch/configure | 28 ++++++++++++++++++++++++++++
sysdeps/loongarch/configure.ac | 15 +++++++++++++++
3 files changed, 48 insertions(+)
diff --git a/config.h.in b/config.h.in
index 94d5ea36..fa53cc2d 100644
--- a/config.h.in
+++ b/config.h.in
@@ -123,6 +123,11 @@
/* RISC-V floating-point ABI for ld.so. */
#undef RISCV_ABI_FLEN
+/* Whether the assembler supports LoongArch LASX/LSX vector instructions.
+ This macro becomes obsolete once glibc raises the minimum
+ required version of GNU 'binutils' to 2.41 or later. */
+#define HAVE_LOONGARCH_VEC_ASM 0
+
/* Linux specific: minimum supported kernel version. */
#undef __LINUX_KERNEL_VERSION
diff --git a/sysdeps/loongarch/configure b/sysdeps/loongarch/configure
index 1e5abf81..0f0dae3a 100755
--- a/sysdeps/loongarch/configure
+++ b/sysdeps/loongarch/configure
@@ -2,3 +2,31 @@
# Local configure fragment for sysdeps/loongarch/elf.
#AC_DEFINE(PI_STATIC_AND_HIDDEN)
+
+# Check if the assembler supports vector instructions.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for vector support in assembler" >&5
+$as_echo_n "checking for vector support in assembler... " >&6; }
+if ${libc_cv_loongarch_vec_asm+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ cat > conftest.s <<\EOF
+ vld $vr0, $sp, 0
+EOF
+if { ac_try='${CC-cc} -c $CFLAGS conftest.s -o conftest 1>&5'
+ { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+ (eval $ac_try) 2>&5
+ ac_status=$?
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; }; }; then
+ libc_cv_loongarch_vec_asm=yes
+else
+ libc_cv_loongarch_vec_asm=no
+fi
+rm -f conftest*
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_loongarch_vec_asm" >&5
+$as_echo "$libc_cv_loongarch_vec_asm" >&6; }
+if test $libc_cv_loongarch_vec_asm = yes; then
+ $as_echo "#define HAVE_LOONGARCH_VEC_ASM 1" >>confdefs.h
+
+fi
diff --git a/sysdeps/loongarch/configure.ac b/sysdeps/loongarch/configure.ac
index 67b46ce0..aac0efa9 100644
--- a/sysdeps/loongarch/configure.ac
+++ b/sysdeps/loongarch/configure.ac
@@ -4,3 +4,18 @@ GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory.
dnl It is always possible to access static and hidden symbols in an
dnl position independent way.
#AC_DEFINE(PI_STATIC_AND_HIDDEN)
+
+# Check if the assembler supports vector instructions.
+AC_CACHE_CHECK(for vector support in assembler, libc_cv_loongarch_vec_asm, [dnl
+cat > conftest.s <<\EOF
+ vld $vr0, $sp, 0
+EOF
+if AC_TRY_COMMAND(${CC-cc} -c $CFLAGS conftest.s -o conftest 1>&AS_MESSAGE_LOG_FD); then
+ libc_cv_loongarch_vec_asm=yes
+else
+ libc_cv_loongarch_vec_asm=no
+fi
+rm -f conftest*])
+if test $libc_cv_loongarch_vec_asm = yes; then
+ AC_DEFINE(HAVE_LOONGARCH_VEC_ASM)
+fi
--
2.33.0
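
The configure fragment compiles a one-line conftest.s to learn whether the assembler accepts LSX. A hand-run equivalent of the probe, meaningful only with a LoongArch toolchain:

/* Compiles only if the assembler understands the LSX vld instruction;
   the load itself just reads 16 bytes from the stack.  */
int
main (void)
{
  __asm__ volatile ("vld $vr0, $sp, 0");
  return 0;
}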


@ -1,75 +0,0 @@
From 0153532f680527c4378a10673518cabda2e02584 Mon Sep 17 00:00:00 2001
From: caiyinyu <caiyinyu@loongson.cn>
Date: Fri, 26 May 2023 14:58:39 +0800
Subject: [PATCH 05/14] glibc-2.28: remove ABILPX32 related code.
Change-Id: I73eb5bc4d4ca12e4d45ed6b533fa38d60a3a633f
Signed-off-by: ticat_fp <fanpeng@loongson.cn>
---
elf/elf.h | 3 +--
sysdeps/loongarch/dl-machine.h | 2 --
sysdeps/loongarch/nptl/bits/pthreadtypes-arch.h | 2 +-
sysdeps/loongarch/sys/regdef.h | 4 +---
4 files changed, 3 insertions(+), 8 deletions(-)
diff --git a/elf/elf.h b/elf/elf.h
index 65d1fb46..4bfbad61 100644
--- a/elf/elf.h
+++ b/elf/elf.h
@@ -3933,10 +3933,9 @@ enum
#define R_NDS32_TLS_TPOFF 102
#define R_NDS32_TLS_DESC 119
-/* LoongISA ELF Flags */
+/* LoongArch ELF Flags */
#define EF_LARCH_ABI 0x0003
#define EF_LARCH_ABI_LP64 0x0003
-#define EF_LARCH_ABI_LPX32 0x0002
#define EF_LARCH_ABI_LP32 0x0001
/* Loongarch specific dynamic relocations. */
diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h
index 2d527241..6e9c6258 100644
--- a/sysdeps/loongarch/dl-machine.h
+++ b/sysdeps/loongarch/dl-machine.h
@@ -96,8 +96,6 @@ elf_machine_matches_host (const ElfW(Ehdr) *ehdr)
#ifdef _ABILP64
if ((ehdr->e_flags & EF_LARCH_ABI) != EF_LARCH_ABI_LP64)
-#elif defined _ABILPX32
- if ((ehdr->e_flags & EF_LARCH_ABI) != EF_LARCH_ABI_LPX32)
#elif defined _ABILP32
if ((ehdr->e_flags & EF_LARCH_ABI) != EF_LARCH_ABI_LP32)
#else
diff --git a/sysdeps/loongarch/nptl/bits/pthreadtypes-arch.h b/sysdeps/loongarch/nptl/bits/pthreadtypes-arch.h
index 5a761355..aa63bce1 100644
--- a/sysdeps/loongarch/nptl/bits/pthreadtypes-arch.h
+++ b/sysdeps/loongarch/nptl/bits/pthreadtypes-arch.h
@@ -32,7 +32,7 @@
# define __SIZEOF_PTHREAD_BARRIER_T 32
# define __SIZEOF_PTHREAD_BARRIERATTR_T 4
#else
-# error "rv32i-based systems are not supported"
+# error "32-bit based systems are not supported"
#endif
#define __PTHREAD_COMPAT_PADDING_MID
diff --git a/sysdeps/loongarch/sys/regdef.h b/sysdeps/loongarch/sys/regdef.h
index 769784b8..36f00939 100644
--- a/sysdeps/loongarch/sys/regdef.h
+++ b/sysdeps/loongarch/sys/regdef.h
@@ -72,10 +72,8 @@
# define fs6 $f30
# define fs7 $f31
-#elif _LOONGARCH_SIM == _ABILPX32
-# error ABILPX32
#elif _LOONGARCH_SIM == _ABILP32
-# error ABILP32
+# error ABILP32 not supported yet
#else
# error noABI
#endif
--
2.33.0

File diff suppressed because it is too large

glibc-2.38.tar.xz (new binary file, contents not shown)


@ -1,29 +0,0 @@
From 4e32231c73cb7a9b004fb160d03c26498ec3860d Mon Sep 17 00:00:00 2001
From: Zhao Hang <wb-zh951434@alibaba-inc.com>
Date: Tue, 28 May 2024 10:10:11 +0800
Subject: [PATCH] Add Hygon Support
Signed-off-by: Zhao Hang <wb-zh951434@alibaba-inc.com>
---
sysdeps/x86/cpu-features.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 1248a702..c72dc2f3 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -546,8 +546,9 @@ init_cpu_features (struct cpu_features *cpu_features)
|= bit_arch_Prefer_AVX2_STRCMP;
}
}
- /* This spells out "AuthenticAMD". */
- else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
+ /* This spells out "AuthenticAMD" or "HygonGenuine". */
+ else if ((ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)||(ebx == 0x6f677948 && ecx == 0x656e6975 && edx == 0x6e65476e))
+
{
unsigned int extended_model;
--
2.31.1
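
The magic constants in the new condition are the CPUID leaf-0 vendor string, which the CPU returns four little-endian bytes at a time in EBX, EDX, ECX (in that order). A short decoding of the Hygon values from the hunk, assuming a little-endian host:

#include <stdio.h>
#include <string.h>

int
main (void)
{
  /* The three register values compared in the patch.  */
  unsigned int ebx = 0x6f677948, edx = 0x6e65476e, ecx = 0x656e6975;
  char vendor[13];
  memcpy (vendor + 0, &ebx, 4);
  memcpy (vendor + 4, &edx, 4);
  memcpy (vendor + 8, &ecx, 4);
  vendor[12] = '\0';
  printf ("%s\n", vendor);  /* prints "HygonGenuine" */
  return 0;
}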


@ -1,147 +0,0 @@
From 58f93dff514cc0bdf3c72eff590dcf5fe5bf9e00 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Wed, 19 Jul 2023 23:09:09 +0800
Subject: [PATCH 3/6] Add a testcase to check alignment of PT_LOAD segment [BZ
#28676]
Backport from master commit: fc2334a
Signed-off-by: Rongwei Wang <rongwei.wang@linux.alibaba.com>
---
elf/Makefile | 13 ++++++++++++-
elf/tst-align3.c | 38 ++++++++++++++++++++++++++++++++++++++
elf/tst-alignmod3.c | 32 ++++++++++++++++++++++++++++++++
3 files changed, 82 insertions(+), 1 deletion(-)
create mode 100644 elf/tst-align3.c
create mode 100644 elf/tst-alignmod3.c
diff --git a/elf/Makefile b/elf/Makefile
index 634c3113..442817ca 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -331,6 +331,7 @@ tests += \
tst-addr1 \
tst-align \
tst-align2 \
+ tst-align3 \
tst-audit-tlsdesc \
tst-audit-tlsdesc-dlopen \
tst-audit1 \
@@ -466,7 +467,9 @@ endif
test-srcs = \
tst-pathopt
# tests-srcs
-
+ifeq (yes,$(have-fpie))
+tests-pie += tst-align3
+endif
selinux-enabled := $(shell cat /selinux/enforce 2> /dev/null)
ifneq ($(selinux-enabled),1)
@@ -647,6 +650,7 @@ modules-names = \
tst-absolute-zero-lib \
tst-alignmod \
tst-alignmod2 \
+ tst-alignmod3 \
tst-array2dep \
tst-array5dep \
tst-audit-tlsdesc-mod1 \
@@ -1669,6 +1673,13 @@ CFLAGS-tst-alignmod2.c += $(stack-align-test-flags)
$(objpfx)tst-align: $(libdl)
$(objpfx)tst-align.out: $(objpfx)tst-alignmod.so
$(objpfx)tst-align2: $(objpfx)tst-alignmod2.so
+$(objpfx)tst-align3: $(objpfx)tst-alignmod3.so
+ifeq (yes,$(have-fpie))
+CFLAGS-tst-align3.c += $(PIE-ccflag)
+endif
+LDFLAGS-tst-align3 += -Wl,-z,max-page-size=0x200000
+LDFLAGS-tst-alignmod3.so += -Wl,-z,max-page-size=0x200000
+$(objpfx)tst-alignmod3.so: $(libsupport)
$(objpfx)unload3: $(libdl)
$(objpfx)unload3.out: $(objpfx)unload3mod1.so $(objpfx)unload3mod2.so \
diff --git a/elf/tst-align3.c b/elf/tst-align3.c
new file mode 100644
index 00000000..ac86d623
--- /dev/null
+++ b/elf/tst-align3.c
@@ -0,0 +1,38 @@
+/* Check alignment of PT_LOAD segment in a shared library.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <support/check.h>
+#include <tst-stack-align.h>
+
+/* This should cover all possible page sizes we currently support. */
+#define ALIGN 0x200000
+
+int bar __attribute__ ((aligned (ALIGN))) = 1;
+
+extern int do_load_test (void);
+
+static int
+do_test (void)
+{
+ printf ("bar: %p\n", &bar);
+ TEST_VERIFY (is_aligned (&bar, ALIGN) == 0);
+
+ return do_load_test ();
+}
+
+#include <support/test-driver.c>
diff --git a/elf/tst-alignmod3.c b/elf/tst-alignmod3.c
new file mode 100644
index 00000000..0d33f237
--- /dev/null
+++ b/elf/tst-alignmod3.c
@@ -0,0 +1,32 @@
+/* Check alignment of PT_LOAD segment in a shared library.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <support/check.h>
+#include <tst-stack-align.h>
+
+/* This should cover all possible page sizes we currently support. */
+#define ALIGN 0x200000
+
+int foo __attribute__ ((aligned (ALIGN))) = 1;
+
+void
+do_load_test (void)
+{
+ printf ("foo: %p\n", &foo);
+ TEST_VERIFY (is_aligned (&foo, ALIGN) == 0);
+}
--
2.27.0
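
tst-align3 links with -Wl,-z,max-page-size=0x200000 so that a 2 MiB-aligned variable really lands on a 2 MiB boundary at load time. A standalone version of the same check (the link flag is still needed on ELF targets for the alignment to hold):

#include <stdint.h>
#include <stdio.h>

#define ALIGN 0x200000

/* Mirrors the bar variable in tst-align3.c.  */
static int bar __attribute__ ((aligned (ALIGN))) = 1;

int
main (void)
{
  printf ("bar: %p\n", (void *) &bar);
  return (((uintptr_t) &bar) & (ALIGN - 1)) != 0;  /* 0 when aligned */
}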


@ -1,325 +0,0 @@
From 6152628751bf13f74c9336263a9c22f29ccd8ffb Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Wed, 19 Jul 2023 23:01:53 +0800
Subject: [PATCH 1/6] Properly check stack alignment [BZ #27901]
1. Replace
if ((((uintptr_t) &_d) & (__alignof (double) - 1)) != 0)
which may be optimized out by compiler, with
int
__attribute__ ((weak, noclone, noinline))
is_aligned (void *p, int align)
{
return (((uintptr_t) p) & (align - 1)) != 0;
}
2. Add TEST_STACK_ALIGN_INIT to TEST_STACK_ALIGN.
3. Add a common TEST_STACK_ALIGN_INIT to check 16-byte stack alignment
for both i386 and x86-64.
4. Update powerpc to use TEST_STACK_ALIGN_INIT.
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
Signed-off-by: Rongwei Wang <rongwei.wang@linux.alibaba.com>
---
sysdeps/generic/tst-stack-align.h | 40 ++++++++++++++++---------
sysdeps/i386/i686/tst-stack-align.h | 44 ---------------------------
sysdeps/i386/tst-stack-align.h | 41 -------------------------
sysdeps/powerpc/tst-stack-align.h | 27 +++++------------
sysdeps/x86/tst-stack-align.h | 28 ++++++++++++++++++
sysdeps/x86_64/tst-stack-align.h | 46 -----------------------------
6 files changed, 61 insertions(+), 165 deletions(-)
delete mode 100644 sysdeps/i386/i686/tst-stack-align.h
delete mode 100644 sysdeps/i386/tst-stack-align.h
create mode 100644 sysdeps/x86/tst-stack-align.h
delete mode 100644 sysdeps/x86_64/tst-stack-align.h
diff --git a/sysdeps/generic/tst-stack-align.h b/sysdeps/generic/tst-stack-align.h
index e5cb3310..e6050901 100644
--- a/sysdeps/generic/tst-stack-align.h
+++ b/sysdeps/generic/tst-stack-align.h
@@ -1,4 +1,5 @@
-/* Copyright (C) 2003-2018 Free Software Foundation, Inc.
+/* Check stack alignment. Generic version.
+ Copyright (C) 2003-2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -18,17 +19,28 @@
#include <stdio.h>
#include <stdint.h>
+int
+__attribute__ ((weak, noclone, noinline))
+is_aligned (void *p, int align)
+{
+ return (((uintptr_t) p) & (align - 1)) != 0;
+}
+
+#ifndef TEST_STACK_ALIGN_INIT
+# define TEST_STACK_ALIGN_INIT() 0
+#endif
+
#define TEST_STACK_ALIGN() \
- ({ \
- double _d = 12.0; \
- long double _ld = 15.0; \
- int _ret = 0; \
- printf ("double: %g %p %zu\n", _d, &_d, __alignof (double)); \
- if ((((uintptr_t) &_d) & (__alignof (double) - 1)) != 0) \
- _ret = 1; \
- \
- printf ("ldouble: %Lg %p %zu\n", _ld, &_ld, __alignof (long double)); \
- if ((((uintptr_t) &_ld) & (__alignof (long double) - 1)) != 0) \
- _ret = 1; \
- _ret; \
- })
+ ({ \
+ double _d = 12.0; \
+ long double _ld = 15.0; \
+ int _ret = TEST_STACK_ALIGN_INIT (); \
+ \
+ printf ("double: %g %p %zu\n", _d, &_d, __alignof (double)); \
+ _ret += is_aligned (&_d, __alignof (double)); \
+ \
+ printf ("ldouble: %Lg %p %zu\n", _ld, &_ld, \
+ __alignof (long double)); \
+ _ret += is_aligned (&_ld, __alignof (long double)); \
+ _ret; \
+ })
diff --git a/sysdeps/i386/i686/tst-stack-align.h b/sysdeps/i386/i686/tst-stack-align.h
deleted file mode 100644
index 975f26ef..00000000
--- a/sysdeps/i386/i686/tst-stack-align.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright (C) 2003-2018 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <stdio.h>
-#include <stdint.h>
-#ifndef __SSE__
-#include_next <tst-stack-align.h>
-#else
-#include <xmmintrin.h>
-
-#define TEST_STACK_ALIGN() \
- ({ \
- __m128 _m; \
- double _d = 12.0; \
- long double _ld = 15.0; \
- int _ret = 0; \
- printf ("__m128: %p %zu\n", &_m, __alignof (__m128)); \
- if ((((uintptr_t) &_m) & (__alignof (__m128) - 1)) != 0) \
- _ret = 1; \
- \
- printf ("double: %g %p %zu\n", _d, &_d, __alignof (double)); \
- if ((((uintptr_t) &_d) & (__alignof (double) - 1)) != 0) \
- _ret = 1; \
- \
- printf ("ldouble: %Lg %p %zu\n", _ld, &_ld, __alignof (long double)); \
- if ((((uintptr_t) &_ld) & (__alignof (long double) - 1)) != 0) \
- _ret = 1; \
- _ret; \
- })
-#endif
diff --git a/sysdeps/i386/tst-stack-align.h b/sysdeps/i386/tst-stack-align.h
deleted file mode 100644
index 394ff773..00000000
--- a/sysdeps/i386/tst-stack-align.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (C) 2004-2018 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <stdio.h>
-#include <stdint.h>
-
-typedef struct { int i[4]; } int_al16 __attribute__((aligned (16)));
-
-#define TEST_STACK_ALIGN() \
- ({ \
- int_al16 _m; \
- double _d = 12.0; \
- long double _ld = 15.0; \
- int _ret = 0; \
- printf ("int_al16: %p %zu\n", &_m, __alignof (int_al16)); \
- if ((((uintptr_t) &_m) & (__alignof (int_al16) - 1)) != 0) \
- _ret = 1; \
- \
- printf ("double: %g %p %zu\n", _d, &_d, __alignof (double)); \
- if ((((uintptr_t) &_d) & (__alignof (double) - 1)) != 0) \
- _ret = 1; \
- \
- printf ("ldouble: %Lg %p %zu\n", _ld, &_ld, __alignof (long double)); \
- if ((((uintptr_t) &_ld) & (__alignof (long double) - 1)) != 0) \
- _ret = 1; \
- _ret; \
- })
diff --git a/sysdeps/powerpc/tst-stack-align.h b/sysdeps/powerpc/tst-stack-align.h
index 7fd7013b..d7400b28 100644
--- a/sysdeps/powerpc/tst-stack-align.h
+++ b/sysdeps/powerpc/tst-stack-align.h
@@ -1,4 +1,5 @@
-/* Copyright (C) 2005-2018 Free Software Foundation, Inc.
+/* Check stack alignment. PowerPC version.
+ Copyright (C) 2005-2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -15,10 +16,7 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#include <stdio.h>
-#include <stdint.h>
-
-#define TEST_STACK_ALIGN() \
+#define TEST_STACK_ALIGN_INIT() \
({ \
/* Altivec __vector int etc. needs 16byte aligned stack. \
Instead of using altivec.h here, use aligned attribute instead. */ \
@@ -27,20 +25,9 @@
int _i __attribute__((aligned (16))); \
int _j[3]; \
} _s = { ._i = 18, ._j[0] = 19, ._j[1] = 20, ._j[2] = 21 }; \
- double _d = 12.0; \
- long double _ld = 15.0; \
- int _ret = 0; \
printf ("__vector int: { %d, %d, %d, %d } %p %zu\n", _s._i, _s._j[0], \
_s._j[1], _s._j[2], &_s, __alignof (_s)); \
- if ((((uintptr_t) &_s) & (__alignof (_s) - 1)) != 0) \
- _ret = 1; \
- \
- printf ("double: %g %p %zu\n", _d, &_d, __alignof (double)); \
- if ((((uintptr_t) &_d) & (__alignof (double) - 1)) != 0) \
- _ret = 1; \
- \
- printf ("ldouble: %Lg %p %zu\n", _ld, &_ld, __alignof (long double)); \
- if ((((uintptr_t) &_ld) & (__alignof (long double) - 1)) != 0) \
- _ret = 1; \
- _ret; \
- })
+ is_aligned (&_s, __alignof (_s)); \
+ })
+
+#include_next <tst-stack-align.h>
diff --git a/sysdeps/x86/tst-stack-align.h b/sysdeps/x86/tst-stack-align.h
new file mode 100644
index 00000000..02ecc72d
--- /dev/null
+++ b/sysdeps/x86/tst-stack-align.h
@@ -0,0 +1,28 @@
+/* Check stack alignment. X86 version.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+typedef struct { int i[16]; } int_al16 __attribute__((aligned (16)));
+
+#define TEST_STACK_ALIGN_INIT() \
+ ({ \
+ int_al16 _m; \
+ printf ("int_al16: %p %zu\n", &_m, __alignof (int_al16)); \
+ is_aligned (&_m, __alignof (int_al16)); \
+ })
+
+#include_next <tst-stack-align.h>
diff --git a/sysdeps/x86_64/tst-stack-align.h b/sysdeps/x86_64/tst-stack-align.h
deleted file mode 100644
index b2ef77f6..00000000
--- a/sysdeps/x86_64/tst-stack-align.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright (C) 2003-2018 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <stdio.h>
-#include <stdint.h>
-
-#define TEST_STACK_ALIGN() \
- ({ \
- /* AMD64 ABI mandates 16byte aligned stack. \
- Unfortunately, current GCC doesn't support __int128 or __float128 \
- types, so use aligned attribute instead. */ \
- struct _S \
- { \
- int _i __attribute__((aligned (16))); \
- int _pad[3]; \
- } _s = { ._i = 18 }; \
- double _d = 12.0; \
- long double _ld = 15.0; \
- int _ret = 0; \
- printf ("__int128: %d %p %zu\n", _s._i, &_s, __alignof (_s)); \
- if ((((uintptr_t) &_s) & (__alignof (_s) - 1)) != 0) \
- _ret = 1; \
- \
- printf ("double: %g %p %zu\n", _d, &_d, __alignof (double)); \
- if ((((uintptr_t) &_d) & (__alignof (double) - 1)) != 0) \
- _ret = 1; \
- \
- printf ("ldouble: %Lg %p %zu\n", _ld, &_ld, __alignof (long double)); \
- if ((((uintptr_t) &_ld) & (__alignof (long double) - 1)) != 0) \
- _ret = 1; \
- _ret; \
- })
--
2.27.0
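
The heart of the fix is a weak, noinline predicate that the compiler cannot constant-fold away, unlike the inline bit-test it replaces. A self-contained sketch built around the function the patch adds:

#include <stdint.h>
#include <stdio.h>

/* Same shape as the patch's helper: returns 0 when P is ALIGN-aligned.  */
int
__attribute__ ((weak, noclone, noinline))
is_aligned (void *p, int align)
{
  return (((uintptr_t) p) & (align - 1)) != 0;
}

int
main (void)
{
  double d = 12.0;
  printf ("double: %g %p %zu\n", d, (void *) &d, __alignof (double));
  return is_aligned (&d, __alignof (double));
}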


@ -1,112 +0,0 @@
commit 849274d48fc59bfa6db3c713c8ced8026b20f3b7
Author: Florian Weimer <fweimer@redhat.com>
Date: Thu Nov 16 19:55:35 2023 +0100
elf: Fix force_first handling in dlclose (bug 30981)
The force_first parameter was ineffective because the dlclose'd
object was not necessarily the first in the maps array. Also
enable force_first handling unconditionally, regardless of namespace.
The initial object in a namespace should be destructed first, too.
The _dl_sort_maps_dfs function had early returns for relocation
dependency processing which broke force_first handling, too, and
this is fixed in this change as well.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
diff --git a/elf/dl-close.c b/elf/dl-close.c
index 66524b6708c59f29..8107c2d5f6ad2bc6 100644
--- a/elf/dl-close.c
+++ b/elf/dl-close.c
@@ -182,6 +182,16 @@ _dl_close_worker (struct link_map *map, bool force)
}
assert (idx == nloaded);
+ /* Put the dlclose'd map first, so that its destructor runs first.
+ The map variable is NULL after a retry. */
+ if (map != NULL)
+ {
+ maps[map->l_idx] = maps[0];
+ maps[map->l_idx]->l_idx = map->l_idx;
+ maps[0] = map;
+ maps[0]->l_idx = 0;
+ }
+
/* Keep track of the lowest index link map we have covered already. */
int done_index = -1;
while (++done_index < nloaded)
@@ -255,9 +265,10 @@ _dl_close_worker (struct link_map *map, bool force)
}
}
- /* Sort the entries. We can skip looking for the binary itself which is
- at the front of the search list for the main namespace. */
- _dl_sort_maps (maps, nloaded, (nsid == LM_ID_BASE), true);
+ /* Sort the entries. Unless retrying, the maps[0] object (the
+ original argument to dlclose) needs to remain first, so that its
+ destructor runs first. */
+ _dl_sort_maps (maps, nloaded, /* force_first */ map != NULL, true);
/* Call all termination functions at once. */
bool unload_any = false;
@@ -768,7 +779,11 @@ _dl_close_worker (struct link_map *map, bool force)
/* Recheck if we need to retry, release the lock. */
out:
if (dl_close_state == rerun)
- goto retry;
+ {
+ /* The map may have been deallocated. */
+ map = NULL;
+ goto retry;
+ }
dl_close_state = not_pending;
}
diff --git a/elf/dl-sort-maps.c b/elf/dl-sort-maps.c
index aeb79b40b45054c0..c17ac325eca658ef 100644
--- a/elf/dl-sort-maps.c
+++ b/elf/dl-sort-maps.c
@@ -260,13 +260,12 @@ _dl_sort_maps_dfs (struct link_map **maps, unsigned int nmaps,
The below memcpy is not needed in the do_reldeps case here,
since we wrote back to maps[] during DFS traversal. */
if (maps_head == maps)
- return;
+ break;
}
assert (maps_head == maps);
- return;
}
-
- memcpy (maps, rpo, sizeof (struct link_map *) * nmaps);
+ else
+ memcpy (maps, rpo, sizeof (struct link_map *) * nmaps);
/* Skipping the first object at maps[0] is not valid in general,
since traversing along object dependency-links may "find" that
diff --git a/elf/dso-sort-tests-1.def b/elf/dso-sort-tests-1.def
index 4bf9052db16fb352..cf6453e9eb85ac65 100644
--- a/elf/dso-sort-tests-1.def
+++ b/elf/dso-sort-tests-1.def
@@ -56,14 +56,16 @@ output: b>a>{}<a<b
# relocation(dynamic) dependencies. While this is technically unspecified, the
# presumed reasonable practical behavior is for the destructor order to respect
# the static DT_NEEDED links (here this means the a->b->c->d order).
-# The older dynamic_sort=1 algorithm does not achieve this, while the DFS-based
-# dynamic_sort=2 algorithm does, although it is still arguable whether going
-# beyond spec to do this is the right thing to do.
+# The older dynamic_sort=1 algorithm originally did not achieve this,
+# but this was a bug in the way _dl_sort_maps was called from _dl_close_worker,
+# effectively disabling proper force_first handling.
+# The new dynamic_sort=2 algorithm shows the effect of the simpler force_first
+# handling: the a object is simply moved to the front.
# The below expected outputs are what the two algorithms currently produce
# respectively, for regression testing purposes.
tst-bz15311: {+a;+e;+f;+g;+d;%d;-d;-g;-f;-e;-a};a->b->c->d;d=>[ba];c=>a;b=>e=>a;c=>f=>b;d=>g=>c
-output(glibc.rtld.dynamic_sort=1): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<a<c<d<g<f<b<e];}
-output(glibc.rtld.dynamic_sort=2): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<g<f<a<b<c<d<e];}
+output(glibc.rtld.dynamic_sort=1): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<a<b<c<d<g<f<e];}
+output(glibc.rtld.dynamic_sort=2): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<a<g<f<b<c<d<e];}
# Test that even in the presence of dependency loops involving dlopen'ed
# object, that object is initialized last (and not unloaded prematurely).
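
The maps[] fixup above swaps the dlclose'd object to index 0 while keeping every map's recorded l_idx consistent with its slot. A toy demonstration of that swap with made-up map names:

#include <stdio.h>

struct link_map { int l_idx; const char *name; };

int
main (void)
{
  struct link_map a = { 0, "app" }, b = { 1, "libfoo" }, c = { 2, "libbar" };
  struct link_map *maps[] = { &a, &b, &c };
  struct link_map *map = &c;   /* the object passed to dlclose */

  /* Same four statements as the patch.  */
  maps[map->l_idx] = maps[0];
  maps[map->l_idx]->l_idx = map->l_idx;
  maps[0] = map;
  maps[0]->l_idx = 0;

  for (int i = 0; i < 3; ++i)
    printf ("%d: %s (l_idx=%d)\n", i, maps[i]->name, maps[i]->l_idx);
  return 0;  /* libbar now sorts (and destructs) first */
}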


@ -1,83 +0,0 @@
commit c00b984fcd53f679ca2dafcd1aee2c89836e6e73
Author: Florian Weimer <fweimer@redhat.com>
Date: Tue Aug 29 08:28:31 2023 +0200
nscd: Skip unusable entries in first pass in prune_cache (bug 30800)
Previously, if an entry was marked unusable for any reason, but had
not timed out yet, the assert would trigger.
One way to get into such state is if a data change is detected during
re-validation of an entry. This causes the entry to be marked as not
usable. If nscd exits soon after that, the clock jumps backwards, and
nscd is restarted, then the cache re-validation run after startup
triggers the removed assert.
The change is more complicated than just the removal of the assert
because entries marked as not usable should be garbage-collected in
the second pass. To make this happen, it is necessary to update some
book-keeping data.
Reviewed-by: DJ Delorie <dj@redhat.com>
diff --git a/nscd/cache.c b/nscd/cache.c
index efe4214d953edb30..2fd3f78ebb567bbe 100644
--- a/nscd/cache.c
+++ b/nscd/cache.c
@@ -371,8 +371,11 @@ prune_cache (struct database_dyn *table, time_t now, int fd)
serv2str[runp->type], str, dh->timeout);
}
- /* Check whether the entry timed out. */
- if (dh->timeout < now)
+ /* Check whether the entry timed out. Timed out entries
+ will be revalidated. For unusable records, it is still
+ necessary to record that the bucket needs to be scanned
+ again below. */
+ if (dh->timeout < now || !dh->usable)
{
/* This hash bucket could contain entries which need to
be looked at. */
@@ -384,7 +387,7 @@ prune_cache (struct database_dyn *table, time_t now, int fd)
/* We only have to look at the data of the first entries
since the count information is kept in the data part
which is shared. */
- if (runp->first)
+ if (runp->first && dh->usable)
{
/* At this point there are two choices: we reload the
@@ -400,9 +403,6 @@ prune_cache (struct database_dyn *table, time_t now, int fd)
{
/* Remove the value. */
dh->usable = false;
-
- /* We definitely have some garbage entries now. */
- any = true;
}
else
{
@@ -414,18 +414,15 @@ prune_cache (struct database_dyn *table, time_t now, int fd)
time_t timeout = readdfcts[runp->type] (table, runp, dh);
next_timeout = MIN (next_timeout, timeout);
-
- /* If the entry has been replaced, we might need
- cleanup. */
- any |= !dh->usable;
}
}
+
+ /* If the entry has been replaced, we might need cleanup. */
+ any |= !dh->usable;
}
else
- {
- assert (dh->usable);
- next_timeout = MIN (next_timeout, dh->timeout);
- }
+ /* Entry has not timed out and is usable. */
+ next_timeout = MIN (next_timeout, dh->timeout);
run = runp->next;
}
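
The reworked first pass flags a bucket for the second, garbage-collecting pass whenever an entry is unusable, even if it has not timed out yet. A toy two-pass sweep showing just that bookkeeping (the data values are made up):

#include <stdbool.h>
#include <stdio.h>

struct entry { int timeout; bool usable; };

int
main (void)
{
  /* Entry 1 is unusable but has NOT timed out: the case that used to
     trip the assert.  */
  struct entry cache[] = { { 100, true }, { 100, false }, { 200, true } };
  int now = 60;
  bool any = false;

  /* Pass 1: an entry needs attention if it timed out OR is unusable.  */
  for (unsigned i = 0; i < 3; ++i)
    if (cache[i].timeout < now || !cache[i].usable)
      any |= !cache[i].usable;

  /* Pass 2 (the actual collection) runs only if something was flagged.  */
  printf ("gc needed: %d\n", (int) any);  /* 1: entry 1 is unusable */
  return 0;
}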


@ -1,72 +0,0 @@
commit 2aa0974d2573441bffd596b07bff8698b1f2f18c
Author: Florian Weimer <fweimer@redhat.com>
Date: Fri Oct 20 14:29:50 2023 +0200
elf: ldconfig should skip temporary files created by package managers
This avoids crashes due to partially written files, after a package
update is interrupted.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Conflicts:
elf/ldconfig.c
(missing alloca removal downstream)
diff --git a/elf/ldconfig.c b/elf/ldconfig.c
index 8c66d7e5426d8cc4..51de08f91fbaf093 100644
--- a/elf/ldconfig.c
+++ b/elf/ldconfig.c
@@ -771,6 +771,31 @@ struct dlib_entry
struct dlib_entry *next;
};
+/* Skip some temporary DSO files. These files may be partially written
+ and lead to ldconfig crashes when examined. */
+static bool
+skip_dso_based_on_name (const char *name, size_t len)
+{
+ /* Skip temporary files created by the prelink program. Files with
+ names like these are never really DSOs we want to look at. */
+ if (len >= sizeof (".#prelink#") - 1)
+ {
+ if (strcmp (name + len - sizeof (".#prelink#") + 1,
+ ".#prelink#") == 0)
+ return true;
+ if (len >= sizeof (".#prelink#.XXXXXX") - 1
+ && memcmp (name + len - sizeof (".#prelink#.XXXXXX")
+ + 1, ".#prelink#.", sizeof (".#prelink#.") - 1) == 0)
+ return true;
+ }
+ /* Skip temporary files created by RPM. */
+ if (memchr (name, len, ';') != NULL)
+ return true;
+ /* Skip temporary files created by dpkg. */
+ if (len > 4 && memcmp (name + len - 4, ".tmp", 4) == 0)
+ return true;
+ return false;
+}
static void
search_dir (const struct dir_entry *entry)
@@ -849,18 +874,8 @@ search_dir (const struct dir_entry *entry)
continue;
size_t len = strlen (direntry->d_name);
- /* Skip temporary files created by the prelink program. Files with
- names like these are never really DSOs we want to look at. */
- if (len >= sizeof (".#prelink#") - 1)
- {
- if (strcmp (direntry->d_name + len - sizeof (".#prelink#") + 1,
- ".#prelink#") == 0)
- continue;
- if (len >= sizeof (".#prelink#.XXXXXX") - 1
- && memcmp (direntry->d_name + len - sizeof (".#prelink#.XXXXXX")
- + 1, ".#prelink#.", sizeof (".#prelink#.") - 1) == 0)
- continue;
- }
+ if (skip_dso_based_on_name (direntry->d_name, len))
+ continue;
len += strlen (entry->path) + 2;
if (len > file_name_len)
{


@ -1,61 +0,0 @@
commit cfb5a97a93ea656e3b2263e42142a4032986d9ba
Author: Florian Weimer <fweimer@redhat.com>
Date: Mon Oct 23 12:53:16 2023 +0200
ldconfig: Fixes for skipping temporary files.
Arguments to a memchr call were swapped, causing incorrect skipping
of files.
Files related to dpkg have different names: they actually end in
.dpkg-new and .dpkg-tmp, not .tmp as I mistakenly assumed.
Fixes commit 2aa0974d2573441bffd59 ("elf: ldconfig should skip
temporary files created by package managers").
diff --git a/elf/ldconfig.c b/elf/ldconfig.c
index 51de08f91fbaf093..fb19dd68d41c07a4 100644
--- a/elf/ldconfig.c
+++ b/elf/ldconfig.c
@@ -771,6 +771,17 @@ struct dlib_entry
struct dlib_entry *next;
};
+/* Return true if the N bytes at NAME end with the characters in
+ the string SUFFIX. (NAME[N + 1] does not have to be a null byte.)
+ Expected to be called with a string literal for SUFFIX. */
+static inline bool
+endswithn (const char *name, size_t n, const char *suffix)
+{
+ return (n >= strlen (suffix)
+ && memcmp (name + n - strlen (suffix), suffix,
+ strlen (suffix)) == 0);
+}
+
/* Skip some temporary DSO files. These files may be partially written
and lead to ldconfig crashes when examined. */
static bool
@@ -780,8 +791,7 @@ skip_dso_based_on_name (const char *name, size_t len)
names like these are never really DSOs we want to look at. */
if (len >= sizeof (".#prelink#") - 1)
{
- if (strcmp (name + len - sizeof (".#prelink#") + 1,
- ".#prelink#") == 0)
+ if (endswithn (name, len, ".#prelink#"))
return true;
if (len >= sizeof (".#prelink#.XXXXXX") - 1
&& memcmp (name + len - sizeof (".#prelink#.XXXXXX")
@@ -789,10 +799,11 @@ skip_dso_based_on_name (const char *name, size_t len)
return true;
}
/* Skip temporary files created by RPM. */
- if (memchr (name, len, ';') != NULL)
+ if (memchr (name, ';', len) != NULL)
return true;
/* Skip temporary files created by dpkg. */
- if (len > 4 && memcmp (name + len - 4, ".tmp", 4) == 0)
+ if (endswithn (name, len, ".dpkg-new")
+ || endswithn (name, len, ".dpkg-tmp"))
return true;
return false;
}
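
After the fix, suffix tests go through endswithn and memchr gets its arguments in the right order (the character second, the length third). A hedged sketch exercising both checks on a made-up RPM-style file name:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Same shape as the helper the patch adds to ldconfig.c.  */
static bool
endswithn (const char *name, size_t n, const char *suffix)
{
  return (n >= strlen (suffix)
          && memcmp (name + n - strlen (suffix), suffix,
                     strlen (suffix)) == 0);
}

int
main (void)
{
  const char *name = "libfoo.so.1;5c9f2e";  /* made-up RPM temp name */
  size_t len = strlen (name);
  printf ("rpm temp: %d\n", memchr (name, ';', len) != NULL);      /* 1 */
  printf ("dpkg temp: %d\n", endswithn (name, len, ".dpkg-new"));  /* 0 */
  return 0;
}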


@ -1,259 +0,0 @@
From 97700a34f36721b11a754cf37a1cc40695ece1fd Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 21 Jan 2019 11:23:59 -0800
Subject: [PATCH] x86-64 memchr/wmemchr: Properly handle the length parameter
[BZ# 24097]
Content-type: text/plain; charset=UTF-8
On x32, the size_t parameter may be passed in the lower 32 bits of a
64-bit register with the non-zero upper 32 bits. The string/memory
functions written in assembly can only use the lower 32 bits of a
64-bit register as length or must clear the upper 32 bits before using
the full 64-bit register for length.
This patch fixes memchr/wmemchr for x32. Tested on x86-64 and x32. On
x86-64, libc.so is the same with and without the fix.
[BZ# 24097]
CVE-2019-6488
* sysdeps/x86_64/memchr.S: Use RDX_LP for length. Clear the
upper 32 bits of RDX register.
* sysdeps/x86_64/multiarch/memchr-avx2.S: Likewise.
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memchr and
tst-size_t-wmemchr.
* sysdeps/x86_64/x32/test-size_t.h: New file.
* sysdeps/x86_64/x32/tst-size_t-memchr.c: Likewise.
* sysdeps/x86_64/x32/tst-size_t-wmemchr.c: Likewise.
---
sysdeps/x86_64/memchr.S | 10 ++--
sysdeps/x86_64/multiarch/memchr-avx2.S | 8 ++-
sysdeps/x86_64/x32/Makefile | 8 +++
sysdeps/x86_64/x32/test-size_t.h | 35 ++++++++++++
sysdeps/x86_64/x32/tst-size_t-memchr.c | 72 +++++++++++++++++++++++++
sysdeps/x86_64/x32/tst-size_t-wmemchr.c | 20 +++++++
6 files changed, 148 insertions(+), 5 deletions(-)
create mode 100644 sysdeps/x86_64/x32/test-size_t.h
create mode 100644 sysdeps/x86_64/x32/tst-size_t-memchr.c
create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemchr.c
Conflicts:
ChangeLog
(removed)
NEWS
(removed)
diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
index feef5d4f..cb320257 100644
--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
@@ -34,12 +34,16 @@ ENTRY(MEMCHR)
mov %edi, %ecx
#ifdef USE_AS_WMEMCHR
- test %rdx, %rdx
+ test %RDX_LP, %RDX_LP
jz L(return_null)
- shl $2, %rdx
+ shl $2, %RDX_LP
#else
+# ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %edx, %edx
+# endif
punpcklbw %xmm1, %xmm1
- test %rdx, %rdx
+ test %RDX_LP, %RDX_LP
jz L(return_null)
punpcklbw %xmm1, %xmm1
#endif
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index 5f5e7725..c81da19b 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -40,16 +40,20 @@
ENTRY (MEMCHR)
# ifndef USE_AS_RAWMEMCHR
/* Check for zero length. */
- testq %rdx, %rdx
+ test %RDX_LP, %RDX_LP
jz L(null)
# endif
movl %edi, %ecx
/* Broadcast CHAR to YMM0. */
vmovd %esi, %xmm0
# ifdef USE_AS_WMEMCHR
- shl $2, %rdx
+ shl $2, %RDX_LP
vpbroadcastd %xmm0, %ymm0
# else
+# ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %edx, %edx
+# endif
vpbroadcastb %xmm0, %ymm0
# endif
/* Check if we may cross page boundary with one vector load. */
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
index f2ebc24f..7d528889 100644
--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
@@ -4,3 +4,11 @@ ifeq ($(subdir),math)
# 64-bit llround. Add -fno-builtin-lround to silence the compiler.
CFLAGS-s_llround.c += -fno-builtin-lround
endif
+
+ifeq ($(subdir),string)
+tests += tst-size_t-memchr
+endif
+
+ifeq ($(subdir),wcsmbs)
+tests += tst-size_t-wmemchr
+endif
diff --git a/sysdeps/x86_64/x32/test-size_t.h b/sysdeps/x86_64/x32/test-size_t.h
new file mode 100644
index 00000000..78a94086
--- /dev/null
+++ b/sysdeps/x86_64/x32/test-size_t.h
@@ -0,0 +1,35 @@
+/* Test string/memory functions with size_t in the lower 32 bits of
+ 64-bit register.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define TEST_MAIN
+#include <string/test-string.h>
+
+/* On x32, parameter_t may be passed in a 64-bit register with the LEN
+ field in the lower 32 bits. When the LEN field of a 64-bit register
+ is passed to a string/memory function as the size_t parameter, only
+ the lower 32 bits can be used. */
+typedef struct
+{
+ union
+ {
+ size_t len;
+ void (*fn) (void);
+ };
+ void *p;
+} parameter_t;
diff --git a/sysdeps/x86_64/x32/tst-size_t-memchr.c b/sysdeps/x86_64/x32/tst-size_t-memchr.c
new file mode 100644
index 00000000..29a3daf1
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-memchr.c
@@ -0,0 +1,72 @@
+/* Test memchr with size_t in the lower 32 bits of 64-bit register.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef WIDE
+# define TEST_NAME "memchr"
+#else
+# define TEST_NAME "wmemchr"
+#endif /* WIDE */
+#include "test-size_t.h"
+
+#ifndef WIDE
+# define MEMCHR memchr
+# define CHAR char
+# define UCHAR unsigned char
+#else
+# include <wchar.h>
+# define MEMCHR wmemchr
+# define CHAR wchar_t
+# define UCHAR wchar_t
+#endif /* WIDE */
+
+IMPL (MEMCHR, 1)
+
+typedef CHAR * (*proto_t) (const CHAR*, int, size_t);
+
+static CHAR *
+__attribute__ ((noinline, noclone))
+do_memchr (parameter_t a, parameter_t b)
+{
+ return CALL (&b, a.p, (uintptr_t) b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+ test_init ();
+
+ parameter_t src = { { page_size / sizeof (CHAR) }, buf2 };
+ parameter_t c = { { 0 }, (void *) (uintptr_t) 0x12 };
+
+ int ret = 0;
+ FOR_EACH_IMPL (impl, 0)
+ {
+ c.fn = impl->fn;
+ CHAR *res = do_memchr (src, c);
+ if (res)
+ {
+ error (0, 0, "Wrong result in function %s: %p != NULL",
+ impl->name, res);
+ ret = 1;
+ }
+ }
+
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemchr.c b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c
new file mode 100644
index 00000000..877801d6
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c
@@ -0,0 +1,20 @@
+/* Test wmemchr with size_t in the lower 32 bits of 64-bit register.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define WIDE 1
+#include "tst-size_t-memchr.c"
--
GitLab

View file

@ -1,41 +0,0 @@
From ddf0992cf57a93200e0c782e2a94d0733a5a0b87 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Sun, 9 Jan 2022 16:02:21 -0600
Subject: [PATCH] x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755]
Content-type: text/plain; charset=UTF-8
Fixes [BZ# 28755] for wcsncmp by redirecting length >= 2^56 to
__wcscmp_avx2. For x86_64 this covers the entire address range, so any
larger length could not possibly be used to bound `s1` or `s2`.
test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
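In C terms, the guard added below amounts to roughly the following (a sketch only; the actual change is the assembly hunk that follows):

```c
#include <wchar.h>

/* Sketch of the added guard: a length with any of the top 8 bits set
   (>= 2^56) cannot bound a real object once multiplied by
   sizeof (wchar_t), so fall back to the unbounded comparison.
   Mirrors `shrq $56, %rcx; jnz __wcscmp_avx2`.  */
static int
wcsncmp_guarded (const wchar_t *s1, const wchar_t *s2, size_t n)
{
  if (n >> 56)
    return wcscmp (s1, s2);
  return wcsncmp (s1, s2, n);
}
```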
---
sysdeps/x86_64/multiarch/strcmp-avx2.S | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index 156c1949..8fb8eedc 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -83,6 +83,16 @@ ENTRY (STRCMP)
je L(char0)
jb L(zero)
# ifdef USE_AS_WCSCMP
+# ifndef __ILP32__
+ movq %rdx, %rcx
+ /* Check if length could overflow when multiplied by
+ sizeof(wchar_t). Checking the top 8 bits will cover all potential
+ overflow cases as well as redirect cases where it is impossible for
+ the length to bound a valid memory region. In these cases just use
+ 'wcscmp'. */
+ shrq $56, %rcx
+ jnz __wcscmp_avx2
+# endif
/* Convert units: from wide to byte char. */
shl $2, %RDX_LP
# endif
--
GitLab

View file

@ -1,257 +0,0 @@
From 244b415d386487521882debb845a040a4758cb18 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 25 Mar 2022 17:13:33 -0500
Subject: [PATCH] x86: Small improvements for wcslen
Content-type: text/plain; charset=UTF-8
Just a few QOL changes.
1. Prefer `add` over `lea`, as `add` can run on more execution
units.
2. Don't break macro-fusion between `test` and `jcc`.
3. Reduce code size by removing gratuitous padding bytes (-90
bytes).
geometric_mean(N=20) of all benchmarks New / Original: 0.959
All string/memory tests pass.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/wcslen.S | 86 ++++++++++++++++++++---------------------
1 file changed, 41 insertions(+), 45 deletions(-)
diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S
index 9f5f7232..254bb030 100644
--- a/sysdeps/x86_64/wcslen.S
+++ b/sysdeps/x86_64/wcslen.S
@@ -41,82 +41,82 @@ ENTRY (__wcslen)
pxor %xmm0, %xmm0
lea 32(%rdi), %rax
- lea 16(%rdi), %rcx
+ addq $16, %rdi
and $-16, %rax
pcmpeqd (%rax), %xmm0
pmovmskb %xmm0, %edx
pxor %xmm1, %xmm1
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm1
pmovmskb %xmm1, %edx
pxor %xmm2, %xmm2
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm2
pmovmskb %xmm2, %edx
pxor %xmm3, %xmm3
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm3
pmovmskb %xmm3, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm0
pmovmskb %xmm0, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm1
pmovmskb %xmm1, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm2
pmovmskb %xmm2, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm3
pmovmskb %xmm3, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm0
pmovmskb %xmm0, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm1
pmovmskb %xmm1, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm2
pmovmskb %xmm2, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm3
pmovmskb %xmm3, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
and $-0x40, %rax
@@ -133,104 +133,100 @@ L(aligned_64_loop):
pminub %xmm0, %xmm2
pcmpeqd %xmm3, %xmm2
pmovmskb %xmm2, %edx
+ addq $64, %rax
test %edx, %edx
- lea 64(%rax), %rax
jz L(aligned_64_loop)
pcmpeqd -64(%rax), %xmm3
pmovmskb %xmm3, %edx
+ addq $48, %rdi
test %edx, %edx
- lea 48(%rcx), %rcx
jnz L(exit)
pcmpeqd %xmm1, %xmm3
pmovmskb %xmm3, %edx
+ addq $-16, %rdi
test %edx, %edx
- lea -16(%rcx), %rcx
jnz L(exit)
pcmpeqd -32(%rax), %xmm3
pmovmskb %xmm3, %edx
+ addq $-16, %rdi
test %edx, %edx
- lea -16(%rcx), %rcx
jnz L(exit)
pcmpeqd %xmm6, %xmm3
pmovmskb %xmm3, %edx
+ addq $-16, %rdi
test %edx, %edx
- lea -16(%rcx), %rcx
- jnz L(exit)
-
- jmp L(aligned_64_loop)
+ jz L(aligned_64_loop)
.p2align 4
L(exit):
- sub %rcx, %rax
+ sub %rdi, %rax
shr $2, %rax
test %dl, %dl
jz L(exit_high)
- mov %dl, %cl
- and $15, %cl
+ andl $15, %edx
jz L(exit_1)
ret
- .p2align 4
+ /* No align here. Naturally aligned % 16 == 1. */
L(exit_high):
- mov %dh, %ch
- and $15, %ch
+ andl $(15 << 8), %edx
jz L(exit_3)
add $2, %rax
ret
- .p2align 4
+ .p2align 3
L(exit_1):
add $1, %rax
ret
- .p2align 4
+ .p2align 3
L(exit_3):
add $3, %rax
ret
- .p2align 4
+ .p2align 3
L(exit_tail0):
- xor %rax, %rax
+ xorl %eax, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail1):
- mov $1, %rax
+ movl $1, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail2):
- mov $2, %rax
+ movl $2, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail3):
- mov $3, %rax
+ movl $3, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail4):
- mov $4, %rax
+ movl $4, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail5):
- mov $5, %rax
+ movl $5, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail6):
- mov $6, %rax
+ movl $6, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail7):
- mov $7, %rax
+ movl $7, %eax
ret
END (__wcslen)
--
GitLab

View file

@ -1,964 +0,0 @@
From 7cbc03d03091d5664060924789afe46d30a5477e Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 15 Apr 2022 12:28:00 -0500
Subject: [PATCH] x86: Remove memcmp-sse4.S
Content-type: text/plain; charset=UTF-8
Code didn't actually use any sse4 instructions since `ptest` was
removed in:
commit 2f9062d7171850451e6044ef78d91ff8c017b9c0
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed Nov 10 16:18:56 2021 -0600
x86: Shrink memcmp-sse4.S code size
The new memcmp-sse2 implementation is also faster.
geometric_mean(N=20) of page cross cases SSE2 / SSE4: 0.905
Note there are two regressions preferring SSE2 for Size = 1 and Size =
65.
Size = 1:
size, align0, align1, ret, New Time/Old Time
1, 1, 1, 0, 1.2
1, 1, 1, 1, 1.197
1, 1, 1, -1, 1.2
This is intentional. Size == 1 is significantly less hot based on
profiles of GCC11 and Python3 than sizes [4, 8] (which are made
hotter).
Python3 Size = 1 -> 13.64%
Python3 Size = [4, 8] -> 60.92%
GCC11 Size = 1 -> 1.29%
GCC11 Size = [4, 8] -> 33.86%
size, align0, align1, ret, New Time/Old Time
4, 4, 4, 0, 0.622
4, 4, 4, 1, 0.797
4, 4, 4, -1, 0.805
5, 5, 5, 0, 0.623
5, 5, 5, 1, 0.777
5, 5, 5, -1, 0.802
6, 6, 6, 0, 0.625
6, 6, 6, 1, 0.813
6, 6, 6, -1, 0.788
7, 7, 7, 0, 0.625
7, 7, 7, 1, 0.799
7, 7, 7, -1, 0.795
8, 8, 8, 0, 0.625
8, 8, 8, 1, 0.848
8, 8, 8, -1, 0.914
9, 9, 9, 0, 0.625
Size = 65:
size, align0, align1, ret, New Time/Old Time
65, 0, 0, 0, 1.103
65, 0, 0, 1, 1.216
65, 0, 0, -1, 1.227
65, 65, 0, 0, 1.091
65, 0, 65, 1, 1.19
65, 65, 65, -1, 1.215
This is because A) the checks in range [65, 96] are now unrolled 2x
and B) because smaller values <= 16 are now given a hotter path. By
contrast the SSE4 version has a branch for Size = 80. The unrolled
version gets better performance for returns which need both
comparisons.
size, align0, align1, ret, New Time/Old Time
128, 4, 8, 0, 0.858
128, 4, 8, 1, 0.879
128, 4, 8, -1, 0.888
As well, outside of microbenchmark environments, which are not fully
predictable, the branch will have a real cost.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/Makefile | 2 -
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 -
sysdeps/x86_64/multiarch/ifunc-memcmp.h | 4 -
sysdeps/x86_64/multiarch/memcmp-sse4.S | 804 ---------------------
4 files changed, 814 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/memcmp-sse4.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index bca82e38..b503e4b8 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -11,7 +11,6 @@ sysdep_routines += \
memcmp-avx2-movbe-rtm \
memcmp-evex-movbe \
memcmp-sse2 \
- memcmp-sse4 \
memcmp-ssse3 \
memcpy-ssse3 \
memcpy-ssse3-back \
@@ -174,7 +173,6 @@ sysdep_routines += \
wmemcmp-avx2-movbe-rtm \
wmemcmp-c \
wmemcmp-evex-movbe \
- wmemcmp-sse4 \
wmemcmp-ssse3 \
# sysdep_routines
endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 14314367..450a2917 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -78,8 +78,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (MOVBE)),
__memcmp_evex_movbe)
- IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
- __memcmp_sse4_1)
IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3),
__memcmp_ssse3)
IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
@@ -824,8 +822,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (MOVBE)),
__wmemcmp_evex_movbe)
- IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
- __wmemcmp_sse4_1)
IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3),
__wmemcmp_ssse3)
IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
index 690dffe8..0bc47a7f 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
@@ -21,7 +21,6 @@
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
@@ -47,9 +46,6 @@ IFUNC_SELECTOR (void)
return OPTIMIZE (avx2_movbe);
}
- if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
- return OPTIMIZE (sse4_1);
-
if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
return OPTIMIZE (ssse3);
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
deleted file mode 100644
index 50060006..00000000
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
+++ /dev/null
@@ -1,804 +0,0 @@
-/* memcmp with SSE4.1, wmemcmp with SSE4.1
- Copyright (C) 2010-2018 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-
-# ifndef MEMCMP
-# define MEMCMP __memcmp_sse4_1
-# endif
-
-#ifdef USE_AS_WMEMCMP
-# define CMPEQ pcmpeqd
-# define CHAR_SIZE 4
-#else
-# define CMPEQ pcmpeqb
-# define CHAR_SIZE 1
-#endif
-
-
-/* Warning!
- wmemcmp has to use SIGNED comparison for elements.
- memcmp has to use UNSIGNED comparison for elemnts.
-*/
-
- .section .text.sse4.1,"ax",@progbits
-ENTRY (MEMCMP)
-# ifdef USE_AS_WMEMCMP
- shl $2, %RDX_LP
-# elif defined __ILP32__
- /* Clear the upper 32 bits. */
- mov %edx, %edx
-# endif
- cmp $79, %RDX_LP
- ja L(79bytesormore)
-
- cmp $CHAR_SIZE, %RDX_LP
- jbe L(firstbyte)
-
- /* N in (CHAR_SIZE, 79) bytes. */
- cmpl $32, %edx
- ja L(more_32_bytes)
-
- cmpl $16, %edx
- jae L(16_to_32_bytes)
-
-# ifndef USE_AS_WMEMCMP
- cmpl $8, %edx
- jae L(8_to_16_bytes)
-
- cmpl $4, %edx
- jb L(2_to_3_bytes)
-
- movl (%rdi), %eax
- movl (%rsi), %ecx
-
- bswap %eax
- bswap %ecx
-
- shlq $32, %rax
- shlq $32, %rcx
-
- movl -4(%rdi, %rdx), %edi
- movl -4(%rsi, %rdx), %esi
-
- bswap %edi
- bswap %esi
-
- orq %rdi, %rax
- orq %rsi, %rcx
- subq %rcx, %rax
- cmovne %edx, %eax
- sbbl %ecx, %ecx
- orl %ecx, %eax
- ret
-
- .p2align 4,, 8
-L(2_to_3_bytes):
- movzwl (%rdi), %eax
- movzwl (%rsi), %ecx
- shll $8, %eax
- shll $8, %ecx
- bswap %eax
- bswap %ecx
- movzbl -1(%rdi, %rdx), %edi
- movzbl -1(%rsi, %rdx), %esi
- orl %edi, %eax
- orl %esi, %ecx
- subl %ecx, %eax
- ret
-
- .p2align 4,, 8
-L(8_to_16_bytes):
- movq (%rdi), %rax
- movq (%rsi), %rcx
-
- bswap %rax
- bswap %rcx
-
- subq %rcx, %rax
- jne L(8_to_16_bytes_done)
-
- movq -8(%rdi, %rdx), %rax
- movq -8(%rsi, %rdx), %rcx
-
- bswap %rax
- bswap %rcx
-
- subq %rcx, %rax
-
-L(8_to_16_bytes_done):
- cmovne %edx, %eax
- sbbl %ecx, %ecx
- orl %ecx, %eax
- ret
-# else
- xorl %eax, %eax
- movl (%rdi), %ecx
- cmpl (%rsi), %ecx
- jne L(8_to_16_bytes_done)
- movl 4(%rdi), %ecx
- cmpl 4(%rsi), %ecx
- jne L(8_to_16_bytes_done)
- movl -4(%rdi, %rdx), %ecx
- cmpl -4(%rsi, %rdx), %ecx
- jne L(8_to_16_bytes_done)
- ret
-# endif
-
- .p2align 4,, 3
-L(ret_zero):
- xorl %eax, %eax
-L(zero):
- ret
-
- .p2align 4,, 8
-L(firstbyte):
- jb L(ret_zero)
-# ifdef USE_AS_WMEMCMP
- xorl %eax, %eax
- movl (%rdi), %ecx
- cmpl (%rsi), %ecx
- je L(zero)
-L(8_to_16_bytes_done):
- setg %al
- leal -1(%rax, %rax), %eax
-# else
- movzbl (%rdi), %eax
- movzbl (%rsi), %ecx
- sub %ecx, %eax
-# endif
- ret
-
- .p2align 4
-L(vec_return_begin_48):
- addq $16, %rdi
- addq $16, %rsi
-L(vec_return_begin_32):
- bsfl %eax, %eax
-# ifdef USE_AS_WMEMCMP
- movl 32(%rdi, %rax), %ecx
- xorl %edx, %edx
- cmpl 32(%rsi, %rax), %ecx
- setg %dl
- leal -1(%rdx, %rdx), %eax
-# else
- movzbl 32(%rsi, %rax), %ecx
- movzbl 32(%rdi, %rax), %eax
- subl %ecx, %eax
-# endif
- ret
-
- .p2align 4
-L(vec_return_begin_16):
- addq $16, %rdi
- addq $16, %rsi
-L(vec_return_begin):
- bsfl %eax, %eax
-# ifdef USE_AS_WMEMCMP
- movl (%rdi, %rax), %ecx
- xorl %edx, %edx
- cmpl (%rsi, %rax), %ecx
- setg %dl
- leal -1(%rdx, %rdx), %eax
-# else
- movzbl (%rsi, %rax), %ecx
- movzbl (%rdi, %rax), %eax
- subl %ecx, %eax
-# endif
- ret
-
- .p2align 4
-L(vec_return_end_16):
- subl $16, %edx
-L(vec_return_end):
- bsfl %eax, %eax
- addl %edx, %eax
-# ifdef USE_AS_WMEMCMP
- movl -16(%rdi, %rax), %ecx
- xorl %edx, %edx
- cmpl -16(%rsi, %rax), %ecx
- setg %dl
- leal -1(%rdx, %rdx), %eax
-# else
- movzbl -16(%rsi, %rax), %ecx
- movzbl -16(%rdi, %rax), %eax
- subl %ecx, %eax
-# endif
- ret
-
- .p2align 4,, 8
-L(more_32_bytes):
- movdqu (%rdi), %xmm0
- movdqu (%rsi), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqu 16(%rdi), %xmm0
- movdqu 16(%rsi), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_16)
-
- cmpl $64, %edx
- jbe L(32_to_64_bytes)
- movdqu 32(%rdi), %xmm0
- movdqu 32(%rsi), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_32)
-
- .p2align 4,, 6
-L(32_to_64_bytes):
- movdqu -32(%rdi, %rdx), %xmm0
- movdqu -32(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end_16)
-
- movdqu -16(%rdi, %rdx), %xmm0
- movdqu -16(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end)
- ret
-
- .p2align 4
-L(16_to_32_bytes):
- movdqu (%rdi), %xmm0
- movdqu (%rsi), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqu -16(%rdi, %rdx), %xmm0
- movdqu -16(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end)
- ret
-
-
- .p2align 4
-L(79bytesormore):
- movdqu (%rdi), %xmm0
- movdqu (%rsi), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
-
- mov %rsi, %rcx
- and $-16, %rsi
- add $16, %rsi
- sub %rsi, %rcx
-
- sub %rcx, %rdi
- add %rcx, %rdx
- test $0xf, %rdi
- jz L(2aligned)
-
- cmp $128, %rdx
- ja L(128bytesormore)
-
- .p2align 4,, 6
-L(less128bytes):
- movdqu (%rdi), %xmm1
- CMPEQ (%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqu 16(%rdi), %xmm1
- CMPEQ 16(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_16)
-
- movdqu 32(%rdi), %xmm1
- CMPEQ 32(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_32)
-
- movdqu 48(%rdi), %xmm1
- CMPEQ 48(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_48)
-
- cmp $96, %rdx
- jb L(32_to_64_bytes)
-
- addq $64, %rdi
- addq $64, %rsi
- subq $64, %rdx
-
- .p2align 4,, 6
-L(last_64_bytes):
- movdqu (%rdi), %xmm1
- CMPEQ (%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqu 16(%rdi), %xmm1
- CMPEQ 16(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_16)
-
- movdqu -32(%rdi, %rdx), %xmm0
- movdqu -32(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end_16)
-
- movdqu -16(%rdi, %rdx), %xmm0
- movdqu -16(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end)
- ret
-
- .p2align 4
-L(128bytesormore):
- cmp $256, %rdx
- ja L(unaligned_loop)
-L(less256bytes):
- movdqu (%rdi), %xmm1
- CMPEQ (%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqu 16(%rdi), %xmm1
- CMPEQ 16(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_16)
-
- movdqu 32(%rdi), %xmm1
- CMPEQ 32(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_32)
-
- movdqu 48(%rdi), %xmm1
- CMPEQ 48(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_48)
-
- addq $64, %rdi
- addq $64, %rsi
-
- movdqu (%rdi), %xmm1
- CMPEQ (%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqu 16(%rdi), %xmm1
- CMPEQ 16(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_16)
-
- movdqu 32(%rdi), %xmm1
- CMPEQ 32(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_32)
-
- movdqu 48(%rdi), %xmm1
- CMPEQ 48(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_48)
-
- addq $-128, %rdx
- subq $-64, %rsi
- subq $-64, %rdi
-
- cmp $64, %rdx
- ja L(less128bytes)
-
- cmp $32, %rdx
- ja L(last_64_bytes)
-
- movdqu -32(%rdi, %rdx), %xmm0
- movdqu -32(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end_16)
-
- movdqu -16(%rdi, %rdx), %xmm0
- movdqu -16(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end)
- ret
-
- .p2align 4
-L(unaligned_loop):
-# ifdef DATA_CACHE_SIZE_HALF
- mov $DATA_CACHE_SIZE_HALF, %R8_LP
-# else
- mov __x86_data_cache_size_half(%rip), %R8_LP
-# endif
- movq %r8, %r9
- addq %r8, %r8
- addq %r9, %r8
- cmpq %r8, %rdx
- ja L(L2_L3_cache_unaligned)
- sub $64, %rdx
- .p2align 4
-L(64bytesormore_loop):
- movdqu (%rdi), %xmm0
- movdqu 16(%rdi), %xmm1
- movdqu 32(%rdi), %xmm2
- movdqu 48(%rdi), %xmm3
-
- CMPEQ (%rsi), %xmm0
- CMPEQ 16(%rsi), %xmm1
- CMPEQ 32(%rsi), %xmm2
- CMPEQ 48(%rsi), %xmm3
-
- pand %xmm0, %xmm1
- pand %xmm2, %xmm3
- pand %xmm1, %xmm3
-
- pmovmskb %xmm3, %eax
- incw %ax
- jnz L(64bytesormore_loop_end)
-
- add $64, %rsi
- add $64, %rdi
- sub $64, %rdx
- ja L(64bytesormore_loop)
-
- .p2align 4,, 6
-L(loop_tail):
- addq %rdx, %rdi
- movdqu (%rdi), %xmm0
- movdqu 16(%rdi), %xmm1
- movdqu 32(%rdi), %xmm2
- movdqu 48(%rdi), %xmm3
-
- addq %rdx, %rsi
- movdqu (%rsi), %xmm4
- movdqu 16(%rsi), %xmm5
- movdqu 32(%rsi), %xmm6
- movdqu 48(%rsi), %xmm7
-
- CMPEQ %xmm4, %xmm0
- CMPEQ %xmm5, %xmm1
- CMPEQ %xmm6, %xmm2
- CMPEQ %xmm7, %xmm3
-
- pand %xmm0, %xmm1
- pand %xmm2, %xmm3
- pand %xmm1, %xmm3
-
- pmovmskb %xmm3, %eax
- incw %ax
- jnz L(64bytesormore_loop_end)
- ret
-
-L(L2_L3_cache_unaligned):
- subq $64, %rdx
- .p2align 4
-L(L2_L3_unaligned_128bytes_loop):
- prefetchnta 0x1c0(%rdi)
- prefetchnta 0x1c0(%rsi)
-
- movdqu (%rdi), %xmm0
- movdqu 16(%rdi), %xmm1
- movdqu 32(%rdi), %xmm2
- movdqu 48(%rdi), %xmm3
-
- CMPEQ (%rsi), %xmm0
- CMPEQ 16(%rsi), %xmm1
- CMPEQ 32(%rsi), %xmm2
- CMPEQ 48(%rsi), %xmm3
-
- pand %xmm0, %xmm1
- pand %xmm2, %xmm3
- pand %xmm1, %xmm3
-
- pmovmskb %xmm3, %eax
- incw %ax
- jnz L(64bytesormore_loop_end)
-
- add $64, %rsi
- add $64, %rdi
- sub $64, %rdx
- ja L(L2_L3_unaligned_128bytes_loop)
- jmp L(loop_tail)
-
-
- /* This case is for machines which are sensitive for unaligned
- * instructions. */
- .p2align 4
-L(2aligned):
- cmp $128, %rdx
- ja L(128bytesormorein2aligned)
-L(less128bytesin2aligned):
- movdqa (%rdi), %xmm1
- CMPEQ (%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqa 16(%rdi), %xmm1
- CMPEQ 16(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_16)
-
- movdqa 32(%rdi), %xmm1
- CMPEQ 32(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_32)
-
- movdqa 48(%rdi), %xmm1
- CMPEQ 48(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_48)
-
- cmp $96, %rdx
- jb L(32_to_64_bytes)
-
- addq $64, %rdi
- addq $64, %rsi
- subq $64, %rdx
-
- .p2align 4,, 6
-L(aligned_last_64_bytes):
- movdqa (%rdi), %xmm1
- CMPEQ (%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqa 16(%rdi), %xmm1
- CMPEQ 16(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_16)
-
- movdqu -32(%rdi, %rdx), %xmm0
- movdqu -32(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end_16)
-
- movdqu -16(%rdi, %rdx), %xmm0
- movdqu -16(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end)
- ret
-
- .p2align 4
-L(128bytesormorein2aligned):
- cmp $256, %rdx
- ja L(aligned_loop)
-L(less256bytesin2alinged):
- movdqa (%rdi), %xmm1
- CMPEQ (%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqa 16(%rdi), %xmm1
- CMPEQ 16(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_16)
-
- movdqa 32(%rdi), %xmm1
- CMPEQ 32(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_32)
-
- movdqa 48(%rdi), %xmm1
- CMPEQ 48(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_48)
-
- addq $64, %rdi
- addq $64, %rsi
-
- movdqa (%rdi), %xmm1
- CMPEQ (%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqa 16(%rdi), %xmm1
- CMPEQ 16(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_16)
-
- movdqa 32(%rdi), %xmm1
- CMPEQ 32(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_32)
-
- movdqa 48(%rdi), %xmm1
- CMPEQ 48(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_48)
-
- addq $-128, %rdx
- subq $-64, %rsi
- subq $-64, %rdi
-
- cmp $64, %rdx
- ja L(less128bytesin2aligned)
-
- cmp $32, %rdx
- ja L(aligned_last_64_bytes)
-
- movdqu -32(%rdi, %rdx), %xmm0
- movdqu -32(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end_16)
-
- movdqu -16(%rdi, %rdx), %xmm0
- movdqu -16(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end)
- ret
-
- .p2align 4
-L(aligned_loop):
-# ifdef DATA_CACHE_SIZE_HALF
- mov $DATA_CACHE_SIZE_HALF, %R8_LP
-# else
- mov __x86_data_cache_size_half(%rip), %R8_LP
-# endif
- movq %r8, %r9
- addq %r8, %r8
- addq %r9, %r8
- cmpq %r8, %rdx
- ja L(L2_L3_cache_aligned)
-
- sub $64, %rdx
- .p2align 4
-L(64bytesormore_loopin2aligned):
- movdqa (%rdi), %xmm0
- movdqa 16(%rdi), %xmm1
- movdqa 32(%rdi), %xmm2
- movdqa 48(%rdi), %xmm3
-
- CMPEQ (%rsi), %xmm0
- CMPEQ 16(%rsi), %xmm1
- CMPEQ 32(%rsi), %xmm2
- CMPEQ 48(%rsi), %xmm3
-
- pand %xmm0, %xmm1
- pand %xmm2, %xmm3
- pand %xmm1, %xmm3
-
- pmovmskb %xmm3, %eax
- incw %ax
- jnz L(64bytesormore_loop_end)
- add $64, %rsi
- add $64, %rdi
- sub $64, %rdx
- ja L(64bytesormore_loopin2aligned)
- jmp L(loop_tail)
-
-L(L2_L3_cache_aligned):
- subq $64, %rdx
- .p2align 4
-L(L2_L3_aligned_128bytes_loop):
- prefetchnta 0x1c0(%rdi)
- prefetchnta 0x1c0(%rsi)
- movdqa (%rdi), %xmm0
- movdqa 16(%rdi), %xmm1
- movdqa 32(%rdi), %xmm2
- movdqa 48(%rdi), %xmm3
-
- CMPEQ (%rsi), %xmm0
- CMPEQ 16(%rsi), %xmm1
- CMPEQ 32(%rsi), %xmm2
- CMPEQ 48(%rsi), %xmm3
-
- pand %xmm0, %xmm1
- pand %xmm2, %xmm3
- pand %xmm1, %xmm3
-
- pmovmskb %xmm3, %eax
- incw %ax
- jnz L(64bytesormore_loop_end)
-
- addq $64, %rsi
- addq $64, %rdi
- subq $64, %rdx
- ja L(L2_L3_aligned_128bytes_loop)
- jmp L(loop_tail)
-
- .p2align 4
-L(64bytesormore_loop_end):
- pmovmskb %xmm0, %ecx
- incw %cx
- jnz L(loop_end_ret)
-
- pmovmskb %xmm1, %ecx
- notw %cx
- sall $16, %ecx
- jnz L(loop_end_ret)
-
- pmovmskb %xmm2, %ecx
- notw %cx
- shlq $32, %rcx
- jnz L(loop_end_ret)
-
- addq $48, %rdi
- addq $48, %rsi
- movq %rax, %rcx
-
- .p2align 4,, 6
-L(loop_end_ret):
- bsfq %rcx, %rcx
-# ifdef USE_AS_WMEMCMP
- movl (%rdi, %rcx), %eax
- xorl %edx, %edx
- cmpl (%rsi, %rcx), %eax
- setg %dl
- leal -1(%rdx, %rdx), %eax
-# else
- movzbl (%rdi, %rcx), %eax
- movzbl (%rsi, %rcx), %ecx
- subl %ecx, %eax
-# endif
- ret
-END (MEMCMP)
-#endif
--
GitLab

View file

@ -1,263 +0,0 @@
From 23102686ec67b856a2d4fd25ddaa1c0b8d175c4f Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 15 Apr 2022 12:28:01 -0500
Subject: [PATCH] x86: Cleanup page cross code in memcmp-avx2-movbe.S
Content-type: text/plain; charset=UTF-8
The old code was both inefficient and wasted code size. The new code is
smaller (-62 bytes) and has comparable or better performance in the
page cross case.
geometric_mean(N=20) of page cross cases New / Original: 0.960
size, align0, align1, ret, New Time/Old Time
1, 4095, 0, 0, 1.001
1, 4095, 0, 1, 0.999
1, 4095, 0, -1, 1.0
2, 4094, 0, 0, 1.0
2, 4094, 0, 1, 1.0
2, 4094, 0, -1, 1.0
3, 4093, 0, 0, 1.0
3, 4093, 0, 1, 1.0
3, 4093, 0, -1, 1.0
4, 4092, 0, 0, 0.987
4, 4092, 0, 1, 1.0
4, 4092, 0, -1, 1.0
5, 4091, 0, 0, 0.984
5, 4091, 0, 1, 1.002
5, 4091, 0, -1, 1.005
6, 4090, 0, 0, 0.993
6, 4090, 0, 1, 1.001
6, 4090, 0, -1, 1.003
7, 4089, 0, 0, 0.991
7, 4089, 0, 1, 1.0
7, 4089, 0, -1, 1.001
8, 4088, 0, 0, 0.875
8, 4088, 0, 1, 0.881
8, 4088, 0, -1, 0.888
9, 4087, 0, 0, 0.872
9, 4087, 0, 1, 0.879
9, 4087, 0, -1, 0.883
10, 4086, 0, 0, 0.878
10, 4086, 0, 1, 0.886
10, 4086, 0, -1, 0.873
11, 4085, 0, 0, 0.878
11, 4085, 0, 1, 0.881
11, 4085, 0, -1, 0.879
12, 4084, 0, 0, 0.873
12, 4084, 0, 1, 0.889
12, 4084, 0, -1, 0.875
13, 4083, 0, 0, 0.873
13, 4083, 0, 1, 0.863
13, 4083, 0, -1, 0.863
14, 4082, 0, 0, 0.838
14, 4082, 0, 1, 0.869
14, 4082, 0, -1, 0.877
15, 4081, 0, 0, 0.841
15, 4081, 0, 1, 0.869
15, 4081, 0, -1, 0.876
16, 4080, 0, 0, 0.988
16, 4080, 0, 1, 0.99
16, 4080, 0, -1, 0.989
17, 4079, 0, 0, 0.978
17, 4079, 0, 1, 0.981
17, 4079, 0, -1, 0.98
18, 4078, 0, 0, 0.981
18, 4078, 0, 1, 0.98
18, 4078, 0, -1, 0.985
19, 4077, 0, 0, 0.977
19, 4077, 0, 1, 0.979
19, 4077, 0, -1, 0.986
20, 4076, 0, 0, 0.977
20, 4076, 0, 1, 0.986
20, 4076, 0, -1, 0.984
21, 4075, 0, 0, 0.977
21, 4075, 0, 1, 0.983
21, 4075, 0, -1, 0.988
22, 4074, 0, 0, 0.983
22, 4074, 0, 1, 0.994
22, 4074, 0, -1, 0.993
23, 4073, 0, 0, 0.98
23, 4073, 0, 1, 0.992
23, 4073, 0, -1, 0.995
24, 4072, 0, 0, 0.989
24, 4072, 0, 1, 0.989
24, 4072, 0, -1, 0.991
25, 4071, 0, 0, 0.99
25, 4071, 0, 1, 0.999
25, 4071, 0, -1, 0.996
26, 4070, 0, 0, 0.993
26, 4070, 0, 1, 0.995
26, 4070, 0, -1, 0.998
27, 4069, 0, 0, 0.993
27, 4069, 0, 1, 0.999
27, 4069, 0, -1, 1.0
28, 4068, 0, 0, 0.997
28, 4068, 0, 1, 1.0
28, 4068, 0, -1, 0.999
29, 4067, 0, 0, 0.996
29, 4067, 0, 1, 0.999
29, 4067, 0, -1, 0.999
30, 4066, 0, 0, 0.991
30, 4066, 0, 1, 1.001
30, 4066, 0, -1, 0.999
31, 4065, 0, 0, 0.988
31, 4065, 0, 1, 0.998
31, 4065, 0, -1, 0.998
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
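The new branch-free [4, 7]-byte path can be read as the following C sketch (names hypothetical; assumes a little-endian host, where `movbe` performs the byte swap):

```c
#include <stdint.h>
#include <string.h>

/* Overlapping big-endian loads: read the first and last 4 bytes of
   each buffer (the two loads overlap for n < 8), byte-swap so that
   lexicographic order matches integer order, pack into one 64-bit
   value per buffer, and compare once -- no branch on the length.  */
static uint32_t
load32_be (const unsigned char *p)
{
  uint32_t v;
  memcpy (&v, p, sizeof v);
  return __builtin_bswap32 (v);
}

static int
memcmp_4_to_7 (const void *a, const void *b, size_t n)
{
  const unsigned char *pa = a, *pb = b;
  uint64_t x = ((uint64_t) load32_be (pa) << 32) | load32_be (pa + n - 4);
  uint64_t y = ((uint64_t) load32_be (pb) << 32) | load32_be (pb + n - 4);
  return (x > y) - (x < y);
}
```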
---
sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 98 ++++++++++++--------
1 file changed, 61 insertions(+), 37 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
index 16fc673e..99258cf5 100644
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
@@ -429,22 +429,21 @@ L(page_cross_less_vec):
# ifndef USE_AS_WMEMCMP
cmpl $8, %edx
jae L(between_8_15)
+ /* Fall through for [4, 7]. */
cmpl $4, %edx
- jae L(between_4_7)
+ jb L(between_2_3)
- /* Load as big endian to avoid branches. */
- movzwl (%rdi), %eax
- movzwl (%rsi), %ecx
- shll $8, %eax
- shll $8, %ecx
- bswap %eax
- bswap %ecx
- movzbl -1(%rdi, %rdx), %edi
- movzbl -1(%rsi, %rdx), %esi
- orl %edi, %eax
- orl %esi, %ecx
- /* Subtraction is okay because the upper 8 bits are zero. */
- subl %ecx, %eax
+ movbe (%rdi), %eax
+ movbe (%rsi), %ecx
+ shlq $32, %rax
+ shlq $32, %rcx
+ movbe -4(%rdi, %rdx), %edi
+ movbe -4(%rsi, %rdx), %esi
+ orq %rdi, %rax
+ orq %rsi, %rcx
+ subq %rcx, %rax
+ /* Fast path for return zero. */
+ jnz L(ret_nonzero)
/* No ymm register was touched. */
ret
@@ -457,9 +456,33 @@ L(one_or_less):
/* No ymm register was touched. */
ret
+ .p2align 4,, 5
+L(ret_nonzero):
+ sbbl %eax, %eax
+ orl $1, %eax
+ /* No ymm register was touched. */
+ ret
+
+ .p2align 4,, 2
+L(zero):
+ xorl %eax, %eax
+ /* No ymm register was touched. */
+ ret
+
.p2align 4
L(between_8_15):
-# endif
+ movbe (%rdi), %rax
+ movbe (%rsi), %rcx
+ subq %rcx, %rax
+ jnz L(ret_nonzero)
+ movbe -8(%rdi, %rdx), %rax
+ movbe -8(%rsi, %rdx), %rcx
+ subq %rcx, %rax
+ /* Fast path for return zero. */
+ jnz L(ret_nonzero)
+ /* No ymm register was touched. */
+ ret
+# else
/* If USE_AS_WMEMCMP fall through into 8-15 byte case. */
vmovq (%rdi), %xmm1
vmovq (%rsi), %xmm2
@@ -475,16 +498,13 @@ L(between_8_15):
VPCMPEQ %xmm1, %xmm2, %xmm2
vpmovmskb %xmm2, %eax
subl $0xffff, %eax
+ /* Fast path for return zero. */
jnz L(return_vec_0)
/* No ymm register was touched. */
ret
+# endif
- .p2align 4
-L(zero):
- xorl %eax, %eax
- ret
-
- .p2align 4
+ .p2align 4,, 10
L(between_16_31):
/* From 16 to 31 bytes. No branch when size == 16. */
vmovdqu (%rsi), %xmm2
@@ -501,11 +521,17 @@ L(between_16_31):
VPCMPEQ (%rdi), %xmm2, %xmm2
vpmovmskb %xmm2, %eax
subl $0xffff, %eax
+ /* Fast path for return zero. */
jnz L(return_vec_0)
/* No ymm register was touched. */
ret
# ifdef USE_AS_WMEMCMP
+ .p2align 4,, 2
+L(zero):
+ xorl %eax, %eax
+ ret
+
.p2align 4
L(one_or_less):
jb L(zero)
@@ -520,22 +546,20 @@ L(one_or_less):
# else
.p2align 4
-L(between_4_7):
- /* Load as big endian with overlapping movbe to avoid branches.
- */
- movbe (%rdi), %eax
- movbe (%rsi), %ecx
- shlq $32, %rax
- shlq $32, %rcx
- movbe -4(%rdi, %rdx), %edi
- movbe -4(%rsi, %rdx), %esi
- orq %rdi, %rax
- orq %rsi, %rcx
- subq %rcx, %rax
- jz L(zero_4_7)
- sbbl %eax, %eax
- orl $1, %eax
-L(zero_4_7):
+L(between_2_3):
+ /* Load as big endian to avoid branches. */
+ movzwl (%rdi), %eax
+ movzwl (%rsi), %ecx
+ bswap %eax
+ bswap %ecx
+ shrl %eax
+ shrl %ecx
+ movzbl -1(%rdi, %rdx), %edi
+ movzbl -1(%rsi, %rdx), %esi
+ orl %edi, %eax
+ orl %esi, %ecx
+ /* Subtraction is okay because the upper bit is zero. */
+ subl %ecx, %eax
/* No ymm register was touched. */
ret
# endif
--
GitLab

View file

@ -1,876 +0,0 @@
From 5307aa9c1800f36a64c183c091c9af392c1fa75c Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Thu, 21 Apr 2022 20:52:28 -0500
Subject: [PATCH] x86: Optimize {str|wcs}rchr-sse2
Content-type: text/plain; charset=UTF-8
The new code unrolls the main loop slightly without adding too much
overhead and minimizes the comparisons for the search CHAR.
Geometric Mean of all benchmarks New / Old: 0.741
See email for all results.
Full xcheck passes on x86_64 with and without multiarch enabled.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
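The strategy, in both the old and new code, reduces to the following scalar sketch: remember the most recent match and resolve it only when the terminator is found. The SIMD code below does the same per 16-byte vector, carrying bitmasks instead of single positions:

```c
#include <stddef.h>

/* Scalar sketch of the vectorized strategy (illustrative only).  */
static char *
strrchr_sketch (const char *s, int c)
{
  const char *last = NULL;
  for (;; s++)
    {
      if (*s == (char) c)
        last = s;               /* remember the most recent match */
      if (*s == '\0')
        return (char *) last;   /* resolve once the terminator is hit */
    }
}
```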
---
sysdeps/x86_64/multiarch/strrchr-sse2.S | 2 +-
sysdeps/x86_64/multiarch/wcsrchr-sse2.S | 3 +-
sysdeps/x86_64/strrchr.S | 510 +++++++++++++++---------
sysdeps/x86_64/wcsrchr.S | 266 +-----------
4 files changed, 338 insertions(+), 443 deletions(-)
Conflicts:
sysdeps/x86_64/wcsrchr.S
(copyright header)
diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
index 0ec76fe9..6bb1284b 100644
--- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
@@ -17,7 +17,7 @@
<http://www.gnu.org/licenses/>. */
#if IS_IN (libc)
-# define strrchr __strrchr_sse2
+# define STRRCHR __strrchr_sse2
# undef weak_alias
# define weak_alias(strrchr, rindex)
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
index d015e953..f26d53b5 100644
--- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
@@ -17,7 +17,6 @@
<http://www.gnu.org/licenses/>. */
#if IS_IN (libc)
-# define wcsrchr __wcsrchr_sse2
+# define STRRCHR __wcsrchr_sse2
#endif
-
#include "../wcsrchr.S"
diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
index aca98e7e..a58cc220 100644
--- a/sysdeps/x86_64/strrchr.S
+++ b/sysdeps/x86_64/strrchr.S
@@ -19,210 +19,360 @@
#include <sysdep.h>
+#ifndef STRRCHR
+# define STRRCHR strrchr
+#endif
+
+#ifdef USE_AS_WCSRCHR
+# define PCMPEQ pcmpeqd
+# define CHAR_SIZE 4
+# define PMINU pminud
+#else
+# define PCMPEQ pcmpeqb
+# define CHAR_SIZE 1
+# define PMINU pminub
+#endif
+
+#define PAGE_SIZE 4096
+#define VEC_SIZE 16
+
.text
-ENTRY (strrchr)
- movd %esi, %xmm1
+ENTRY(STRRCHR)
+ movd %esi, %xmm0
movq %rdi, %rax
- andl $4095, %eax
- punpcklbw %xmm1, %xmm1
- cmpq $4032, %rax
- punpcklwd %xmm1, %xmm1
- pshufd $0, %xmm1, %xmm1
+ andl $(PAGE_SIZE - 1), %eax
+#ifndef USE_AS_WCSRCHR
+ punpcklbw %xmm0, %xmm0
+ punpcklwd %xmm0, %xmm0
+#endif
+ pshufd $0, %xmm0, %xmm0
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
ja L(cross_page)
- movdqu (%rdi), %xmm0
+
+L(cross_page_continue):
+ movups (%rdi), %xmm1
pxor %xmm2, %xmm2
- movdqa %xmm0, %xmm3
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm3
- pmovmskb %xmm0, %ecx
- pmovmskb %xmm3, %edx
- testq %rdx, %rdx
- je L(next_48_bytes)
- leaq -1(%rdx), %rax
- xorq %rdx, %rax
- andq %rcx, %rax
- je L(exit)
- bsrq %rax, %rax
+ PCMPEQ %xmm1, %xmm2
+ pmovmskb %xmm2, %ecx
+ testl %ecx, %ecx
+ jz L(aligned_more)
+
+ PCMPEQ %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ leal -1(%rcx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(ret0)
+ bsrl %eax, %eax
addq %rdi, %rax
+ /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
+ search CHAR is zero we are correct. Either way `andq
+ -CHAR_SIZE, %rax` gets the correct result. */
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+L(ret0):
ret
+ /* Returns for first vec x1/x2 have hard coded backward search
+ path for earlier matches. */
.p2align 4
-L(next_48_bytes):
- movdqu 16(%rdi), %xmm4
- movdqa %xmm4, %xmm5
- movdqu 32(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm4
- pcmpeqb %xmm2, %xmm5
- movdqu 48(%rdi), %xmm0
- pmovmskb %xmm5, %edx
- movdqa %xmm3, %xmm5
- pcmpeqb %xmm1, %xmm3
- pcmpeqb %xmm2, %xmm5
- pcmpeqb %xmm0, %xmm2
- salq $16, %rdx
- pmovmskb %xmm3, %r8d
- pmovmskb %xmm5, %eax
- pmovmskb %xmm2, %esi
- salq $32, %r8
- salq $32, %rax
- pcmpeqb %xmm1, %xmm0
- orq %rdx, %rax
- movq %rsi, %rdx
- pmovmskb %xmm4, %esi
- salq $48, %rdx
- salq $16, %rsi
- orq %r8, %rsi
- orq %rcx, %rsi
- pmovmskb %xmm0, %ecx
- salq $48, %rcx
- orq %rcx, %rsi
- orq %rdx, %rax
- je L(loop_header2)
- leaq -1(%rax), %rcx
- xorq %rax, %rcx
- andq %rcx, %rsi
- je L(exit)
- bsrq %rsi, %rsi
- leaq (%rdi,%rsi), %rax
+L(first_vec_x0_test):
+ PCMPEQ %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ testl %eax, %eax
+ jz L(ret0)
+ bsrl %eax, %eax
+ addq %r8, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
ret
.p2align 4
-L(loop_header2):
- testq %rsi, %rsi
- movq %rdi, %rcx
- je L(no_c_found)
-L(loop_header):
- addq $64, %rdi
- pxor %xmm7, %xmm7
- andq $-64, %rdi
- jmp L(loop_entry)
+L(first_vec_x1):
+ PCMPEQ %xmm0, %xmm2
+ pmovmskb %xmm2, %eax
+ leal -1(%rcx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(first_vec_x0_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
.p2align 4
-L(loop64):
- testq %rdx, %rdx
- cmovne %rdx, %rsi
- cmovne %rdi, %rcx
- addq $64, %rdi
-L(loop_entry):
- movdqa 32(%rdi), %xmm3
- pxor %xmm6, %xmm6
- movdqa 48(%rdi), %xmm2
- movdqa %xmm3, %xmm0
- movdqa 16(%rdi), %xmm4
- pminub %xmm2, %xmm0
- movdqa (%rdi), %xmm5
- pminub %xmm4, %xmm0
- pminub %xmm5, %xmm0
- pcmpeqb %xmm7, %xmm0
- pmovmskb %xmm0, %eax
- movdqa %xmm5, %xmm0
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %r9d
- movdqa %xmm4, %xmm0
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %edx
- movdqa %xmm3, %xmm0
- pcmpeqb %xmm1, %xmm0
- salq $16, %rdx
- pmovmskb %xmm0, %r10d
- movdqa %xmm2, %xmm0
- pcmpeqb %xmm1, %xmm0
- salq $32, %r10
- orq %r10, %rdx
- pmovmskb %xmm0, %r8d
- orq %r9, %rdx
- salq $48, %r8
- orq %r8, %rdx
+L(first_vec_x1_test):
+ PCMPEQ %xmm0, %xmm2
+ pmovmskb %xmm2, %eax
testl %eax, %eax
- je L(loop64)
- pcmpeqb %xmm6, %xmm4
- pcmpeqb %xmm6, %xmm3
- pcmpeqb %xmm6, %xmm5
- pmovmskb %xmm4, %eax
- pmovmskb %xmm3, %r10d
- pcmpeqb %xmm6, %xmm2
- pmovmskb %xmm5, %r9d
- salq $32, %r10
- salq $16, %rax
- pmovmskb %xmm2, %r8d
- orq %r10, %rax
- orq %r9, %rax
- salq $48, %r8
- orq %r8, %rax
- leaq -1(%rax), %r8
- xorq %rax, %r8
- andq %r8, %rdx
- cmovne %rdi, %rcx
- cmovne %rdx, %rsi
- bsrq %rsi, %rsi
- leaq (%rcx,%rsi), %rax
+ jz L(first_vec_x0_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4
+L(first_vec_x2):
+ PCMPEQ %xmm0, %xmm3
+ pmovmskb %xmm3, %eax
+ leal -1(%rcx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(first_vec_x1_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4
+L(aligned_more):
+ /* Save original pointer if match was in VEC 0. */
+ movq %rdi, %r8
+ andq $-VEC_SIZE, %rdi
+
+ movaps VEC_SIZE(%rdi), %xmm2
+ pxor %xmm3, %xmm3
+ PCMPEQ %xmm2, %xmm3
+ pmovmskb %xmm3, %ecx
+ testl %ecx, %ecx
+ jnz L(first_vec_x1)
+
+ movaps (VEC_SIZE * 2)(%rdi), %xmm3
+ pxor %xmm4, %xmm4
+ PCMPEQ %xmm3, %xmm4
+ pmovmskb %xmm4, %ecx
+ testl %ecx, %ecx
+ jnz L(first_vec_x2)
+
+ addq $VEC_SIZE, %rdi
+ /* Save pointer again before realigning. */
+ movq %rdi, %rsi
+ andq $-(VEC_SIZE * 2), %rdi
+ .p2align 4
+L(first_loop):
+ /* Do 2x VEC at a time. */
+ movaps (VEC_SIZE * 2)(%rdi), %xmm4
+ movaps (VEC_SIZE * 3)(%rdi), %xmm5
+ /* Since SSE2 has no pminud, wcsrchr needs separate logic for
+ detecting zero. Note if this is found to be a bottleneck it
+ may be worth adding an SSE4.1 wcsrchr implementation. */
+#ifdef USE_AS_WCSRCHR
+ movaps %xmm5, %xmm6
+ pxor %xmm8, %xmm8
+
+ PCMPEQ %xmm8, %xmm5
+ PCMPEQ %xmm4, %xmm8
+ por %xmm5, %xmm8
+#else
+ movaps %xmm5, %xmm6
+ PMINU %xmm4, %xmm5
+#endif
+
+ movaps %xmm4, %xmm9
+ PCMPEQ %xmm0, %xmm4
+ PCMPEQ %xmm0, %xmm6
+ movaps %xmm6, %xmm7
+ por %xmm4, %xmm6
+#ifndef USE_AS_WCSRCHR
+ pxor %xmm8, %xmm8
+ PCMPEQ %xmm5, %xmm8
+#endif
+ pmovmskb %xmm8, %ecx
+ pmovmskb %xmm6, %eax
+
+ addq $(VEC_SIZE * 2), %rdi
+ /* Use `addl` 1) so we can undo it with `subl` and 2) it can
+ macro-fuse with `jz`. */
+ addl %ecx, %eax
+ jz L(first_loop)
+
+ /* Check if there is zero match. */
+ testl %ecx, %ecx
+ jz L(second_loop_match)
+
+ /* Check if there was a match in last iteration. */
+ subl %ecx, %eax
+ jnz L(new_match)
+
+L(first_loop_old_match):
+ PCMPEQ %xmm0, %xmm2
+ PCMPEQ %xmm0, %xmm3
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm3, %eax
+ addl %eax, %ecx
+ jz L(first_vec_x0_test)
+ /* NB: We could move this shift to before the branch and save a
+ bit of code size / performance on the fall through. The
+ branch leads to the null case which generally seems hotter
+ than char in first 3x VEC. */
+ sall $16, %eax
+ orl %ecx, %eax
+
+ bsrl %eax, %eax
+ addq %rsi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4
+L(new_match):
+ pxor %xmm6, %xmm6
+ PCMPEQ %xmm9, %xmm6
+ pmovmskb %xmm6, %eax
+ sall $16, %ecx
+ orl %eax, %ecx
+
+ /* We can't reuse either of the old comparisons since we mask
+ off zeros after the first zero (instead of using the full
+ comparison), so we can't guarantee no interference between a
+ match after the end of the string and a valid match. */
+ pmovmskb %xmm4, %eax
+ pmovmskb %xmm7, %edx
+ sall $16, %edx
+ orl %edx, %eax
+
+ leal -1(%ecx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(first_loop_old_match)
+ bsrl %eax, %eax
+ addq %rdi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
ret
+ /* Save minimum state for getting most recent match. We can
+ throw out all previous work. */
.p2align 4
-L(no_c_found):
- movl $1, %esi
- xorl %ecx, %ecx
- jmp L(loop_header)
+L(second_loop_match):
+ movq %rdi, %rsi
+ movaps %xmm4, %xmm2
+ movaps %xmm7, %xmm3
.p2align 4
-L(exit):
- xorl %eax, %eax
+L(second_loop):
+ movaps (VEC_SIZE * 2)(%rdi), %xmm4
+ movaps (VEC_SIZE * 3)(%rdi), %xmm5
+ /* Since SSE2 has no pminud, wcsrchr needs separate logic for
+ detecting zero. Note if this is found to be a bottleneck it
+ may be worth adding an SSE4.1 wcsrchr implementation. */
+#ifdef USE_AS_WCSRCHR
+ movaps %xmm5, %xmm6
+ pxor %xmm8, %xmm8
+
+ PCMPEQ %xmm8, %xmm5
+ PCMPEQ %xmm4, %xmm8
+ por %xmm5, %xmm8
+#else
+ movaps %xmm5, %xmm6
+ PMINU %xmm4, %xmm5
+#endif
+
+ movaps %xmm4, %xmm9
+ PCMPEQ %xmm0, %xmm4
+ PCMPEQ %xmm0, %xmm6
+ movaps %xmm6, %xmm7
+ por %xmm4, %xmm6
+#ifndef USE_AS_WCSRCHR
+ pxor %xmm8, %xmm8
+ PCMPEQ %xmm5, %xmm8
+#endif
+
+ pmovmskb %xmm8, %ecx
+ pmovmskb %xmm6, %eax
+
+ addq $(VEC_SIZE * 2), %rdi
+ /* Either null term or new occurrence of CHAR. */
+ addl %ecx, %eax
+ jz L(second_loop)
+
+ /* No null term so it must be a new occurrence of CHAR. */
+ testl %ecx, %ecx
+ jz L(second_loop_match)
+
+
+ subl %ecx, %eax
+ jnz L(second_loop_new_match)
+
+L(second_loop_old_match):
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm3, %eax
+ sall $16, %eax
+ orl %ecx, %eax
+ bsrl %eax, %eax
+ addq %rsi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
ret
.p2align 4
+L(second_loop_new_match):
+ pxor %xmm6, %xmm6
+ PCMPEQ %xmm9, %xmm6
+ pmovmskb %xmm6, %eax
+ sall $16, %ecx
+ orl %eax, %ecx
+
+ /* We can't reuse either of the old comparisons since we mask
+ off zeros after the first zero (instead of using the full
+ comparison), so we can't guarantee no interference between a
+ match after the end of the string and a valid match. */
+ pmovmskb %xmm4, %eax
+ pmovmskb %xmm7, %edx
+ sall $16, %edx
+ orl %edx, %eax
+
+ leal -1(%ecx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(second_loop_old_match)
+ bsrl %eax, %eax
+ addq %rdi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4,, 4
L(cross_page):
- movq %rdi, %rax
- pxor %xmm0, %xmm0
- andq $-64, %rax
- movdqu (%rax), %xmm5
- movdqa %xmm5, %xmm6
- movdqu 16(%rax), %xmm4
- pcmpeqb %xmm1, %xmm5
- pcmpeqb %xmm0, %xmm6
- movdqu 32(%rax), %xmm3
- pmovmskb %xmm6, %esi
- movdqa %xmm4, %xmm6
- movdqu 48(%rax), %xmm2
- pcmpeqb %xmm1, %xmm4
- pcmpeqb %xmm0, %xmm6
- pmovmskb %xmm6, %edx
- movdqa %xmm3, %xmm6
- pcmpeqb %xmm1, %xmm3
- pcmpeqb %xmm0, %xmm6
- pcmpeqb %xmm2, %xmm0
- salq $16, %rdx
- pmovmskb %xmm3, %r9d
- pmovmskb %xmm6, %r8d
- pmovmskb %xmm0, %ecx
- salq $32, %r9
- salq $32, %r8
- pcmpeqb %xmm1, %xmm2
- orq %r8, %rdx
- salq $48, %rcx
- pmovmskb %xmm5, %r8d
- orq %rsi, %rdx
- pmovmskb %xmm4, %esi
- orq %rcx, %rdx
- pmovmskb %xmm2, %ecx
- salq $16, %rsi
- salq $48, %rcx
- orq %r9, %rsi
- orq %r8, %rsi
- orq %rcx, %rsi
+ movq %rdi, %rsi
+ andq $-VEC_SIZE, %rsi
+ movaps (%rsi), %xmm1
+ pxor %xmm2, %xmm2
+ PCMPEQ %xmm1, %xmm2
+ pmovmskb %xmm2, %edx
movl %edi, %ecx
- subl %eax, %ecx
- shrq %cl, %rdx
- shrq %cl, %rsi
- testq %rdx, %rdx
- je L(loop_header2)
- leaq -1(%rdx), %rax
- xorq %rdx, %rax
- andq %rax, %rsi
- je L(exit)
- bsrq %rsi, %rax
+ andl $(VEC_SIZE - 1), %ecx
+ sarl %cl, %edx
+ jz L(cross_page_continue)
+ PCMPEQ %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ sarl %cl, %eax
+ leal -1(%rdx), %ecx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(ret1)
+ bsrl %eax, %eax
addq %rdi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+L(ret1):
ret
-END (strrchr)
+END(STRRCHR)
-weak_alias (strrchr, rindex)
-libc_hidden_builtin_def (strrchr)
+#ifndef USE_AS_WCSRCHR
+ weak_alias (STRRCHR, rindex)
+ libc_hidden_builtin_def (STRRCHR)
+#endif
diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
index 2f388537..ae3cfa7d 100644
--- a/sysdeps/x86_64/wcsrchr.S
+++ b/sysdeps/x86_64/wcsrchr.S
@@ -17,266 +17,12 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#include <sysdep.h>
- .text
-ENTRY (wcsrchr)
+#define USE_AS_WCSRCHR 1
+#define NO_PMINU 1
- movd %rsi, %xmm1
- mov %rdi, %rcx
- punpckldq %xmm1, %xmm1
- pxor %xmm2, %xmm2
- punpckldq %xmm1, %xmm1
- and $63, %rcx
- cmp $48, %rcx
- ja L(crosscache)
+#ifndef STRRCHR
+# define STRRCHR wcsrchr
+#endif
- movdqu (%rdi), %xmm0
- pcmpeqd %xmm0, %xmm2
- pcmpeqd %xmm1, %xmm0
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm0, %rax
- add $16, %rdi
-
- test %rax, %rax
- jnz L(unaligned_match1)
-
- test %rcx, %rcx
- jnz L(return_null)
-
- and $-16, %rdi
- xor %r8, %r8
- jmp L(loop)
-
- .p2align 4
-L(unaligned_match1):
- test %rcx, %rcx
- jnz L(prolog_find_zero_1)
-
- mov %rax, %r8
- mov %rdi, %rsi
- and $-16, %rdi
- jmp L(loop)
-
- .p2align 4
-L(crosscache):
- and $15, %rcx
- and $-16, %rdi
- pxor %xmm3, %xmm3
- movdqa (%rdi), %xmm0
- pcmpeqd %xmm0, %xmm3
- pcmpeqd %xmm1, %xmm0
- pmovmskb %xmm3, %rdx
- pmovmskb %xmm0, %rax
- shr %cl, %rdx
- shr %cl, %rax
- add $16, %rdi
-
- test %rax, %rax
- jnz L(unaligned_match)
-
- test %rdx, %rdx
- jnz L(return_null)
-
- xor %r8, %r8
- jmp L(loop)
-
- .p2align 4
-L(unaligned_match):
- test %rdx, %rdx
- jnz L(prolog_find_zero)
-
- mov %rax, %r8
- lea (%rdi, %rcx), %rsi
-
-/* Loop start on aligned string. */
- .p2align 4
-L(loop):
- movdqa (%rdi), %xmm0
- pcmpeqd %xmm0, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm0
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm0, %rax
- or %rax, %rcx
- jnz L(matches)
-
- movdqa (%rdi), %xmm3
- pcmpeqd %xmm3, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm3
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm3, %rax
- or %rax, %rcx
- jnz L(matches)
-
- movdqa (%rdi), %xmm4
- pcmpeqd %xmm4, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm4
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm4, %rax
- or %rax, %rcx
- jnz L(matches)
-
- movdqa (%rdi), %xmm5
- pcmpeqd %xmm5, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm5
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm5, %rax
- or %rax, %rcx
- jz L(loop)
-
- .p2align 4
-L(matches):
- test %rax, %rax
- jnz L(match)
-L(return_value):
- test %r8, %r8
- jz L(return_null)
- mov %r8, %rax
- mov %rsi, %rdi
-
- test $15 << 4, %ah
- jnz L(match_fourth_wchar)
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(match):
- pmovmskb %xmm2, %rcx
- test %rcx, %rcx
- jnz L(find_zero)
- mov %rax, %r8
- mov %rdi, %rsi
- jmp L(loop)
-
- .p2align 4
-L(find_zero):
- test $15, %cl
- jnz L(find_zero_in_first_wchar)
- test %cl, %cl
- jnz L(find_zero_in_second_wchar)
- test $15, %ch
- jnz L(find_zero_in_third_wchar)
-
- and $1 << 13 - 1, %rax
- jz L(return_value)
-
- test $15 << 4, %ah
- jnz L(match_fourth_wchar)
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(find_zero_in_first_wchar):
- test $1, %rax
- jz L(return_value)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(find_zero_in_second_wchar):
- and $1 << 5 - 1, %rax
- jz L(return_value)
-
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(find_zero_in_third_wchar):
- and $1 << 9 - 1, %rax
- jz L(return_value)
-
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(prolog_find_zero):
- add %rcx, %rdi
- mov %rdx, %rcx
-L(prolog_find_zero_1):
- test $15, %cl
- jnz L(prolog_find_zero_in_first_wchar)
- test %cl, %cl
- jnz L(prolog_find_zero_in_second_wchar)
- test $15, %ch
- jnz L(prolog_find_zero_in_third_wchar)
-
- and $1 << 13 - 1, %rax
- jz L(return_null)
-
- test $15 << 4, %ah
- jnz L(match_fourth_wchar)
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(prolog_find_zero_in_first_wchar):
- test $1, %rax
- jz L(return_null)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(prolog_find_zero_in_second_wchar):
- and $1 << 5 - 1, %rax
- jz L(return_null)
-
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(prolog_find_zero_in_third_wchar):
- and $1 << 9 - 1, %rax
- jz L(return_null)
-
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(match_second_wchar):
- lea -12(%rdi), %rax
- ret
-
- .p2align 4
-L(match_third_wchar):
- lea -8(%rdi), %rax
- ret
-
- .p2align 4
-L(match_fourth_wchar):
- lea -4(%rdi), %rax
- ret
-
- .p2align 4
-L(return_null):
- xor %rax, %rax
- ret
-
-END (wcsrchr)
+#include "../strrchr.S"
--
GitLab
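The "SSE2 has no pminud" comments in the patch above refer to the zero-detection trick sketched here with intrinsics (a hedged illustration, not code from the patch): for byte elements, one `pminub` lets a single compare test two vectors for a NUL, while the wide-character path must compare each vector separately and OR the results.

```c
#include <emmintrin.h>   /* SSE2 intrinsics */

/* Byte case: min(a, b) has a zero byte iff a or b does (pminub).  */
static int
has_zero_byte_pair (__m128i a, __m128i b)
{
  __m128i m = _mm_min_epu8 (a, b);
  return _mm_movemask_epi8 (_mm_cmpeq_epi8 (m, _mm_setzero_si128 ())) != 0;
}

/* Dword case: no unsigned dword minimum in SSE2 (pminud is SSE4.1),
   so compare each vector against zero and OR the results.  */
static int
has_zero_dword_pair (__m128i a, __m128i b)
{
  __m128i za = _mm_cmpeq_epi32 (a, _mm_setzero_si128 ());
  __m128i zb = _mm_cmpeq_epi32 (b, _mm_setzero_si128 ());
  return _mm_movemask_epi8 (_mm_or_si128 (za, zb)) != 0;
}
```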

View file

@ -1,501 +0,0 @@
From df7e295d18ffa34f629578c0017a9881af7620f6 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Thu, 21 Apr 2022 20:52:29 -0500
Subject: [PATCH] x86: Optimize {str|wcs}rchr-avx2
Content-type: text/plain; charset=UTF-8
The new code unrolls the main loop slightly without adding too much
overhead and minimizes the comparisons for the search CHAR.
Geometric Mean of all benchmarks New / Old: 0.832
See email for all results.
Full xcheck passes on x86_64 with and without multiarch enabled.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
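A recurring idiom in this patch is `blsmskl %ecx, %ecx; andl %ecx, %eax`, which keeps only the search-CHAR matches at or before the first NUL in a vector. In C (a sketch; names are hypothetical):

```c
#include <stdint.h>

/* blsmsk(x) == x ^ (x - 1): all bits up to and including the lowest
   set bit of x.  Given a bitmask of NUL positions (zmask) and a
   bitmask of CHAR matches (cmask) for one 32-byte vector, keep only
   the matches that occur at or before the first NUL.  If zmask is 0,
   the mask is all-ones and every match is kept.  */
static uint32_t
matches_before_first_nul (uint32_t cmask, uint32_t zmask)
{
  return cmask & (zmask ^ (zmask - 1));
}
```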
---
sysdeps/x86_64/multiarch/strrchr-avx2.S | 426 +++++++++++++++---------
1 file changed, 269 insertions(+), 157 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
index c949410b..3d26fad4 100644
--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
@@ -27,9 +27,13 @@
# ifdef USE_AS_WCSRCHR
# define VPBROADCAST vpbroadcastd
# define VPCMPEQ vpcmpeqd
+# define VPMIN vpminud
+# define CHAR_SIZE 4
# else
# define VPBROADCAST vpbroadcastb
# define VPCMPEQ vpcmpeqb
+# define VPMIN vpminub
+# define CHAR_SIZE 1
# endif
# ifndef VZEROUPPER
@@ -41,196 +45,304 @@
# endif
# define VEC_SIZE 32
+# define PAGE_SIZE 4096
- .section SECTION(.text),"ax",@progbits
-ENTRY (STRRCHR)
- movd %esi, %xmm4
- movl %edi, %ecx
+ .section SECTION(.text), "ax", @progbits
+ENTRY(STRRCHR)
+ movd %esi, %xmm7
+ movl %edi, %eax
/* Broadcast CHAR to YMM4. */
- VPBROADCAST %xmm4, %ymm4
+ VPBROADCAST %xmm7, %ymm7
vpxor %xmm0, %xmm0, %xmm0
- /* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ /* Shift here instead of `andl` to save code size (saves a fetch
+ block). */
+ sall $20, %eax
+ cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax
+ ja L(cross_page)
+L(page_cross_continue):
vmovdqu (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %ecx
- vpmovmskb %ymm3, %eax
- addq $VEC_SIZE, %rdi
+ /* Check end of string match. */
+ VPCMPEQ %ymm1, %ymm0, %ymm6
+ vpmovmskb %ymm6, %ecx
+ testl %ecx, %ecx
+ jz L(aligned_more)
+
+ /* Only check match with search CHAR if needed. */
+ VPCMPEQ %ymm1, %ymm7, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Check if match before first zero. */
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jz L(ret0)
+ bsrl %eax, %eax
+ addq %rdi, %rax
+ /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
+ search CHAR is zero we are correct. Either way `andq
+ -CHAR_SIZE, %rax` gets the correct result. */
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+L(ret0):
+L(return_vzeroupper):
+ ZERO_UPPER_VEC_REGISTERS_RETURN
+
+ /* Returns for first vec x1/x2 have hard coded backward search
+ path for earlier matches. */
+ .p2align 4,, 10
+L(first_vec_x1):
+ VPCMPEQ %ymm2, %ymm7, %ymm6
+ vpmovmskb %ymm6, %eax
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jnz L(first_vec_x1_return)
+
+ .p2align 4,, 4
+L(first_vec_x0_test):
+ VPCMPEQ %ymm1, %ymm7, %ymm6
+ vpmovmskb %ymm6, %eax
+ testl %eax, %eax
+ jz L(ret1)
+ bsrl %eax, %eax
+ addq %r8, %rax
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+L(ret1):
+ VZEROUPPER_RETURN
+ .p2align 4,, 10
+L(first_vec_x0_x1_test):
+ VPCMPEQ %ymm2, %ymm7, %ymm6
+ vpmovmskb %ymm6, %eax
+ /* Check ymm2 for search CHAR match. If no match then check ymm1
+ before returning. */
testl %eax, %eax
- jnz L(first_vec)
+ jz L(first_vec_x0_test)
+ .p2align 4,, 4
+L(first_vec_x1_return):
+ bsrl %eax, %eax
+ leaq 1(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+ VZEROUPPER_RETURN
- testl %ecx, %ecx
- jnz L(return_null)
- andq $-VEC_SIZE, %rdi
- xorl %edx, %edx
- jmp L(aligned_loop)
+ .p2align 4,, 10
+L(first_vec_x2):
+ VPCMPEQ %ymm3, %ymm7, %ymm6
+ vpmovmskb %ymm6, %eax
+ blsmskl %ecx, %ecx
+ /* If no in-range search CHAR match in ymm3 then need to check
+ ymm1/ymm2 for an earlier match (we delay checking search
+ CHAR matches until needed). */
+ andl %ecx, %eax
+ jz L(first_vec_x0_x1_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE + 1)(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+ VZEROUPPER_RETURN
+
.p2align 4
-L(first_vec):
- /* Check if there is a nul CHAR. */
+L(aligned_more):
+ /* Save original pointer if match was in VEC 0. */
+ movq %rdi, %r8
+
+ /* Align src. */
+ orq $(VEC_SIZE - 1), %rdi
+ vmovdqu 1(%rdi), %ymm2
+ VPCMPEQ %ymm2, %ymm0, %ymm6
+ vpmovmskb %ymm6, %ecx
testl %ecx, %ecx
- jnz L(char_and_nul_in_first_vec)
+ jnz L(first_vec_x1)
- /* Remember the match and keep searching. */
- movl %eax, %edx
- movq %rdi, %rsi
- andq $-VEC_SIZE, %rdi
- jmp L(aligned_loop)
+ vmovdqu (VEC_SIZE + 1)(%rdi), %ymm3
+ VPCMPEQ %ymm3, %ymm0, %ymm6
+ vpmovmskb %ymm6, %ecx
+ testl %ecx, %ecx
+ jnz L(first_vec_x2)
+ /* Save pointer again before realigning. */
+ movq %rdi, %rsi
+ addq $(VEC_SIZE + 1), %rdi
+ andq $-(VEC_SIZE * 2), %rdi
.p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
- vmovdqa (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %edx
- vpmovmskb %ymm3, %eax
- shrl %cl, %edx
- shrl %cl, %eax
- addq $VEC_SIZE, %rdi
-
- /* Check if there is a CHAR. */
+L(first_aligned_loop):
+ /* Do 2x VEC at a time. Any more and the cost of finding the
+	   match outweighs the loop benefit.  */
+ vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
+ vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5
+
+ VPCMPEQ %ymm4, %ymm7, %ymm6
+ VPMIN %ymm4, %ymm5, %ymm8
+ VPCMPEQ %ymm5, %ymm7, %ymm10
+ vpor %ymm6, %ymm10, %ymm5
+ VPCMPEQ %ymm8, %ymm0, %ymm8
+ vpor %ymm5, %ymm8, %ymm9
+
+ vpmovmskb %ymm9, %eax
+ addq $(VEC_SIZE * 2), %rdi
+ /* No zero or search CHAR. */
testl %eax, %eax
- jnz L(found_char)
-
- testl %edx, %edx
- jnz L(return_null)
+ jz L(first_aligned_loop)
- jmp L(aligned_loop)
-
- .p2align 4
-L(found_char):
- testl %edx, %edx
- jnz L(char_and_nul)
+ /* If no zero CHAR then go to second loop (this allows us to
+ throw away all prior work). */
+ vpmovmskb %ymm8, %ecx
+ testl %ecx, %ecx
+ jz L(second_aligned_loop_prep)
- /* Remember the match and keep searching. */
- movl %eax, %edx
- leaq (%rdi, %rcx), %rsi
+ /* Search char could be zero so we need to get the true match.
+ */
+ vpmovmskb %ymm5, %eax
+ testl %eax, %eax
+ jnz L(first_aligned_loop_return)
- .p2align 4
-L(aligned_loop):
- vmovdqa (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- addq $VEC_SIZE, %rdi
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %ecx
- vpmovmskb %ymm3, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
-
- vmovdqa (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- add $VEC_SIZE, %rdi
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %ecx
+ .p2align 4,, 4
+L(first_vec_x1_or_x2):
+ VPCMPEQ %ymm3, %ymm7, %ymm3
+ VPCMPEQ %ymm2, %ymm7, %ymm2
vpmovmskb %ymm3, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
-
- vmovdqa (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- addq $VEC_SIZE, %rdi
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %ecx
- vpmovmskb %ymm3, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
-
- vmovdqa (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- addq $VEC_SIZE, %rdi
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %ecx
- vpmovmskb %ymm3, %eax
- orl %eax, %ecx
- jz L(aligned_loop)
-
- .p2align 4
-L(char_nor_null):
- /* Find a CHAR or a nul CHAR in a loop. */
- testl %eax, %eax
- jnz L(match)
-L(return_value):
- testl %edx, %edx
- jz L(return_null)
- movl %edx, %eax
- movq %rsi, %rdi
+ vpmovmskb %ymm2, %edx
+ /* Use add for macro-fusion. */
+ addq %rax, %rdx
+ jz L(first_vec_x0_test)
+ /* NB: We could move this shift to before the branch and save a
+ bit of code size / performance on the fall through. The
+ branch leads to the null case which generally seems hotter
+ than char in first 3x VEC. */
+ salq $32, %rax
+ addq %rdx, %rax
+ bsrq %rax, %rax
+ leaq 1(%rsi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+ VZEROUPPER_RETURN
+ .p2align 4,, 8
+L(first_aligned_loop_return):
+ VPCMPEQ %ymm4, %ymm0, %ymm4
+ vpmovmskb %ymm4, %edx
+ salq $32, %rcx
+ orq %rdx, %rcx
+
+ vpmovmskb %ymm10, %eax
+ vpmovmskb %ymm6, %edx
+ salq $32, %rax
+ orq %rdx, %rax
+ blsmskq %rcx, %rcx
+ andq %rcx, %rax
+ jz L(first_vec_x1_or_x2)
+
+ bsrq %rax, %rax
+ leaq -(VEC_SIZE * 2)(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
- /* Keep the first bit for each matching CHAR for bsr. */
- andl $0x11111111, %eax
+ andq $-CHAR_SIZE, %rax
# endif
- bsrl %eax, %eax
- leaq -VEC_SIZE(%rdi, %rax), %rax
-L(return_vzeroupper):
- ZERO_UPPER_VEC_REGISTERS_RETURN
+ VZEROUPPER_RETURN
+ /* Search char cannot be zero. */
.p2align 4
-L(match):
- /* Find a CHAR. Check if there is a nul CHAR. */
- vpmovmskb %ymm2, %ecx
- testl %ecx, %ecx
- jnz L(find_nul)
-
- /* Remember the match and keep searching. */
- movl %eax, %edx
+L(second_aligned_loop_set_furthest_match):
+ /* Save VEC and pointer from most recent match. */
+L(second_aligned_loop_prep):
movq %rdi, %rsi
- jmp L(aligned_loop)
+ vmovdqu %ymm6, %ymm2
+ vmovdqu %ymm10, %ymm3
.p2align 4
-L(find_nul):
-# ifdef USE_AS_WCSRCHR
- /* Keep the first bit for each matching CHAR for bsr. */
- andl $0x11111111, %ecx
- andl $0x11111111, %eax
-# endif
- /* Mask out any matching bits after the nul CHAR. */
- movl %ecx, %r8d
- subl $1, %r8d
- xorl %ecx, %r8d
- andl %r8d, %eax
+L(second_aligned_loop):
+	/* Search 2x at a time.  */
+ vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
+ vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5
+
+ VPCMPEQ %ymm4, %ymm7, %ymm6
+ VPMIN %ymm4, %ymm5, %ymm1
+ VPCMPEQ %ymm5, %ymm7, %ymm10
+ vpor %ymm6, %ymm10, %ymm5
+ VPCMPEQ %ymm1, %ymm0, %ymm1
+ vpor %ymm5, %ymm1, %ymm9
+
+ vpmovmskb %ymm9, %eax
+ addq $(VEC_SIZE * 2), %rdi
testl %eax, %eax
- /* If there is no CHAR here, return the remembered one. */
- jz L(return_value)
- bsrl %eax, %eax
- leaq -VEC_SIZE(%rdi, %rax), %rax
- VZEROUPPER_RETURN
-
- .p2align 4
-L(char_and_nul):
- /* Find both a CHAR and a nul CHAR. */
- addq %rcx, %rdi
- movl %edx, %ecx
-L(char_and_nul_in_first_vec):
-# ifdef USE_AS_WCSRCHR
- /* Keep the first bit for each matching CHAR for bsr. */
- andl $0x11111111, %ecx
- andl $0x11111111, %eax
-# endif
- /* Mask out any matching bits after the nul CHAR. */
- movl %ecx, %r8d
- subl $1, %r8d
- xorl %ecx, %r8d
- andl %r8d, %eax
+ jz L(second_aligned_loop)
+ vpmovmskb %ymm1, %ecx
+ testl %ecx, %ecx
+ jz L(second_aligned_loop_set_furthest_match)
+ vpmovmskb %ymm5, %eax
testl %eax, %eax
- /* Return null pointer if the nul CHAR comes first. */
- jz L(return_null)
- bsrl %eax, %eax
- leaq -VEC_SIZE(%rdi, %rax), %rax
+ jnz L(return_new_match)
+
+	/* This is the hot path.  We know CHAR is in bounds and that
+	   ymm3/ymm2 have the latest match.  */
+ .p2align 4,, 4
+L(return_old_match):
+ vpmovmskb %ymm3, %eax
+ vpmovmskb %ymm2, %edx
+ salq $32, %rax
+ orq %rdx, %rax
+ bsrq %rax, %rax
+ /* Search char cannot be zero so safe to just use lea for
+ wcsrchr. */
+ leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax
VZEROUPPER_RETURN
- .p2align 4
-L(return_null):
- xorl %eax, %eax
+ /* Last iteration also potentially has a match. */
+ .p2align 4,, 8
+L(return_new_match):
+ VPCMPEQ %ymm4, %ymm0, %ymm4
+ vpmovmskb %ymm4, %edx
+ salq $32, %rcx
+ orq %rdx, %rcx
+
+ vpmovmskb %ymm10, %eax
+ vpmovmskb %ymm6, %edx
+ salq $32, %rax
+ orq %rdx, %rax
+ blsmskq %rcx, %rcx
+ andq %rcx, %rax
+ jz L(return_old_match)
+ bsrq %rax, %rax
+ /* Search char cannot be zero so safe to just use lea for
+ wcsrchr. */
+ leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax
VZEROUPPER_RETURN
-END (STRRCHR)
+ .p2align 4,, 4
+L(cross_page):
+ movq %rdi, %rsi
+ andq $-VEC_SIZE, %rsi
+ vmovdqu (%rsi), %ymm1
+ VPCMPEQ %ymm1, %ymm0, %ymm6
+ vpmovmskb %ymm6, %ecx
+	/* Shift out zero CHAR matches that are before the beginning of
+ src (rdi). */
+ shrxl %edi, %ecx, %ecx
+ testl %ecx, %ecx
+ jz L(page_cross_continue)
+ VPCMPEQ %ymm1, %ymm7, %ymm1
+ vpmovmskb %ymm1, %eax
+
+	/* Shift out search CHAR matches that are before the beginning of
+ src (rdi). */
+ shrxl %edi, %eax, %eax
+ blsmskl %ecx, %ecx
+ /* Check if any search CHAR match in range. */
+ andl %ecx, %eax
+ jz L(ret2)
+ bsrl %eax, %eax
+ addq %rdi, %rax
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+L(ret2):
+ VZEROUPPER_RETURN
+END(STRRCHR)
#endif
--
GitLab
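
The control flow above is easier to follow against a scalar model. Below is a minimal portable C sketch of the strategy the unrolled loop implements: scan forward in fixed-size blocks, remember the most recent block position that held the search character, and resolve the exact answer only once the terminating NUL is found. BLOCK stands in for VEC_SIZE and the helper is a scalar stand-in for VPCMPEQ/vpmovmskb; none of this is glibc code.

    #include <stddef.h>

    #define BLOCK 32   /* stand-in for VEC_SIZE */

    /* Scalar stand-in for VPCMPEQ + vpmovmskb + bsr: last match in a block.  */
    static const char *
    last_match_in_block (const char *p, size_t n, char c)
    {
      const char *last = NULL;
      for (size_t i = 0; i < n; i++)
        if (p[i] == c)
          last = p + i;
      return last;
    }

    char *
    strrchr_sketch (const char *s, int c)
    {
      const char *saved = NULL;   /* furthest match seen so far */
      for (;;)
        {
          /* Find the terminator within this block, if any.  */
          size_t len = 0;
          while (len < BLOCK && s[len] != '\0')
            len++;
          if (len < BLOCK)
            {
              /* NUL found: the last in-range match wins (include the NUL
                 itself when searching for '\0').  */
              const char *m = last_match_in_block (s, len + (c == 0), (char) c);
              return (char *) (m != NULL ? m : saved);
            }
          /* No NUL yet: remember any match and keep going.  The assembly
             defers locating the exact position until it is needed; this
             sketch computes it eagerly for clarity.  */
          const char *m = last_match_in_block (s, BLOCK, (char) c);
          if (m != NULL)
            saved = m;
          s += BLOCK;
        }
    }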
View file
@@ -1,558 +0,0 @@
From c966099cdc3e0fdf92f63eac09b22fa7e5f5f02d Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Thu, 21 Apr 2022 20:52:30 -0500
Subject: [PATCH] x86: Optimize {str|wcs}rchr-evex
Content-type: text/plain; charset=UTF-8
The new code unrolls the main loop slightly without adding too much
overhead and minimizes the comparisons for the search CHAR.
Geometric Mean of all benchmarks New / Old: 0.755
See email for all results.
Full xcheck passes on x86_64 with and without multiarch enabled.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/strrchr-evex.S | 471 +++++++++++++++---------
1 file changed, 290 insertions(+), 181 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
index f920b5a5..f5b6d755 100644
--- a/sysdeps/x86_64/multiarch/strrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
@@ -24,242 +24,351 @@
# define STRRCHR __strrchr_evex
# endif
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
+# define VMOVU vmovdqu64
+# define VMOVA vmovdqa64
# ifdef USE_AS_WCSRCHR
+# define SHIFT_REG esi
+
+# define kunpck kunpckbw
+# define kmov_2x kmovd
+# define maskz_2x ecx
+# define maskm_2x eax
+# define CHAR_SIZE 4
+# define VPMIN vpminud
+# define VPTESTN vptestnmd
# define VPBROADCAST vpbroadcastd
-# define VPCMP vpcmpd
-# define SHIFT_REG r8d
+# define VPCMP vpcmpd
# else
+# define SHIFT_REG edi
+
+# define kunpck kunpckdq
+# define kmov_2x kmovq
+# define maskz_2x rcx
+# define maskm_2x rax
+
+# define CHAR_SIZE 1
+# define VPMIN vpminub
+# define VPTESTN vptestnmb
# define VPBROADCAST vpbroadcastb
-# define VPCMP vpcmpb
-# define SHIFT_REG ecx
+# define VPCMP vpcmpb
# endif
# define XMMZERO xmm16
# define YMMZERO ymm16
# define YMMMATCH ymm17
-# define YMM1 ymm18
+# define YMMSAVE ymm18
+
+# define YMM1 ymm19
+# define YMM2 ymm20
+# define YMM3 ymm21
+# define YMM4 ymm22
+# define YMM5 ymm23
+# define YMM6 ymm24
+# define YMM7 ymm25
+# define YMM8 ymm26
-# define VEC_SIZE 32
- .section .text.evex,"ax",@progbits
-ENTRY (STRRCHR)
- movl %edi, %ecx
+# define VEC_SIZE 32
+# define PAGE_SIZE 4096
+ .section .text.evex, "ax", @progbits
+ENTRY(STRRCHR)
+ movl %edi, %eax
/* Broadcast CHAR to YMMMATCH. */
VPBROADCAST %esi, %YMMMATCH
- vpxorq %XMMZERO, %XMMZERO, %XMMZERO
-
- /* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ jg L(cross_page_boundary)
+L(page_cross_continue):
VMOVU (%rdi), %YMM1
-
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMMMATCH, %YMM1, %k1
+ /* k0 has a 1 for each zero CHAR in YMM1. */
+ VPTESTN %YMM1, %YMM1, %k0
kmovd %k0, %ecx
- kmovd %k1, %eax
-
- addq $VEC_SIZE, %rdi
-
- testl %eax, %eax
- jnz L(first_vec)
-
testl %ecx, %ecx
- jnz L(return_null)
-
- andq $-VEC_SIZE, %rdi
- xorl %edx, %edx
- jmp L(aligned_loop)
-
- .p2align 4
-L(first_vec):
- /* Check if there is a null byte. */
- testl %ecx, %ecx
- jnz L(char_and_nul_in_first_vec)
-
- /* Remember the match and keep searching. */
- movl %eax, %edx
- movq %rdi, %rsi
- andq $-VEC_SIZE, %rdi
- jmp L(aligned_loop)
-
- .p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
+ jz L(aligned_more)
+ /* fallthrough: zero CHAR in first VEC. */
+ /* K1 has a 1 for each search CHAR match in YMM1. */
+ VPCMP $0, %YMMMATCH, %YMM1, %k1
+ kmovd %k1, %eax
+	/* Build mask up until first zero CHAR (used to mask off
+ potential search CHAR matches past the end of the string).
+ */
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jz L(ret0)
+ /* Get last match (the `andl` removed any out of bounds
+ matches). */
+ bsrl %eax, %eax
# ifdef USE_AS_WCSRCHR
- /* NB: Divide shift count by 4 since each bit in K1 represent 4
- bytes. */
- movl %ecx, %SHIFT_REG
- sarl $2, %SHIFT_REG
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+# else
+ addq %rdi, %rax
# endif
+L(ret0):
+ ret
- VMOVA (%rdi), %YMM1
-
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
+ /* Returns for first vec x1/x2/x3 have hard coded backward
+ search path for earlier matches. */
+ .p2align 4,, 6
+L(first_vec_x1):
+ VPCMP $0, %YMMMATCH, %YMM2, %k1
+ kmovd %k1, %eax
+ blsmskl %ecx, %ecx
+ /* eax non-zero if search CHAR in range. */
+ andl %ecx, %eax
+ jnz L(first_vec_x1_return)
+
+ /* fallthrough: no match in YMM2 then need to check for earlier
+ matches (in YMM1). */
+ .p2align 4,, 4
+L(first_vec_x0_test):
VPCMP $0, %YMMMATCH, %YMM1, %k1
- kmovd %k0, %edx
kmovd %k1, %eax
-
- shrxl %SHIFT_REG, %edx, %edx
- shrxl %SHIFT_REG, %eax, %eax
- addq $VEC_SIZE, %rdi
-
- /* Check if there is a CHAR. */
testl %eax, %eax
- jnz L(found_char)
-
- testl %edx, %edx
- jnz L(return_null)
-
- jmp L(aligned_loop)
-
- .p2align 4
-L(found_char):
- testl %edx, %edx
- jnz L(char_and_nul)
-
- /* Remember the match and keep searching. */
- movl %eax, %edx
- leaq (%rdi, %rcx), %rsi
+ jz L(ret1)
+ bsrl %eax, %eax
+# ifdef USE_AS_WCSRCHR
+ leaq (%rsi, %rax, CHAR_SIZE), %rax
+# else
+ addq %rsi, %rax
+# endif
+L(ret1):
+ ret
- .p2align 4
-L(aligned_loop):
- VMOVA (%rdi), %YMM1
- addq $VEC_SIZE, %rdi
+ .p2align 4,, 10
+L(first_vec_x1_or_x2):
+ VPCMP $0, %YMM3, %YMMMATCH, %k3
+ VPCMP $0, %YMM2, %YMMMATCH, %k2
+	/* K2 and K3 have 1 for any search CHAR match.  Test whether
+	   either of them has any matches.  Otherwise check YMM1.  */
+ kortestd %k2, %k3
+ jz L(first_vec_x0_test)
+
+	/* Guaranteed that YMM2 and YMM3 are within range so merge the
+ two bitmasks then get last result. */
+ kunpck %k2, %k3, %k3
+ kmovq %k3, %rax
+ bsrq %rax, %rax
+ leaq (VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
+ ret
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMMMATCH, %YMM1, %k1
- kmovd %k0, %ecx
+ .p2align 4,, 6
+L(first_vec_x3):
+ VPCMP $0, %YMMMATCH, %YMM4, %k1
kmovd %k1, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
+ blsmskl %ecx, %ecx
+ /* If no search CHAR match in range check YMM1/YMM2/YMM3. */
+ andl %ecx, %eax
+ jz L(first_vec_x1_or_x2)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
- VMOVA (%rdi), %YMM1
- add $VEC_SIZE, %rdi
+ .p2align 4,, 6
+L(first_vec_x0_x1_test):
+ VPCMP $0, %YMMMATCH, %YMM2, %k1
+ kmovd %k1, %eax
+ /* Check YMM2 for last match first. If no match try YMM1. */
+ testl %eax, %eax
+ jz L(first_vec_x0_test)
+ .p2align 4,, 4
+L(first_vec_x1_return):
+ bsrl %eax, %eax
+ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMMMATCH, %YMM1, %k1
- kmovd %k0, %ecx
+ .p2align 4,, 10
+L(first_vec_x2):
+ VPCMP $0, %YMMMATCH, %YMM3, %k1
kmovd %k1, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
+ blsmskl %ecx, %ecx
+ /* Check YMM3 for last match first. If no match try YMM2/YMM1.
+ */
+ andl %ecx, %eax
+ jz L(first_vec_x0_x1_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
- VMOVA (%rdi), %YMM1
- addq $VEC_SIZE, %rdi
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMMMATCH, %YMM1, %k1
+ .p2align 4
+L(aligned_more):
+	/* Need to keep original pointer in case YMM1 has the last match.  */
+ movq %rdi, %rsi
+ andq $-VEC_SIZE, %rdi
+ VMOVU VEC_SIZE(%rdi), %YMM2
+ VPTESTN %YMM2, %YMM2, %k0
kmovd %k0, %ecx
- kmovd %k1, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
+ testl %ecx, %ecx
+ jnz L(first_vec_x1)
- VMOVA (%rdi), %YMM1
- addq $VEC_SIZE, %rdi
+ VMOVU (VEC_SIZE * 2)(%rdi), %YMM3
+ VPTESTN %YMM3, %YMM3, %k0
+ kmovd %k0, %ecx
+ testl %ecx, %ecx
+ jnz L(first_vec_x2)
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMMMATCH, %YMM1, %k1
+ VMOVU (VEC_SIZE * 3)(%rdi), %YMM4
+ VPTESTN %YMM4, %YMM4, %k0
kmovd %k0, %ecx
- kmovd %k1, %eax
- orl %eax, %ecx
- jz L(aligned_loop)
+ movq %rdi, %r8
+ testl %ecx, %ecx
+ jnz L(first_vec_x3)
+ andq $-(VEC_SIZE * 2), %rdi
.p2align 4
-L(char_nor_null):
- /* Find a CHAR or a null byte in a loop. */
+L(first_aligned_loop):
+	/* Preserve YMM1, YMM2, YMM3, and YMM4 until we can guarantee
+ they don't store a match. */
+ VMOVA (VEC_SIZE * 4)(%rdi), %YMM5
+ VMOVA (VEC_SIZE * 5)(%rdi), %YMM6
+
+ VPCMP $0, %YMM5, %YMMMATCH, %k2
+ vpxord %YMM6, %YMMMATCH, %YMM7
+
+ VPMIN %YMM5, %YMM6, %YMM8
+ VPMIN %YMM8, %YMM7, %YMM7
+
+ VPTESTN %YMM7, %YMM7, %k1
+ subq $(VEC_SIZE * -2), %rdi
+ kortestd %k1, %k2
+ jz L(first_aligned_loop)
+
+ VPCMP $0, %YMM6, %YMMMATCH, %k3
+ VPTESTN %YMM8, %YMM8, %k1
+ ktestd %k1, %k1
+ jz L(second_aligned_loop_prep)
+
+ kortestd %k2, %k3
+ jnz L(return_first_aligned_loop)
+
+ .p2align 4,, 6
+L(first_vec_x1_or_x2_or_x3):
+ VPCMP $0, %YMM4, %YMMMATCH, %k4
+ kmovd %k4, %eax
testl %eax, %eax
- jnz L(match)
-L(return_value):
- testl %edx, %edx
- jz L(return_null)
- movl %edx, %eax
- movq %rsi, %rdi
+ jz L(first_vec_x1_or_x2)
bsrl %eax, %eax
-# ifdef USE_AS_WCSRCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq -VEC_SIZE(%rdi, %rax, 4), %rax
-# else
- leaq -VEC_SIZE(%rdi, %rax), %rax
-# endif
+ leaq (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
ret
- .p2align 4
-L(match):
- /* Find a CHAR. Check if there is a null byte. */
- kmovd %k0, %ecx
- testl %ecx, %ecx
- jnz L(find_nul)
+ .p2align 4,, 8
+L(return_first_aligned_loop):
+ VPTESTN %YMM5, %YMM5, %k0
+ kunpck %k0, %k1, %k0
+ kmov_2x %k0, %maskz_2x
+
+ blsmsk %maskz_2x, %maskz_2x
+ kunpck %k2, %k3, %k3
+ kmov_2x %k3, %maskm_2x
+ and %maskz_2x, %maskm_2x
+ jz L(first_vec_x1_or_x2_or_x3)
- /* Remember the match and keep searching. */
- movl %eax, %edx
+ bsr %maskm_2x, %maskm_2x
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+
+ .p2align 4
+ /* We can throw away the work done for the first 4x checks here
+	   as we have a later match.  This is the 'fast' path, so to speak.
+ */
+L(second_aligned_loop_prep):
+L(second_aligned_loop_set_furthest_match):
movq %rdi, %rsi
- jmp L(aligned_loop)
+ kunpck %k2, %k3, %k4
.p2align 4
-L(find_nul):
- /* Mask out any matching bits after the null byte. */
- movl %ecx, %r8d
- subl $1, %r8d
- xorl %ecx, %r8d
- andl %r8d, %eax
- testl %eax, %eax
- /* If there is no CHAR here, return the remembered one. */
- jz L(return_value)
- bsrl %eax, %eax
+L(second_aligned_loop):
+ VMOVU (VEC_SIZE * 4)(%rdi), %YMM1
+ VMOVU (VEC_SIZE * 5)(%rdi), %YMM2
+
+ VPCMP $0, %YMM1, %YMMMATCH, %k2
+ vpxord %YMM2, %YMMMATCH, %YMM3
+
+ VPMIN %YMM1, %YMM2, %YMM4
+ VPMIN %YMM3, %YMM4, %YMM3
+
+ VPTESTN %YMM3, %YMM3, %k1
+ subq $(VEC_SIZE * -2), %rdi
+ kortestd %k1, %k2
+ jz L(second_aligned_loop)
+
+ VPCMP $0, %YMM2, %YMMMATCH, %k3
+ VPTESTN %YMM4, %YMM4, %k1
+ ktestd %k1, %k1
+ jz L(second_aligned_loop_set_furthest_match)
+
+ kortestd %k2, %k3
+	/* Branch here because there is a significant advantage in terms
+	   of output dependency chains in using edx.  */
+ jnz L(return_new_match)
+L(return_old_match):
+ kmovq %k4, %rax
+ bsrq %rax, %rax
+ leaq (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
+ ret
+
+L(return_new_match):
+ VPTESTN %YMM1, %YMM1, %k0
+ kunpck %k0, %k1, %k0
+ kmov_2x %k0, %maskz_2x
+
+ blsmsk %maskz_2x, %maskz_2x
+ kunpck %k2, %k3, %k3
+ kmov_2x %k3, %maskm_2x
+ and %maskz_2x, %maskm_2x
+ jz L(return_old_match)
+
+ bsr %maskm_2x, %maskm_2x
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+
+L(cross_page_boundary):
+	/* eax contains all the page offset bits of src (rdi).  `xor rdi,
+	   rax` sets the pointer with all page offset bits cleared, so an
+	   offset of (PAGE_SIZE - VEC_SIZE) gets the last aligned VEC
+	   before the page cross (guaranteed to be safe to read).  Doing
+	   this as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax`
+	   saves a bit of code size.  */
+ xorq %rdi, %rax
+ VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
+ VPTESTN %YMM1, %YMM1, %k0
+ kmovd %k0, %ecx
+
+	/* Shift out zero CHAR matches that are before the beginning of
+ src (rdi). */
# ifdef USE_AS_WCSRCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq -VEC_SIZE(%rdi, %rax, 4), %rax
-# else
- leaq -VEC_SIZE(%rdi, %rax), %rax
+ movl %edi, %esi
+ andl $(VEC_SIZE - 1), %esi
+ shrl $2, %esi
# endif
- ret
+ shrxl %SHIFT_REG, %ecx, %ecx
- .p2align 4
-L(char_and_nul):
- /* Find both a CHAR and a null byte. */
- addq %rcx, %rdi
- movl %edx, %ecx
-L(char_and_nul_in_first_vec):
- /* Mask out any matching bits after the null byte. */
- movl %ecx, %r8d
- subl $1, %r8d
- xorl %ecx, %r8d
- andl %r8d, %eax
- testl %eax, %eax
- /* Return null pointer if the null byte comes first. */
- jz L(return_null)
+ testl %ecx, %ecx
+ jz L(page_cross_continue)
+
+ /* Found zero CHAR so need to test for search CHAR. */
+ VPCMP $0, %YMMMATCH, %YMM1, %k1
+ kmovd %k1, %eax
+	/* Shift out search CHAR matches that are before the beginning of
+ src (rdi). */
+ shrxl %SHIFT_REG, %eax, %eax
+
+ /* Check if any search CHAR match in range. */
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jz L(ret3)
bsrl %eax, %eax
# ifdef USE_AS_WCSRCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq -VEC_SIZE(%rdi, %rax, 4), %rax
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
- leaq -VEC_SIZE(%rdi, %rax), %rax
+ addq %rdi, %rax
# endif
+L(ret3):
ret
- .p2align 4
-L(return_null):
- xorl %eax, %eax
- ret
-
-END (STRRCHR)
+END(STRRCHR)
#endif
--
GitLab
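
Both the AVX2 and EVEX versions rely on the same bit trick: blsmsk turns the zero-CHAR bitmask into a mask covering every position up to and including the first NUL, and ANDing that with the match bitmask discards matches that lie past the end of the string. A small scalar illustration, with made-up mask values:

    #include <stdint.h>
    #include <stdio.h>

    /* blsmsk (x) == x ^ (x - 1): sets every bit up to and including the
       lowest set bit of x (all bits if x == 0).  */
    static uint32_t
    blsmsk (uint32_t x)
    {
      return x ^ (x - 1);
    }

    int
    main (void)
    {
      uint32_t zero_mask  = 0x00000100;   /* NUL at byte 8 (example) */
      uint32_t match_mask = 0x00010011;   /* matches at bytes 0, 4, 16 */

      uint32_t in_range = match_mask & blsmsk (zero_mask);
      printf ("in-range matches: %#x\n", in_range);   /* 0x11: byte 16 dropped */
      return 0;
    }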
View file
@@ -1,73 +0,0 @@
From 911c63a51c690dd1a97dfc587097277029baf00f Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed, 27 Apr 2022 15:13:02 -0500
Subject: [PATCH] sysdeps: Add 'get_fast_jitter' interface in fast-jitter.h
Content-type: text/plain; charset=UTF-8
'get_fast_jitter' is meant to be used purely for performance
purposes. In all cases where it's used, it should be acceptable to get no
randomness (see default case). An example use case is in setting
jitter for retries between threads at a lock. There is a
performance benefit to having jitter, but only if the jitter can
be generated very quickly and ultimately there is no serious issue
if no jitter is generated.
The implementation generally uses 'HP_TIMING_NOW' iff it is
inlined (avoid any potential syscall paths).
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/generic/fast-jitter.h | 42 +++++++++++++++++++++++++++++++++++
1 file changed, 42 insertions(+)
create mode 100644 sysdeps/generic/fast-jitter.h
diff --git a/sysdeps/generic/fast-jitter.h b/sysdeps/generic/fast-jitter.h
new file mode 100644
index 00000000..4dd53e34
--- /dev/null
+++ b/sysdeps/generic/fast-jitter.h
@@ -0,0 +1,42 @@
+/* Fallback for fast jitter: just return 0.
+ Copyright (C) 2019-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _FAST_JITTER_H
+# define _FAST_JITTER_H
+
+# include <stdint.h>
+# include <hp-timing.h>
+
+/* Baseline: just return 0.  We could create jitter using a clock or
+ 'random_bits' but that may imply a syscall and the goal of
+ 'get_fast_jitter' is minimal overhead "randomness" when such
+   randomness helps performance.  Adding high overhead to the function
+ defeats the purpose. */
+static inline uint32_t
+get_fast_jitter (void)
+{
+# if HP_TIMING_INLINE
+ hp_timing_t jitter;
+ HP_TIMING_NOW (jitter);
+ return (uint32_t) jitter;
+# else
+ return 0;
+# endif
+}
+
+#endif
--
GitLab
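
As a rough usage sketch, the helper slots into a spin-retry loop like the one below; the lock here is a stand-in written with C11 atomics, not a glibc API, and only get_fast_jitter comes from the patch.

    #include <stdatomic.h>
    #include <stdint.h>

    extern uint32_t get_fast_jitter (void);   /* from <fast-jitter.h> */

    static void
    spin_lock_with_jitter (atomic_int *lock)
    {
      unsigned int jitter = get_fast_jitter ();
      int expected = 0;
      while (!atomic_compare_exchange_weak (lock, &expected, 1))
        {
          expected = 0;
          /* Spin a slightly different number of iterations per thread so
             the retries do not hammer the cache line in lockstep.  Zero
             jitter (the generic fallback) is still correct, just
             unstaggered.  */
          for (unsigned int i = 0; i < 16 + (jitter & 15); i++)
            __asm__ volatile ("");   /* stand-in for a pause/spin nop */
        }
    }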
View file
@@ -1,226 +0,0 @@
From 8162147872491bb5b48e91543b19c49a29ae6b6d Mon Sep 17 00:00:00 2001
From: Wangyang Guo <wangyang.guo@intel.com>
Date: Fri, 6 May 2022 01:50:10 +0000
Subject: [PATCH] nptl: Add backoff mechanism to spinlock loop
Content-type: text/plain; charset=UTF-8
When multiple threads are waiting for a lock at the same time, once the
lock owner releases it, the waiters all see the lock as available and try
to acquire it, which may cause an expensive CAS storm.
Binary exponential backoff with random jitter is introduced. As try-lock
attempts increase, it is more likely that a larger number of threads are
competing for the adaptive mutex lock, so the wait time increases
exponentially. Random jitter is also added to avoid synchronized
try-locks from other threads.
v2: Remove read-check before try-lock for performance.
v3:
1. Restore read-check since it works well on some platforms.
2. Make backoff arch dependent, and enable it for x86_64.
3. Limit max backoff to reduce latency in large critical sections.
v4: Fix strict-prototypes error in sysdeps/nptl/pthread_mutex_backoff.h
v5: Commit log updated for the regression in large critical sections.
Result of pthread-mutex-locks bench
Test Platform: Xeon 8280L (2 socket, 112 CPUs in total)
First Row: thread number
First Col: critical section length
Values: backoff vs upstream, time based, low is better
non-critical-length: 1
1 2 4 8 16 32 64 112 140
0 0.99 0.58 0.52 0.49 0.43 0.44 0.46 0.52 0.54
1 0.98 0.43 0.56 0.50 0.44 0.45 0.50 0.56 0.57
2 0.99 0.41 0.57 0.51 0.45 0.47 0.48 0.60 0.61
4 0.99 0.45 0.59 0.53 0.48 0.49 0.52 0.64 0.65
8 1.00 0.66 0.71 0.63 0.56 0.59 0.66 0.72 0.71
16 0.97 0.78 0.91 0.73 0.67 0.70 0.79 0.80 0.80
32 0.95 1.17 0.98 0.87 0.82 0.86 0.89 0.90 0.90
64 0.96 0.95 1.01 1.01 0.98 1.00 1.03 0.99 0.99
128 0.99 1.01 1.01 1.17 1.08 1.12 1.02 0.97 1.02
non-critical-length: 32
1 2 4 8 16 32 64 112 140
0 1.03 0.97 0.75 0.65 0.58 0.58 0.56 0.70 0.70
1 0.94 0.95 0.76 0.65 0.58 0.58 0.61 0.71 0.72
2 0.97 0.96 0.77 0.66 0.58 0.59 0.62 0.74 0.74
4 0.99 0.96 0.78 0.66 0.60 0.61 0.66 0.76 0.77
8 0.99 0.99 0.84 0.70 0.64 0.66 0.71 0.80 0.80
16 0.98 0.97 0.95 0.76 0.70 0.73 0.81 0.85 0.84
32 1.04 1.12 1.04 0.89 0.82 0.86 0.93 0.91 0.91
64 0.99 1.15 1.07 1.00 0.99 1.01 1.05 0.99 0.99
128 1.00 1.21 1.20 1.22 1.25 1.31 1.12 1.10 0.99
non-critical-length: 128
1 2 4 8 16 32 64 112 140
0 1.02 1.00 0.99 0.67 0.61 0.61 0.61 0.74 0.73
1 0.95 0.99 1.00 0.68 0.61 0.60 0.60 0.74 0.74
2 1.00 1.04 1.00 0.68 0.59 0.61 0.65 0.76 0.76
4 1.00 0.96 0.98 0.70 0.63 0.63 0.67 0.78 0.77
8 1.01 1.02 0.89 0.73 0.65 0.67 0.71 0.81 0.80
16 0.99 0.96 0.96 0.79 0.71 0.73 0.80 0.84 0.84
32 0.99 0.95 1.05 0.89 0.84 0.85 0.94 0.92 0.91
64 1.00 0.99 1.16 1.04 1.00 1.02 1.06 0.99 0.99
128 1.00 1.06 0.98 1.14 1.39 1.26 1.08 1.02 0.98
There is a regression for large critical sections, but the adaptive mutex
is aimed at "quick" locks: small critical sections are more common when
users choose the adaptive pthread_mutex.
Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Conflicts:
pthreadP.h
(had been moved)
nptl/pthread_mutex_lock.c
(max_adaptive_count renamed)
---
nptl/pthreadP.h | 1 +
nptl/pthread_mutex_lock.c | 16 +++++++--
sysdeps/nptl/pthread_mutex_backoff.h | 35 ++++++++++++++++++
sysdeps/x86_64/nptl/pthread_mutex_backoff.h | 39 +++++++++++++++++++++
4 files changed, 89 insertions(+), 2 deletions(-)
create mode 100644 sysdeps/nptl/pthread_mutex_backoff.h
create mode 100644 sysdeps/x86_64/nptl/pthread_mutex_backoff.h
diff --git a/nptl/pthreadP.h b/nptl/pthreadP.h
index 7ddc166c..1550e3b6 100644
--- a/nptl/pthreadP.h
+++ b/nptl/pthreadP.h
@@ -33,6 +33,7 @@
#include <kernel-features.h>
#include <errno.h>
#include <internal-signals.h>
+#include <pthread_mutex_backoff.h>
/* Atomic operations on TLS memory. */
diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
index d96a9933..c7770fc9 100644
--- a/nptl/pthread_mutex_lock.c
+++ b/nptl/pthread_mutex_lock.c
@@ -133,14 +133,26 @@ __pthread_mutex_lock (pthread_mutex_t *mutex)
int cnt = 0;
int max_cnt = MIN (MAX_ADAPTIVE_COUNT,
mutex->__data.__spins * 2 + 10);
+ int spin_count, exp_backoff = 1;
+ unsigned int jitter = get_jitter ();
do
{
- if (cnt++ >= max_cnt)
+ /* In each loop, spin count is exponential backoff plus
+ random jitter, random range is [0, exp_backoff-1]. */
+ spin_count = exp_backoff + (jitter & (exp_backoff - 1));
+ cnt += spin_count;
+ if (cnt >= max_cnt)
{
+ /* If cnt exceeds max spin count, just go to wait
+ queue. */
LLL_MUTEX_LOCK (mutex);
break;
}
- atomic_spin_nop ();
+ do
+ atomic_spin_nop ();
+ while (--spin_count > 0);
+ /* Prepare for next loop. */
+ exp_backoff = get_next_backoff (exp_backoff);
}
while (LLL_MUTEX_READ_LOCK (mutex) != 0
|| LLL_MUTEX_TRYLOCK (mutex) != 0);
diff --git a/sysdeps/nptl/pthread_mutex_backoff.h b/sysdeps/nptl/pthread_mutex_backoff.h
new file mode 100644
index 00000000..5b26c22a
--- /dev/null
+++ b/sysdeps/nptl/pthread_mutex_backoff.h
@@ -0,0 +1,35 @@
+/* Pthread mutex backoff configuration.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+#ifndef _PTHREAD_MUTEX_BACKOFF_H
+#define _PTHREAD_MUTEX_BACKOFF_H 1
+
+static inline unsigned int
+get_jitter (void)
+{
+  /* Arch-dependent random jitter; returning 0 disables randomness.  */
+ return 0;
+}
+
+static inline int
+get_next_backoff (int backoff)
+{
+  /* Next backoff; returning 1 disables mutex backoff.  */
+ return 1;
+}
+
+#endif
diff --git a/sysdeps/x86_64/nptl/pthread_mutex_backoff.h b/sysdeps/x86_64/nptl/pthread_mutex_backoff.h
new file mode 100644
index 00000000..ec74c3d9
--- /dev/null
+++ b/sysdeps/x86_64/nptl/pthread_mutex_backoff.h
@@ -0,0 +1,39 @@
+/* Pthread mutex backoff configuration.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+#ifndef _PTHREAD_MUTEX_BACKOFF_H
+#define _PTHREAD_MUTEX_BACKOFF_H 1
+
+#include <fast-jitter.h>
+
+static inline unsigned int
+get_jitter (void)
+{
+ return get_fast_jitter ();
+}
+
+#define MAX_BACKOFF 16
+
+static inline int
+get_next_backoff (int backoff)
+{
+  /* Binary exponential backoff.  Limiting the max backoff
+     can reduce latency in large critical sections.  */
+ return (backoff < MAX_BACKOFF) ? backoff << 1 : backoff;
+}
+
+#endif
--
GitLab
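
To see how the two hooks combine, the standalone sketch below prints the spin budget per retry round. get_next_backoff mirrors the x86_64 header above; the fixed jitter value and the round count are invented for the demo.

    #include <stdio.h>

    #define MAX_BACKOFF 16

    /* Mirrors the x86_64 header: binary exponential growth, capped so a
       long critical section does not push waiters into huge waits.  */
    static int
    get_next_backoff (int backoff)
    {
      return (backoff < MAX_BACKOFF) ? backoff << 1 : backoff;
    }

    int
    main (void)
    {
      unsigned int jitter = 5;   /* stand-in for get_fast_jitter () */
      int exp_backoff = 1, cnt = 0;

      /* Per-round spin count = backoff + jitter in [0, backoff - 1].  */
      for (int round = 0; round < 8; round++)
        {
          int spin_count = exp_backoff + (int) (jitter & (exp_backoff - 1));
          cnt += spin_count;
          printf ("round %d: spin %2d (total %3d)\n", round, spin_count, cnt);
          exp_backoff = get_next_backoff (exp_backoff);
        }
      return 0;
    }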
View file
@@ -1,55 +0,0 @@
From c6272098323153db373f2986c67786ea8c85f1cf Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Tue, 15 Feb 2022 08:18:15 -0600
Subject: [PATCH] x86: Fallback {str|wcs}cmp RTM in the ncmp overflow case [BZ
#28896]
Content-type: text/plain; charset=UTF-8
In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would
call strcmp-avx2 and wcscmp-avx2 respectively. This would not
have checks around vzeroupper and would trigger spurious
aborts. This commit fixes that.
test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on
AVX2 machines with and without RTM.
Co-authored-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/strcmp-avx2.S | 8 ++------
1 file changed, 2 insertions(+), 6 deletions(-)
Conflicts:
sysdeps/x86_64/multiarch/strcmp-avx2.S
(split into two patches due to upstream bug differences)
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index 28cc98b6..e267c6cb 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -345,10 +345,10 @@ L(one_or_less):
movq %LOCALE_REG, %rdx
# endif
jb L(ret_zero)
-# ifdef USE_AS_WCSCMP
/* 'nbe' covers the case where length is negative (large
unsigned). */
- jnbe __wcscmp_avx2
+ jnbe OVERFLOW_STRCMP
+# ifdef USE_AS_WCSCMP
movl (%rdi), %edx
xorl %eax, %eax
cmpl (%rsi), %edx
@@ -357,10 +357,6 @@ L(one_or_less):
negl %eax
orl $1, %eax
# else
- /* 'nbe' covers the case where length is negative (large
- unsigned). */
-
- jnbe __strcmp_avx2
movzbl (%rdi), %eax
movzbl (%rsi), %ecx
TOLOWER_gpr (%rax, %eax)
--
GitLab
View file
@@ -1,60 +0,0 @@
From 259a17cc98058d2576511201f85d28cb5d9de2a2 Mon Sep 17 00:00:00 2001
From: Stefan Liebler <stli@linux.ibm.com>
Date: Mon, 28 Jun 2021 13:01:07 +0200
Subject: s390x: Update math: redirect roundeven function
After recent commit
447954a206837b5f153869cfeeeab44631c3fac9
"math: redirect roundeven function", building on
s390x fails with:
Error: symbol `__roundevenl' is already defined
Similar to the aarch64/riscv fix, this patch redirects the
target-specific functions for s390x:
commit 3213ed770cbc5821920d16caa93c85e92dd7b9f6
"Update math: redirect roundeven function"
diff --git a/sysdeps/s390/fpu/s_roundeven.c b/sysdeps/s390/fpu/s_roundeven.c
index 40b07e054b..0773adfed0 100644
--- a/sysdeps/s390/fpu/s_roundeven.c
+++ b/sysdeps/s390/fpu/s_roundeven.c
@@ -18,6 +18,7 @@
<https://www.gnu.org/licenses/>. */
#ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT
+# define NO_MATH_REDIRECT
# include <math.h>
# include <libm-alias-double.h>
@@ -31,7 +32,6 @@ __roundeven (double x)
__asm__ ("fidbra %0,4,%1,4" : "=f" (y) : "f" (x));
return y;
}
-hidden_def (__roundeven)
libm_alias_double (__roundeven, roundeven)
#else
diff --git a/sysdeps/s390/fpu/s_roundevenf.c b/sysdeps/s390/fpu/s_roundevenf.c
index d2fbf3d2b6..289785bc4a 100644
--- a/sysdeps/s390/fpu/s_roundevenf.c
+++ b/sysdeps/s390/fpu/s_roundevenf.c
@@ -18,6 +18,7 @@
<https://www.gnu.org/licenses/>. */
#ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT
+# define NO_MATH_REDIRECT
# include <math.h>
# include <libm-alias-float.h>
diff --git a/sysdeps/s390/fpu/s_roundevenl.c b/sysdeps/s390/fpu/s_roundevenl.c
index 29ab7a8616..94b6459ab4 100644
--- a/sysdeps/s390/fpu/s_roundevenl.c
+++ b/sysdeps/s390/fpu/s_roundevenl.c
@@ -18,6 +18,7 @@
<https://www.gnu.org/licenses/>. */
#ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT
+# define NO_MATH_REDIRECT
# include <math.h>
# include <math_private.h>
# include <libm-alias-ldouble.h>
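
For reference, roundeven rounds to the nearest integer with ties going to the even neighbor, which matches the fidbra rounding mode used above. Under the default FE_TONEAREST mode, C99 rint shows the same tie handling, so a quick illustration looks like this (compile with -lm; not from the patch):

    #include <fenv.h>
    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      fesetround (FE_TONEAREST);
      printf ("%.1f %.1f %.1f %.1f\n",
              rint (0.5), rint (1.5), rint (2.5), rint (-2.5));
      /* Prints 0.0 2.0 2.0 -2.0: ties land on the even integer.  */
      return 0;
    }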
View file
@@ -1,74 +0,0 @@
From 1da50d4bda07f04135dca39f40e79fc9eabed1f8 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 26 Feb 2021 05:36:59 -0800
Subject: [PATCH] x86: Set Prefer_No_VZEROUPPER and add Prefer_AVX2_STRCMP
Content-type: text/plain; charset=UTF-8
1. Set Prefer_No_VZEROUPPER if RTM is usable to avoid RTM abort triggered
by VZEROUPPER inside a transactionally executing RTM region.
2. Since to compare 2 32-byte strings, 256-bit EVEX strcmp requires 2
loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp requires 1 load, 2 VPCMPEQs,
1 VPMINU and 1 VPMOVMSKB, AVX2 strcmp is faster than EVEX strcmp. Add
Prefer_AVX2_STRCMP to prefer AVX2 strcmp family functions.
---
sysdeps/x86/cpu-features.c | 20 +++++++++++++++++--
sysdeps/x86/cpu-tunables.c | 2 ++
...cpu-features-preferred_feature_index_1.def | 1 +
3 files changed, 21 insertions(+), 2 deletions(-)
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 91042505..3610ee5c 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -524,8 +524,24 @@ init_cpu_features (struct cpu_features *cpu_features)
cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER]
|= bit_arch_Prefer_No_VZEROUPPER;
else
- cpu_features->preferred[index_arch_Prefer_No_AVX512]
- |= bit_arch_Prefer_No_AVX512;
+ {
+ cpu_features->preferred[index_arch_Prefer_No_AVX512]
+ |= bit_arch_Prefer_No_AVX512;
+
+ /* Avoid RTM abort triggered by VZEROUPPER inside a
+ transactionally executing RTM region. */
+ if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+ cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER]
+ |= bit_arch_Prefer_No_VZEROUPPER;
+
+ /* Since to compare 2 32-byte strings, 256-bit EVEX strcmp
+ requires 2 loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp
+ requires 1 load, 2 VPCMPEQs, 1 VPMINU and 1 VPMOVMSKB,
+ AVX2 strcmp is faster than EVEX strcmp. */
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
+ cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP]
+ |= bit_arch_Prefer_AVX2_STRCMP;
+ }
}
/* This spells out "AuthenticAMD". */
else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
index 3173b2b9..73adbaba 100644
--- a/sysdeps/x86/cpu-tunables.c
+++ b/sysdeps/x86/cpu-tunables.c
@@ -239,6 +239,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
Fast_Copy_Backward,
disable, 18);
+ CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH
+ (n, cpu_features, Prefer_AVX2_STRCMP, AVX2, disable, 18);
}
break;
case 19:
diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
index 17a5cc42..4ca70b40 100644
--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
@@ -32,3 +32,4 @@ BIT (Prefer_ERMS)
BIT (Prefer_FSRM)
BIT (Prefer_No_AVX512)
BIT (MathVec_Prefer_No_AVX512)
+BIT (Prefer_AVX2_STRCMP)
--
GitLab
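
The preferred[index_arch_*] updates above are plain bit-array bookkeeping: each preference is one bit that later ifunc selectors test. A tiny self-contained sketch of the pattern, with invented bit names rather than glibc's:

    #include <stdio.h>

    enum
    {
      bit_prefer_no_vzeroupper = 1u << 0,   /* invented names */
      bit_prefer_avx2_strcmp   = 1u << 1
    };

    int
    main (void)
    {
      unsigned int preferred = 0;
      int rtm_usable = 1, avx2_usable = 1;   /* stand-ins for CPUID probes */

      if (rtm_usable)
        preferred |= bit_prefer_no_vzeroupper;   /* avoid RTM aborts */
      if (avx2_usable)
        preferred |= bit_prefer_avx2_strcmp;     /* AVX2 strcmp is cheaper */

      printf ("prefer AVX2 strcmp: %d\n",
              (preferred & bit_prefer_avx2_strcmp) != 0);
      return 0;
    }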
View file
@@ -1,26 +0,0 @@
From 3213ed770cbc5821920d16caa93c85e92dd7b9f6 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Wed, 23 Jun 2021 13:29:41 -0700
Subject: Update math: redirect roundeven function
Redirect target specific roundeven functions for aarch64, ldbl-128ibm
and riscv.
Conflicts:
sysdeps/aarch64/*
(not needed)
sysdeps/riscv/*
(not supported)
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c b/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c
index 6701970f4a..90eecf496b 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c
@@ -17,6 +17,7 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
+#define NO_MATH_REDIRECT
#include <math.h>
#include <math_private.h>
File diff suppressed because it is too large.
File diff suppressed because it is too large.
View file
@@ -1,242 +0,0 @@
From 63ad43566f7a25d140dc723598aeb441ad657eed Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 5 Mar 2021 06:46:08 -0800
Subject: [PATCH] x86-64: Add memmove family functions with 256-bit EVEX
Content-type: text/plain; charset=UTF-8
Update ifunc-memmove.h to select the function optimized with 256-bit EVEX
instructions using YMM16-YMM31 registers to avoid RTM abort with usable
AVX512VL since VZEROUPPER isn't needed at function exit.
---
sysdeps/x86_64/multiarch/Makefile | 1 +
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 36 +++++++++++++++++++
sysdeps/x86_64/multiarch/ifunc-memmove.h | 21 +++++++++--
.../multiarch/memmove-evex-unaligned-erms.S | 33 +++++++++++++++++
.../multiarch/memmove-vec-unaligned-erms.S | 24 ++++++++-----
5 files changed, 104 insertions(+), 11 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 46783cd1..4563fc56 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -41,6 +41,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
memset-avx2-unaligned-erms \
memset-avx512-unaligned-erms \
memchr-evex \
+ memmove-evex-unaligned-erms \
memrchr-evex \
rawmemchr-evex \
stpcpy-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 082e4da3..6bd3abfc 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -80,6 +80,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memmove_chk,
CPU_FEATURE_USABLE (AVX),
__memmove_chk_avx_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memmove_chk_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memmove_chk_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
CPU_FEATURE_USABLE (SSSE3),
__memmove_chk_ssse3_back)
@@ -102,6 +108,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memmove,
CPU_FEATURE_USABLE (AVX),
__memmove_avx_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, memmove,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memmove_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, memmove,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memmove_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memmove,
CPU_FEATURE_USABLE (AVX512F),
__memmove_avx512_no_vzeroupper)
@@ -565,6 +577,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (AVX),
__memcpy_chk_avx_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memcpy_chk_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memcpy_chk_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (SSSE3),
__memcpy_chk_ssse3_back)
@@ -587,6 +605,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memcpy,
CPU_FEATURE_USABLE (AVX),
__memcpy_avx_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, memcpy,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memcpy_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, memcpy,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memcpy_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
__memcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
@@ -623,6 +647,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (AVX),
__mempcpy_chk_avx_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __mempcpy_chk_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __mempcpy_chk_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (SSSE3),
__mempcpy_chk_ssse3_back)
@@ -654,6 +684,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, mempcpy,
CPU_FEATURE_USABLE (AVX),
__mempcpy_avx_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, mempcpy,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __mempcpy_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, mempcpy,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __mempcpy_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
__mempcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
index 5e5f0299..6f8bce5f 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
@@ -29,6 +29,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
+ attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
+ attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
@@ -59,10 +63,21 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
{
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
- return OPTIMIZE (avx_unaligned_erms);
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ return OPTIMIZE (evex_unaligned_erms);
+
+ return OPTIMIZE (evex_unaligned);
+ }
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ return OPTIMIZE (avx_unaligned_erms);
- return OPTIMIZE (avx_unaligned);
+ return OPTIMIZE (avx_unaligned);
+ }
}
if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
new file mode 100644
index 00000000..0cbce8f9
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
@@ -0,0 +1,33 @@
+#if IS_IN (libc)
+# define VEC_SIZE 32
+# define XMM0 xmm16
+# define XMM1 xmm17
+# define YMM0 ymm16
+# define YMM1 ymm17
+# define VEC0 ymm16
+# define VEC1 ymm17
+# define VEC2 ymm18
+# define VEC3 ymm19
+# define VEC4 ymm20
+# define VEC5 ymm21
+# define VEC6 ymm22
+# define VEC7 ymm23
+# define VEC8 ymm24
+# define VEC9 ymm25
+# define VEC10 ymm26
+# define VEC11 ymm27
+# define VEC12 ymm28
+# define VEC13 ymm29
+# define VEC14 ymm30
+# define VEC15 ymm31
+# define VEC(i) VEC##i
+# define VMOVNT vmovntdq
+# define VMOVU vmovdqu64
+# define VMOVA vmovdqa64
+# define VZEROUPPER
+
+# define SECTION(p) p##.evex
+# define MEMMOVE_SYMBOL(p,s) p##_evex_##s
+
+# include "memmove-vec-unaligned-erms.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 274aa1c7..08e21692 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -48,6 +48,14 @@
# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif
+#ifndef XMM0
+# define XMM0 xmm0
+#endif
+
+#ifndef YMM0
+# define YMM0 ymm0
+#endif
+
#ifndef VZEROUPPER
# if VEC_SIZE > 16
# define VZEROUPPER vzeroupper
@@ -277,20 +285,20 @@ L(less_vec):
#if VEC_SIZE > 32
L(between_32_63):
/* From 32 to 63. No branch when size == 32. */
- vmovdqu (%rsi), %ymm0
- vmovdqu -32(%rsi,%rdx), %ymm1
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm1, -32(%rdi,%rdx)
+ VMOVU (%rsi), %YMM0
+ VMOVU -32(%rsi,%rdx), %YMM1
+ VMOVU %YMM0, (%rdi)
+ VMOVU %YMM1, -32(%rdi,%rdx)
VZEROUPPER
ret
#endif
#if VEC_SIZE > 16
/* From 16 to 31. No branch when size == 16. */
L(between_16_31):
- vmovdqu (%rsi), %xmm0
- vmovdqu -16(%rsi,%rdx), %xmm1
- vmovdqu %xmm0, (%rdi)
- vmovdqu %xmm1, -16(%rdi,%rdx)
+ VMOVU (%rsi), %XMM0
+ VMOVU -16(%rsi,%rdx), %XMM1
+ VMOVU %XMM0, (%rdi)
+ VMOVU %XMM1, -16(%rdi,%rdx)
ret
#endif
L(between_8_15):
--
GitLab
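
The selector added to ifunc-memmove.h follows the standard GNU ifunc pattern: a resolver runs once at load time, and the symbol is then bound directly to the chosen implementation with no per-call dispatch. A generic sketch of that pattern (GCC/ELF extension; the feature probe and variant bodies are placeholders):

    #include <stddef.h>
    #include <string.h>

    typedef void *(*memmove_fn) (void *, const void *, size_t);

    static void *
    memmove_baseline (void *d, const void *s, size_t n)
    {
      return memmove (d, s, n);
    }

    static void *
    memmove_fancy (void *d, const void *s, size_t n)
    {
      return memmove (d, s, n);   /* imagine the EVEX body here */
    }

    static int
    cpu_has_fancy_simd (void)
    {
      return 0;   /* a real resolver would probe CPUID here */
    }

    /* Runs once at load time; keep it self-contained, since it may run
       before all relocations are applied.  */
    static memmove_fn
    resolve_my_memmove (void)
    {
      return cpu_has_fancy_simd () ? memmove_fancy : memmove_baseline;
    }

    void *my_memmove (void *, const void *, size_t)
      __attribute__ ((ifunc ("resolve_my_memmove")));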
View file
@@ -1,254 +0,0 @@
From 1b968b6b9b3aac702ac2f133e0dd16cfdbb415ee Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 5 Mar 2021 07:15:03 -0800
Subject: [PATCH] x86-64: Add memset family functions with 256-bit EVEX
Content-type: text/plain; charset=UTF-8
Update ifunc-memset.h/ifunc-wmemset.h to select the function optimized
with 256-bit EVEX instructions using YMM16-YMM31 registers to avoid RTM
abort with usable AVX512VL and AVX512BW since VZEROUPPER isn't needed at
function exit.
---
sysdeps/x86_64/multiarch/Makefile | 1 +
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 22 +++++++++++++++++
sysdeps/x86_64/multiarch/ifunc-memset.h | 24 +++++++++++++++----
sysdeps/x86_64/multiarch/ifunc-wmemset.h | 13 ++++++----
.../multiarch/memset-evex-unaligned-erms.S | 24 +++++++++++++++++++
.../multiarch/memset-vec-unaligned-erms.S | 20 +++++++++++-----
6 files changed, 90 insertions(+), 14 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 4563fc56..1cc0a10e 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -43,6 +43,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
memchr-evex \
memmove-evex-unaligned-erms \
memrchr-evex \
+ memset-evex-unaligned-erms \
rawmemchr-evex \
stpcpy-evex \
stpncpy-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 6bd3abfc..7cf83485 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -160,6 +160,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memset_chk,
CPU_FEATURE_USABLE (AVX2),
__memset_chk_avx2_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __memset_chk,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
+ __memset_chk_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, __memset_chk,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
+ __memset_chk_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memset_chk,
CPU_FEATURE_USABLE (AVX512F),
__memset_chk_avx512_unaligned_erms)
@@ -185,6 +193,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memset,
CPU_FEATURE_USABLE (AVX2),
__memset_avx2_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, memset,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
+ __memset_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, memset,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
+ __memset_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memset,
CPU_FEATURE_USABLE (AVX512F),
__memset_avx512_unaligned_erms)
@@ -555,6 +571,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, wmemset,
CPU_FEATURE_USABLE (AVX2),
__wmemset_avx2_unaligned)
+ IFUNC_IMPL_ADD (array, i, wmemset,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __wmemset_evex_unaligned)
IFUNC_IMPL_ADD (array, i, wmemset,
CPU_FEATURE_USABLE (AVX512F),
__wmemset_avx512_unaligned))
@@ -723,6 +742,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __wmemset_chk,
CPU_FEATURE_USABLE (AVX2),
__wmemset_chk_avx2_unaligned)
+ IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __wmemset_chk_evex_unaligned)
IFUNC_IMPL_ADD (array, i, __wmemset_chk,
CPU_FEATURE_USABLE (AVX512F),
__wmemset_chk_avx512_unaligned))
diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
index 708bd72e..6f31f4dc 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
@@ -27,6 +27,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms)
attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
+ attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
+ attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
@@ -56,10 +60,22 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
{
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
- return OPTIMIZE (avx2_unaligned_erms);
- else
- return OPTIMIZE (avx2_unaligned);
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ return OPTIMIZE (evex_unaligned_erms);
+
+ return OPTIMIZE (evex_unaligned);
+ }
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ return OPTIMIZE (avx2_unaligned_erms);
+
+ return OPTIMIZE (avx2_unaligned);
+ }
}
if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
index eb242210..9290c4bf 100644
--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
@@ -20,6 +20,7 @@
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden;
static inline void *
@@ -27,14 +28,18 @@ IFUNC_SELECTOR (void)
{
const struct cpu_features* cpu_features = __get_cpu_features ();
- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
- && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
{
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
- && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+ && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
+ && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
return OPTIMIZE (avx512_unaligned);
- else
+
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
+ return OPTIMIZE (evex_unaligned);
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
return OPTIMIZE (avx2_unaligned);
}
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
new file mode 100644
index 00000000..ae0a4d6e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -0,0 +1,24 @@
+#if IS_IN (libc)
+# define VEC_SIZE 32
+# define XMM0 xmm16
+# define YMM0 ymm16
+# define VEC0 ymm16
+# define VEC(i) VEC##i
+# define VMOVU vmovdqu64
+# define VMOVA vmovdqa64
+# define VZEROUPPER
+
+# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+ movq r, %rax; \
+ vpbroadcastb d, %VEC0
+
+# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+ movq r, %rax; \
+ vpbroadcastd d, %VEC0
+
+# define SECTION(p) p##.evex
+# define MEMSET_SYMBOL(p,s) p##_evex_##s
+# define WMEMSET_SYMBOL(p,s) p##_evex_##s
+
+# include "memset-vec-unaligned-erms.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 9a0fd818..71e91a8f 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -34,6 +34,14 @@
# define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s)
#endif
+#ifndef XMM0
+# define XMM0 xmm0
+#endif
+
+#ifndef YMM0
+# define YMM0 ymm0
+#endif
+
#ifndef VZEROUPPER
# if VEC_SIZE > 16
# define VZEROUPPER vzeroupper
@@ -67,7 +75,7 @@
ENTRY (__bzero)
mov %RDI_LP, %RAX_LP /* Set return value. */
mov %RSI_LP, %RDX_LP /* Set n. */
- pxor %xmm0, %xmm0
+ pxor %XMM0, %XMM0
jmp L(entry_from_bzero)
END (__bzero)
weak_alias (__bzero, bzero)
@@ -223,7 +231,7 @@ L(less_vec):
cmpb $16, %dl
jae L(between_16_31)
# endif
- MOVQ %xmm0, %rcx
+ MOVQ %XMM0, %rcx
cmpb $8, %dl
jae L(between_8_15)
cmpb $4, %dl
@@ -238,16 +246,16 @@ L(less_vec):
# if VEC_SIZE > 32
/* From 32 to 63. No branch when size == 32. */
L(between_32_63):
- vmovdqu %ymm0, -32(%rdi,%rdx)
- vmovdqu %ymm0, (%rdi)
+ VMOVU %YMM0, -32(%rdi,%rdx)
+ VMOVU %YMM0, (%rdi)
VZEROUPPER
ret
# endif
# if VEC_SIZE > 16
/* From 16 to 31. No branch when size == 16. */
L(between_16_31):
- vmovdqu %xmm0, -16(%rdi,%rdx)
- vmovdqu %xmm0, (%rdi)
+ VMOVU %XMM0, -16(%rdi,%rdx)
+ VMOVU %XMM0, (%rdi)
VZEROUPPER
ret
# endif
--
GitLab
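
The hunks above rewrite the memset/wmemset IFUNC selectors so the EVEX variants win whenever AVX512VL and AVX512BW are usable, and the AVX2 variants are skipped when VZEROUPPER is discouraged. Because the EVEX code uses ymm16-ymm31, which have no legacy SSE aliasing, it needs no VZEROUPPER at exit and therefore cannot abort an RTM transaction. A simplified C sketch of that decision order; has_feature()/prefer() and the function names are hypothetical stand-ins for glibc's CPU_FEATURE_USABLE_P/CPU_FEATURES_ARCH_P macros and the real IFUNC symbols:

#include <stddef.h>

/* Hypothetical predicates standing in for glibc's feature macros.  */
extern int has_feature (const char *name);
extern int prefer (const char *arch_hint);

typedef void *(*memset_fn) (void *, int, size_t);
extern void *memset_evex_unaligned (void *, int, size_t);
extern void *memset_evex_unaligned_erms (void *, int, size_t);
extern void *memset_avx2_unaligned (void *, int, size_t);
extern void *memset_avx2_unaligned_erms (void *, int, size_t);
extern void *memset_sse2_unaligned (void *, int, size_t);

static memset_fn
select_memset (void)
{
  if (has_feature ("AVX2"))
    {
      /* EVEX variant uses ymm16-ymm31: no VZEROUPPER needed, so it is
         safe even inside an RTM transaction.  */
      if (has_feature ("AVX512VL") && has_feature ("AVX512BW"))
        return has_feature ("ERMS") ? memset_evex_unaligned_erms
                                    : memset_evex_unaligned;
      /* AVX2 variants end with VZEROUPPER; avoid them where that is
         discouraged.  */
      if (!prefer ("No_VZEROUPPER"))
        return has_feature ("ERMS") ? memset_avx2_unaligned_erms
                                    : memset_avx2_unaligned;
    }
  return memset_sse2_unaligned;  /* baseline (glibc also checks ERMS here) */
}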


@ -1,561 +0,0 @@
From 91264fe3577fe887b4860923fa6142b5274c8965 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 5 Mar 2021 07:20:28 -0800
Subject: [PATCH] x86-64: Add memcmp family functions with 256-bit EVEX
Content-type: text/plain; charset=UTF-8
Update ifunc-memcmp.h to select the function optimized with 256-bit EVEX
instructions using YMM16-YMM31 registers to avoid RTM abort with usable
AVX512VL, AVX512BW and MOVBE since VZEROUPPER isn't needed at function
exit.
---
sysdeps/x86_64/multiarch/Makefile | 4 +-
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 10 +
sysdeps/x86_64/multiarch/ifunc-memcmp.h | 13 +-
sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 440 ++++++++++++++++++
sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S | 4 +
5 files changed, 467 insertions(+), 4 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 1cc0a10e..9d79b138 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -41,6 +41,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
memset-avx2-unaligned-erms \
memset-avx512-unaligned-erms \
memchr-evex \
+ memcmp-evex-movbe \
memmove-evex-unaligned-erms \
memrchr-evex \
memset-evex-unaligned-erms \
@@ -81,7 +82,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
wcsncmp-evex \
wcsnlen-evex \
wcsrchr-evex \
- wmemchr-evex
+ wmemchr-evex \
+ wmemcmp-evex-movbe
endif
ifeq ($(subdir),debug)
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 7cf83485..c8da910e 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -56,6 +56,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (MOVBE)),
__memcmp_avx2_movbe)
+ IFUNC_IMPL_ADD (array, i, memcmp,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (MOVBE)),
+ __memcmp_evex_movbe)
IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
__memcmp_sse4_1)
IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3),
@@ -558,6 +563,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (MOVBE)),
__wmemcmp_avx2_movbe)
+ IFUNC_IMPL_ADD (array, i, wmemcmp,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (MOVBE)),
+ __wmemcmp_evex_movbe)
IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
__wmemcmp_sse4_1)
IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3),
diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
index 6c1f3153..3ca1f0a6 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
@@ -23,17 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
static inline void *
IFUNC_SELECTOR (void)
{
const struct cpu_features* cpu_features = __get_cpu_features ();
- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
- && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
&& CPU_FEATURE_USABLE_P (cpu_features, MOVBE)
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
- return OPTIMIZE (avx2_movbe);
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+ return OPTIMIZE (evex_movbe);
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ return OPTIMIZE (avx2_movbe);
+ }
if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
return OPTIMIZE (sse4_1);
diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
new file mode 100644
index 00000000..9c093972
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -0,0 +1,440 @@
+/* memcmp/wmemcmp optimized with 256-bit EVEX instructions.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+/* memcmp/wmemcmp is implemented as:
+ 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
+ to avoid branches.
+ 2. Use overlapping compare to avoid branch.
+ 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
+ bytes for wmemcmp.
+ 4. If size is 8 * VEC_SIZE or less, unroll the loop.
+ 5. Compare 4 * VEC_SIZE at a time with the aligned first memory
+ area.
+ 6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
+ 7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
+ 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+# define MEMCMP __memcmp_evex_movbe
+# endif
+
+# define VMOVU vmovdqu64
+
+# ifdef USE_AS_WMEMCMP
+# define VPCMPEQ vpcmpeqd
+# else
+# define VPCMPEQ vpcmpeqb
+# endif
+
+# define XMM1 xmm17
+# define XMM2 xmm18
+# define YMM1 ymm17
+# define YMM2 ymm18
+# define YMM3 ymm19
+# define YMM4 ymm20
+# define YMM5 ymm21
+# define YMM6 ymm22
+
+# define VEC_SIZE 32
+# ifdef USE_AS_WMEMCMP
+# define VEC_MASK 0xff
+# define XMM_MASK 0xf
+# else
+# define VEC_MASK 0xffffffff
+# define XMM_MASK 0xffff
+# endif
+
+/* Warning!
+ wmemcmp has to use SIGNED comparison for elements.
+ memcmp has to use UNSIGNED comparison for elements.
+*/
+
+ .section .text.evex,"ax",@progbits
+ENTRY (MEMCMP)
+# ifdef USE_AS_WMEMCMP
+ shl $2, %RDX_LP
+# elif defined __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %edx, %edx
+# endif
+ cmp $VEC_SIZE, %RDX_LP
+ jb L(less_vec)
+
+ /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
+ VMOVU (%rsi), %YMM2
+ VPCMPEQ (%rdi), %YMM2, %k1
+ kmovd %k1, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+
+ cmpq $(VEC_SIZE * 2), %rdx
+ jbe L(last_vec)
+
+ /* More than 2 * VEC. */
+ cmpq $(VEC_SIZE * 8), %rdx
+ ja L(more_8x_vec)
+ cmpq $(VEC_SIZE * 4), %rdx
+ jb L(last_4x_vec)
+
+ /* From 4 * VEC to 8 * VEC, inclusively. */
+ VMOVU (%rsi), %YMM1
+ VPCMPEQ (%rdi), %YMM1, %k1
+
+ VMOVU VEC_SIZE(%rsi), %YMM2
+ VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+
+ VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+
+ VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+
+ kandd %k1, %k2, %k5
+ kandd %k3, %k4, %k6
+ kandd %k5, %k6, %k6
+
+ kmovd %k6, %eax
+ cmpl $VEC_MASK, %eax
+ jne L(4x_vec_end)
+
+ leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi
+ leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi
+ VMOVU (%rsi), %YMM1
+ VPCMPEQ (%rdi), %YMM1, %k1
+
+ VMOVU VEC_SIZE(%rsi), %YMM2
+ VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+ kandd %k1, %k2, %k5
+
+ VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+ kandd %k3, %k5, %k5
+
+ VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+ kandd %k4, %k5, %k5
+
+ kmovd %k5, %eax
+ cmpl $VEC_MASK, %eax
+ jne L(4x_vec_end)
+ xorl %eax, %eax
+ ret
+
+ .p2align 4
+L(last_2x_vec):
+ /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
+ VMOVU (%rsi), %YMM2
+ VPCMPEQ (%rdi), %YMM2, %k2
+ kmovd %k2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+
+L(last_vec):
+ /* Use overlapping loads to avoid branches. */
+ leaq -VEC_SIZE(%rdi, %rdx), %rdi
+ leaq -VEC_SIZE(%rsi, %rdx), %rsi
+ VMOVU (%rsi), %YMM2
+ VPCMPEQ (%rdi), %YMM2, %k2
+ kmovd %k2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+ ret
+
+ .p2align 4
+L(first_vec):
+ /* A byte or int32 is different within 16 or 32 bytes. */
+ tzcntl %eax, %ecx
+# ifdef USE_AS_WMEMCMP
+ xorl %eax, %eax
+ movl (%rdi, %rcx, 4), %edx
+ cmpl (%rsi, %rcx, 4), %edx
+L(wmemcmp_return):
+ setl %al
+ negl %eax
+ orl $1, %eax
+# else
+ movzbl (%rdi, %rcx), %eax
+ movzbl (%rsi, %rcx), %edx
+ sub %edx, %eax
+# endif
+ ret
+
+# ifdef USE_AS_WMEMCMP
+ .p2align 4
+L(4):
+ xorl %eax, %eax
+ movl (%rdi), %edx
+ cmpl (%rsi), %edx
+ jne L(wmemcmp_return)
+ ret
+# else
+ .p2align 4
+L(between_4_7):
+ /* Load as big endian with overlapping movbe to avoid branches. */
+ movbe (%rdi), %eax
+ movbe (%rsi), %ecx
+ shlq $32, %rax
+ shlq $32, %rcx
+ movbe -4(%rdi, %rdx), %edi
+ movbe -4(%rsi, %rdx), %esi
+ orq %rdi, %rax
+ orq %rsi, %rcx
+ subq %rcx, %rax
+ je L(exit)
+ sbbl %eax, %eax
+ orl $1, %eax
+ ret
+
+ .p2align 4
+L(exit):
+ ret
+
+ .p2align 4
+L(between_2_3):
+ /* Load as big endian to avoid branches. */
+ movzwl (%rdi), %eax
+ movzwl (%rsi), %ecx
+ shll $8, %eax
+ shll $8, %ecx
+ bswap %eax
+ bswap %ecx
+ movb -1(%rdi, %rdx), %al
+ movb -1(%rsi, %rdx), %cl
+ /* Subtraction is okay because the upper 8 bits are zero. */
+ subl %ecx, %eax
+ ret
+
+ .p2align 4
+L(1):
+ movzbl (%rdi), %eax
+ movzbl (%rsi), %ecx
+ subl %ecx, %eax
+ ret
+# endif
+
+ .p2align 4
+L(zero):
+ xorl %eax, %eax
+ ret
+
+ .p2align 4
+L(less_vec):
+# ifdef USE_AS_WMEMCMP
+ /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */
+ cmpb $4, %dl
+ je L(4)
+ jb L(zero)
+# else
+ cmpb $1, %dl
+ je L(1)
+ jb L(zero)
+ cmpb $4, %dl
+ jb L(between_2_3)
+ cmpb $8, %dl
+ jb L(between_4_7)
+# endif
+ cmpb $16, %dl
+ jae L(between_16_31)
+ /* It is between 8 and 15 bytes. */
+ vmovq (%rdi), %XMM1
+ vmovq (%rsi), %XMM2
+ VPCMPEQ %XMM1, %XMM2, %k2
+ kmovw %k2, %eax
+ subl $XMM_MASK, %eax
+ jnz L(first_vec)
+ /* Use overlapping loads to avoid branches. */
+ leaq -8(%rdi, %rdx), %rdi
+ leaq -8(%rsi, %rdx), %rsi
+ vmovq (%rdi), %XMM1
+ vmovq (%rsi), %XMM2
+ VPCMPEQ %XMM1, %XMM2, %k2
+ kmovw %k2, %eax
+ subl $XMM_MASK, %eax
+ jnz L(first_vec)
+ ret
+
+ .p2align 4
+L(between_16_31):
+ /* From 16 to 31 bytes. No branch when size == 16. */
+ VMOVU (%rsi), %XMM2
+ VPCMPEQ (%rdi), %XMM2, %k2
+ kmovw %k2, %eax
+ subl $XMM_MASK, %eax
+ jnz L(first_vec)
+
+ /* Use overlapping loads to avoid branches. */
+ leaq -16(%rdi, %rdx), %rdi
+ leaq -16(%rsi, %rdx), %rsi
+ VMOVU (%rsi), %XMM2
+ VPCMPEQ (%rdi), %XMM2, %k2
+ kmovw %k2, %eax
+ subl $XMM_MASK, %eax
+ jnz L(first_vec)
+ ret
+
+ .p2align 4
+L(more_8x_vec):
+ /* More than 8 * VEC. Check the first VEC. */
+ VMOVU (%rsi), %YMM2
+ VPCMPEQ (%rdi), %YMM2, %k2
+ kmovd %k2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+
+ /* Align the first memory area for aligned loads in the loop.
+ Compute how much the first memory area is misaligned. */
+ movq %rdi, %rcx
+ andl $(VEC_SIZE - 1), %ecx
+ /* Get the negative of offset for alignment. */
+ subq $VEC_SIZE, %rcx
+ /* Adjust the second memory area. */
+ subq %rcx, %rsi
+ /* Adjust the first memory area which should be aligned now. */
+ subq %rcx, %rdi
+ /* Adjust length. */
+ addq %rcx, %rdx
+
+L(loop_4x_vec):
+ /* Compare 4 * VEC at a time forward. */
+ VMOVU (%rsi), %YMM1
+ VPCMPEQ (%rdi), %YMM1, %k1
+
+ VMOVU VEC_SIZE(%rsi), %YMM2
+ VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+ kandd %k2, %k1, %k5
+
+ VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+ kandd %k3, %k5, %k5
+
+ VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+ kandd %k4, %k5, %k5
+
+ kmovd %k5, %eax
+ cmpl $VEC_MASK, %eax
+ jne L(4x_vec_end)
+
+ addq $(VEC_SIZE * 4), %rdi
+ addq $(VEC_SIZE * 4), %rsi
+
+ subq $(VEC_SIZE * 4), %rdx
+ cmpq $(VEC_SIZE * 4), %rdx
+ jae L(loop_4x_vec)
+
+ /* Less than 4 * VEC. */
+ cmpq $VEC_SIZE, %rdx
+ jbe L(last_vec)
+ cmpq $(VEC_SIZE * 2), %rdx
+ jbe L(last_2x_vec)
+
+L(last_4x_vec):
+ /* From 2 * VEC to 4 * VEC. */
+ VMOVU (%rsi), %YMM2
+ VPCMPEQ (%rdi), %YMM2, %k2
+ kmovd %k2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+
+ addq $VEC_SIZE, %rdi
+ addq $VEC_SIZE, %rsi
+ VMOVU (%rsi), %YMM2
+ VPCMPEQ (%rdi), %YMM2, %k2
+ kmovd %k2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+
+ /* Use overlapping loads to avoid branches. */
+ leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi
+ leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi
+ VMOVU (%rsi), %YMM2
+ VPCMPEQ (%rdi), %YMM2, %k2
+ kmovd %k2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+
+ addq $VEC_SIZE, %rdi
+ addq $VEC_SIZE, %rsi
+ VMOVU (%rsi), %YMM2
+ VPCMPEQ (%rdi), %YMM2, %k2
+ kmovd %k2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+ ret
+
+ .p2align 4
+L(4x_vec_end):
+ kmovd %k1, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+ kmovd %k2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec_x1)
+ kmovd %k3, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec_x2)
+ kmovd %k4, %eax
+ subl $VEC_MASK, %eax
+ tzcntl %eax, %ecx
+# ifdef USE_AS_WMEMCMP
+ xorl %eax, %eax
+ movl (VEC_SIZE * 3)(%rdi, %rcx, 4), %edx
+ cmpl (VEC_SIZE * 3)(%rsi, %rcx, 4), %edx
+ jmp L(wmemcmp_return)
+# else
+ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
+ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx
+ sub %edx, %eax
+# endif
+ ret
+
+ .p2align 4
+L(first_vec_x1):
+ tzcntl %eax, %ecx
+# ifdef USE_AS_WMEMCMP
+ xorl %eax, %eax
+ movl VEC_SIZE(%rdi, %rcx, 4), %edx
+ cmpl VEC_SIZE(%rsi, %rcx, 4), %edx
+ jmp L(wmemcmp_return)
+# else
+ movzbl VEC_SIZE(%rdi, %rcx), %eax
+ movzbl VEC_SIZE(%rsi, %rcx), %edx
+ sub %edx, %eax
+# endif
+ ret
+
+ .p2align 4
+L(first_vec_x2):
+ tzcntl %eax, %ecx
+# ifdef USE_AS_WMEMCMP
+ xorl %eax, %eax
+ movl (VEC_SIZE * 2)(%rdi, %rcx, 4), %edx
+ cmpl (VEC_SIZE * 2)(%rsi, %rcx, 4), %edx
+ jmp L(wmemcmp_return)
+# else
+ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
+ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx
+ sub %edx, %eax
+# endif
+ ret
+END (MEMCMP)
+#endif
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
new file mode 100644
index 00000000..4726d74a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
@@ -0,0 +1,4 @@
+#define MEMCMP __wmemcmp_evex_movbe
+#define USE_AS_WMEMCMP 1
+
+#include "memcmp-evex-movbe.S"
--
GitLab
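
The L(between_4_7) path in the patch above is worth unpacking: two overlapping 4-byte loads, converted to big-endian, cover any length from 4 to 7 bytes without a branch. A C sketch of the same idea (illustrative only; memcpy stands in for the unaligned loads and GCC's __builtin_bswap32 for movbe):

#include <stdint.h>
#include <string.h>

/* Branchless compare for 4 <= n <= 7, mirroring L(between_4_7): the
   first four bytes go in the high half, the last four (overlapping)
   in the low half, both byte-swapped so that numeric order matches
   memory order.  */
static int
cmp_4_7 (const unsigned char *s1, const unsigned char *s2, size_t n)
{
  uint32_t a, b, c, d;
  memcpy (&a, s1, 4);              /* movbe (%rdi), %eax */
  memcpy (&b, s2, 4);
  memcpy (&c, s1 + n - 4, 4);      /* movbe -4(%rdi, %rdx), %edi */
  memcpy (&d, s2 + n - 4, 4);
  uint64_t x = ((uint64_t) __builtin_bswap32 (a) << 32)
               | __builtin_bswap32 (c);
  uint64_t y = ((uint64_t) __builtin_bswap32 (b) << 32)
               | __builtin_bswap32 (d);
  return x < y ? -1 : x > y;       /* matches the sbbl/orl sign fixup */
}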

File diff suppressed because it is too large


@ -1,735 +0,0 @@
From 4bd660be40967cd69072f69ebc2ad32bfcc1f206 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Tue, 23 Feb 2021 06:33:10 -0800
Subject: [PATCH] x86: Add string/memory function tests in RTM region
Content-type: text/plain; charset=UTF-8
At function exit, AVX optimized string/memory functions have VZEROUPPER
which triggers RTM abort. When such functions are called inside a
transactionally executing RTM region, RTM abort causes severe performance
degradation. Add tests to verify that string/memory functions won't
cause RTM abort in RTM region.
---
sysdeps/x86/Makefile | 23 +++++++++++
sysdeps/x86/tst-memchr-rtm.c | 54 ++++++++++++++++++++++++++
sysdeps/x86/tst-memcmp-rtm.c | 52 +++++++++++++++++++++++++
sysdeps/x86/tst-memmove-rtm.c | 53 ++++++++++++++++++++++++++
sysdeps/x86/tst-memrchr-rtm.c | 54 ++++++++++++++++++++++++++
sysdeps/x86/tst-memset-rtm.c | 45 ++++++++++++++++++++++
sysdeps/x86/tst-strchr-rtm.c | 54 ++++++++++++++++++++++++++
sysdeps/x86/tst-strcpy-rtm.c | 53 ++++++++++++++++++++++++++
sysdeps/x86/tst-string-rtm.h | 72 +++++++++++++++++++++++++++++++++++
sysdeps/x86/tst-strlen-rtm.c | 53 ++++++++++++++++++++++++++
sysdeps/x86/tst-strncmp-rtm.c | 52 +++++++++++++++++++++++++
sysdeps/x86/tst-strrchr-rtm.c | 53 ++++++++++++++++++++++++++
12 files changed, 618 insertions(+)
create mode 100644 sysdeps/x86/tst-memchr-rtm.c
create mode 100644 sysdeps/x86/tst-memcmp-rtm.c
create mode 100644 sysdeps/x86/tst-memmove-rtm.c
create mode 100644 sysdeps/x86/tst-memrchr-rtm.c
create mode 100644 sysdeps/x86/tst-memset-rtm.c
create mode 100644 sysdeps/x86/tst-strchr-rtm.c
create mode 100644 sysdeps/x86/tst-strcpy-rtm.c
create mode 100644 sysdeps/x86/tst-string-rtm.h
create mode 100644 sysdeps/x86/tst-strlen-rtm.c
create mode 100644 sysdeps/x86/tst-strncmp-rtm.c
create mode 100644 sysdeps/x86/tst-strrchr-rtm.c
diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
index 59e928e9..5be71ada 100644
--- a/sysdeps/x86/Makefile
+++ b/sysdeps/x86/Makefile
@@ -17,6 +17,29 @@ endif
ifeq ($(subdir),string)
sysdep_routines += cacheinfo
+
+tests += \
+ tst-memchr-rtm \
+ tst-memcmp-rtm \
+ tst-memmove-rtm \
+ tst-memrchr-rtm \
+ tst-memset-rtm \
+ tst-strchr-rtm \
+ tst-strcpy-rtm \
+ tst-strlen-rtm \
+ tst-strncmp-rtm \
+ tst-strrchr-rtm
+
+CFLAGS-tst-memchr-rtm.c += -mrtm
+CFLAGS-tst-memcmp-rtm.c += -mrtm
+CFLAGS-tst-memmove-rtm.c += -mrtm
+CFLAGS-tst-memrchr-rtm.c += -mrtm
+CFLAGS-tst-memset-rtm.c += -mrtm
+CFLAGS-tst-strchr-rtm.c += -mrtm
+CFLAGS-tst-strcpy-rtm.c += -mrtm
+CFLAGS-tst-strlen-rtm.c += -mrtm
+CFLAGS-tst-strncmp-rtm.c += -mrtm
+CFLAGS-tst-strrchr-rtm.c += -mrtm
endif
ifneq ($(enable-cet),no)
diff --git a/sysdeps/x86/tst-memchr-rtm.c b/sysdeps/x86/tst-memchr-rtm.c
new file mode 100644
index 00000000..e4749401
--- /dev/null
+++ b/sysdeps/x86/tst-memchr-rtm.c
@@ -0,0 +1,54 @@
+/* Test case for memchr inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+ memset (string1, 'a', STRING_SIZE);
+ string1[100] = 'c';
+ string1[STRING_SIZE - 100] = 'c';
+ char *p = memchr (string1, 'c', STRING_SIZE);
+ if (p == &string1[100])
+ return EXIT_SUCCESS;
+ else
+ return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+ char *p = memchr (string1, 'c', STRING_SIZE);
+ if (p == &string1[100])
+ return 0;
+ else
+ return 1;
+}
+
+static int
+do_test (void)
+{
+ return do_test_1 ("memchr", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-memcmp-rtm.c b/sysdeps/x86/tst-memcmp-rtm.c
new file mode 100644
index 00000000..e4c8a623
--- /dev/null
+++ b/sysdeps/x86/tst-memcmp-rtm.c
@@ -0,0 +1,52 @@
+/* Test case for memcmp inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+char string2[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+ memset (string1, 'a', STRING_SIZE);
+ memset (string2, 'a', STRING_SIZE);
+ if (memcmp (string1, string2, STRING_SIZE) == 0)
+ return EXIT_SUCCESS;
+ else
+ return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+ if (memcmp (string1, string2, STRING_SIZE) == 0)
+ return 0;
+ else
+ return 1;
+}
+
+static int
+do_test (void)
+{
+ return do_test_1 ("memcmp", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-memmove-rtm.c b/sysdeps/x86/tst-memmove-rtm.c
new file mode 100644
index 00000000..4bf97ef1
--- /dev/null
+++ b/sysdeps/x86/tst-memmove-rtm.c
@@ -0,0 +1,53 @@
+/* Test case for memmove inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+char string2[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+ memset (string1, 'a', STRING_SIZE);
+ if (memmove (string2, string1, STRING_SIZE) == string2
+ && memcmp (string2, string1, STRING_SIZE) == 0)
+ return EXIT_SUCCESS;
+ else
+ return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+ if (memmove (string2, string1, STRING_SIZE) == string2
+ && memcmp (string2, string1, STRING_SIZE) == 0)
+ return 0;
+ else
+ return 1;
+}
+
+static int
+do_test (void)
+{
+ return do_test_1 ("memmove", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-memrchr-rtm.c b/sysdeps/x86/tst-memrchr-rtm.c
new file mode 100644
index 00000000..a57a5a8e
--- /dev/null
+++ b/sysdeps/x86/tst-memrchr-rtm.c
@@ -0,0 +1,54 @@
+/* Test case for memrchr inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+ memset (string1, 'a', STRING_SIZE);
+ string1[100] = 'c';
+ string1[STRING_SIZE - 100] = 'c';
+ char *p = memrchr (string1, 'c', STRING_SIZE);
+ if (p == &string1[STRING_SIZE - 100])
+ return EXIT_SUCCESS;
+ else
+ return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+ char *p = memrchr (string1, 'c', STRING_SIZE);
+ if (p == &string1[STRING_SIZE - 100])
+ return 0;
+ else
+ return 1;
+}
+
+static int
+do_test (void)
+{
+ return do_test_1 ("memrchr", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-memset-rtm.c b/sysdeps/x86/tst-memset-rtm.c
new file mode 100644
index 00000000..bf343a4d
--- /dev/null
+++ b/sysdeps/x86/tst-memset-rtm.c
@@ -0,0 +1,45 @@
+/* Test case for memset inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+ memset (string1, 'a', STRING_SIZE);
+ return EXIT_SUCCESS;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+ memset (string1, 'a', STRING_SIZE);
+ return 0;
+}
+
+static int
+do_test (void)
+{
+ return do_test_1 ("memset", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-strchr-rtm.c b/sysdeps/x86/tst-strchr-rtm.c
new file mode 100644
index 00000000..a82e29c0
--- /dev/null
+++ b/sysdeps/x86/tst-strchr-rtm.c
@@ -0,0 +1,54 @@
+/* Test case for strchr inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+ memset (string1, 'a', STRING_SIZE - 1);
+ string1[100] = 'c';
+ string1[STRING_SIZE - 100] = 'c';
+ char *p = strchr (string1, 'c');
+ if (p == &string1[100])
+ return EXIT_SUCCESS;
+ else
+ return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+ char *p = strchr (string1, 'c');
+ if (p == &string1[100])
+ return 0;
+ else
+ return 1;
+}
+
+static int
+do_test (void)
+{
+ return do_test_1 ("strchr", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-strcpy-rtm.c b/sysdeps/x86/tst-strcpy-rtm.c
new file mode 100644
index 00000000..2b2a583f
--- /dev/null
+++ b/sysdeps/x86/tst-strcpy-rtm.c
@@ -0,0 +1,53 @@
+/* Test case for strcpy inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+char string2[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+ memset (string1, 'a', STRING_SIZE - 1);
+ if (strcpy (string2, string1) == string2
+ && strcmp (string2, string1) == 0)
+ return EXIT_SUCCESS;
+ else
+ return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+ if (strcpy (string2, string1) == string2
+ && strcmp (string2, string1) == 0)
+ return 0;
+ else
+ return 1;
+}
+
+static int
+do_test (void)
+{
+ return do_test_1 ("strcpy", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-string-rtm.h b/sysdeps/x86/tst-string-rtm.h
new file mode 100644
index 00000000..d2470afa
--- /dev/null
+++ b/sysdeps/x86/tst-string-rtm.h
@@ -0,0 +1,72 @@
+/* Test string function in a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <string.h>
+#include <x86intrin.h>
+#include <sys/platform/x86.h>
+#include <support/check.h>
+#include <support/test-driver.h>
+
+static int
+do_test_1 (const char *name, unsigned int loop, int (*prepare) (void),
+ int (*function) (void))
+{
+ if (!CPU_FEATURE_USABLE (RTM))
+ return EXIT_UNSUPPORTED;
+
+ int status = prepare ();
+ if (status != EXIT_SUCCESS)
+ return status;
+
+ unsigned int i;
+ unsigned int naborts = 0;
+ unsigned int failed = 0;
+ for (i = 0; i < loop; i++)
+ {
+ failed |= function ();
+ if (_xbegin() == _XBEGIN_STARTED)
+ {
+ failed |= function ();
+ _xend();
+ }
+ else
+ {
+ failed |= function ();
+ ++naborts;
+ }
+ }
+
+ if (failed)
+ FAIL_EXIT1 ("%s() failed", name);
+
+ if (naborts)
+ {
+ /* NB: Low single digit (<= 5%) noise-level aborts are normal for
+ TSX. */
+ double rate = 100 * ((double) naborts) / ((double) loop);
+ if (rate > 5)
+ FAIL_EXIT1 ("TSX abort rate: %.2f%% (%d out of %d)",
+ rate, naborts, loop);
+ }
+
+ return EXIT_SUCCESS;
+}
+
+static int do_test (void);
+
+#include <support/test-driver.c>
diff --git a/sysdeps/x86/tst-strlen-rtm.c b/sysdeps/x86/tst-strlen-rtm.c
new file mode 100644
index 00000000..0dcf14db
--- /dev/null
+++ b/sysdeps/x86/tst-strlen-rtm.c
@@ -0,0 +1,53 @@
+/* Test case for strlen inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+ memset (string1, 'a', STRING_SIZE - 1);
+ string1[STRING_SIZE - 100] = '\0';
+ size_t len = strlen (string1);
+ if (len == STRING_SIZE - 100)
+ return EXIT_SUCCESS;
+ else
+ return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+ size_t len = strlen (string1);
+ if (len == STRING_SIZE - 100)
+ return 0;
+ else
+ return 1;
+}
+
+static int
+do_test (void)
+{
+ return do_test_1 ("strlen", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
new file mode 100644
index 00000000..236ad951
--- /dev/null
+++ b/sysdeps/x86/tst-strncmp-rtm.c
@@ -0,0 +1,52 @@
+/* Test case for strncmp inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+char string2[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+ memset (string1, 'a', STRING_SIZE - 1);
+ memset (string2, 'a', STRING_SIZE - 1);
+ if (strncmp (string1, string2, STRING_SIZE) == 0)
+ return EXIT_SUCCESS;
+ else
+ return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+ if (strncmp (string1, string2, STRING_SIZE) == 0)
+ return 0;
+ else
+ return 1;
+}
+
+static int
+do_test (void)
+{
+ return do_test_1 ("strncmp", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-strrchr-rtm.c b/sysdeps/x86/tst-strrchr-rtm.c
new file mode 100644
index 00000000..e32bfaf5
--- /dev/null
+++ b/sysdeps/x86/tst-strrchr-rtm.c
@@ -0,0 +1,53 @@
+/* Test case for strrchr inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+ memset (string1, 'a', STRING_SIZE - 1);
+ string1[STRING_SIZE - 100] = 'c';
+ char *p = strrchr (string1, 'c');
+ if (p == &string1[STRING_SIZE - 100])
+ return EXIT_SUCCESS;
+ else
+ return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+ char *p = strrchr (string1, 'c');
+ if (p == &string1[STRING_SIZE - 100])
+ return 0;
+ else
+ return 1;
+}
+
+static int
+do_test (void)
+{
+ return do_test_1 ("strrchr", LOOP, prepare, function);
+}
--
GitLab
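
The driver in tst-string-rtm.h above centers on one idiom: run the function under test inside _xbegin()/_xend() and count how often the transaction aborts. Reduced to its core (compile with -mrtm; the real do_test_1 additionally gates on CPU_FEATURE_USABLE (RTM) and applies the 5% noise threshold shown above), a sketch:

#include <immintrin.h>

/* Execute FN transactionally LOOP times and report how many
   transactions aborted.  A string function that ends in VZEROUPPER
   aborts the transaction every time, which is what these tests
   catch.  */
static unsigned int
count_rtm_aborts (int (*fn) (void), unsigned int loop)
{
  unsigned int naborts = 0;
  for (unsigned int i = 0; i < loop; i++)
    {
      if (_xbegin () == _XBEGIN_STARTED)
        {
          fn ();
          _xend ();
        }
      else
        ++naborts;   /* control resumes here after an abort */
    }
  return naborts;
}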


@ -1,148 +0,0 @@
From 4e2d8f352774b56078c34648b14a2412c38384f4 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Sun, 7 Mar 2021 09:44:18 -0800
Subject: [PATCH] x86-64: Use ZMM16-ZMM31 in AVX512 memset family functions
Content-type: text/plain; charset=UTF-8
Update ifunc-memset.h/ifunc-wmemset.h to select the function optimized
with AVX512 instructions using ZMM16-ZMM31 registers to avoid RTM abort
with usable AVX512VL and AVX512BW since VZEROUPPER isn't needed at
function exit.
---
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 14 +++++++++-----
sysdeps/x86_64/multiarch/ifunc-memset.h | 13 ++++++++-----
sysdeps/x86_64/multiarch/ifunc-wmemset.h | 12 ++++++------
.../multiarch/memset-avx512-unaligned-erms.S | 16 ++++++++--------
4 files changed, 31 insertions(+), 24 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index c1efeec0..d969a156 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -211,10 +211,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)),
__memset_chk_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memset_chk,
- CPU_FEATURE_USABLE (AVX512F),
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
__memset_chk_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memset_chk,
- CPU_FEATURE_USABLE (AVX512F),
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
__memset_chk_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, __memset_chk,
CPU_FEATURE_USABLE (AVX512F),
@@ -252,10 +254,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)),
__memset_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memset,
- CPU_FEATURE_USABLE (AVX512F),
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
__memset_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memset,
- CPU_FEATURE_USABLE (AVX512F),
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
__memset_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, memset,
CPU_FEATURE_USABLE (AVX512F),
@@ -719,7 +723,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
CPU_FEATURE_USABLE (AVX512VL),
__wmemset_evex_unaligned)
IFUNC_IMPL_ADD (array, i, wmemset,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__wmemset_avx512_unaligned))
#ifdef SHARED
diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
index 6f3375cc..19795938 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
@@ -53,13 +53,16 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
&& !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
{
- if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
- return OPTIMIZE (avx512_no_vzeroupper);
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ return OPTIMIZE (avx512_unaligned_erms);
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
- return OPTIMIZE (avx512_unaligned_erms);
+ return OPTIMIZE (avx512_unaligned);
+ }
- return OPTIMIZE (avx512_unaligned);
+ return OPTIMIZE (avx512_no_vzeroupper);
}
if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
index bdc94c6c..98c5d406 100644
--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
@@ -33,13 +33,13 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
{
- if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
- && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
- && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
- return OPTIMIZE (avx512_unaligned);
-
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
- return OPTIMIZE (evex_unaligned);
+ {
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+ return OPTIMIZE (avx512_unaligned);
+
+ return OPTIMIZE (evex_unaligned);
+ }
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
return OPTIMIZE (avx2_unaligned_rtm);
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index 0783979c..22e7b187 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -1,22 +1,22 @@
#if IS_IN (libc)
# define VEC_SIZE 64
-# define VEC(i) zmm##i
+# define XMM0 xmm16
+# define YMM0 ymm16
+# define VEC0 zmm16
+# define VEC(i) VEC##i
# define VMOVU vmovdqu64
# define VMOVA vmovdqa64
+# define VZEROUPPER
# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
- vmovd d, %xmm0; \
movq r, %rax; \
- vpbroadcastb %xmm0, %xmm0; \
- vpbroadcastq %xmm0, %zmm0
+ vpbroadcastb d, %VEC0
# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
- vmovd d, %xmm0; \
movq r, %rax; \
- vpbroadcastd %xmm0, %xmm0; \
- vpbroadcastq %xmm0, %zmm0
+ vpbroadcastd d, %VEC0
-# define SECTION(p) p##.avx512
+# define SECTION(p) p##.evex512
# define MEMSET_SYMBOL(p,s) p##_avx512_##s
# define WMEMSET_SYMBOL(p,s) p##_avx512_##s
--
GitLab


@ -1,230 +0,0 @@
From b304fc201d2f6baf52ea790df8643e99772243cd Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 21 Jan 2019 11:25:56 -0800
Subject: [PATCH] x86-64 memcmp/wmemcmp: Properly handle the length parameter
[BZ# 24097]
Content-type: text/plain; charset=UTF-8
On x32, the size_t parameter may be passed in the lower 32 bits of a
64-bit register with the non-zero upper 32 bits. The string/memory
functions written in assembly can only use the lower 32 bits of a
64-bit register as length or must clear the upper 32 bits before using
the full 64-bit register for length.
This patch fixes memcmp/wmemcmp for x32. Tested on x86-64 and x32. On
x86-64, libc.so is the same with and without the fix.
[BZ# 24097]
CVE-2019-6488
* sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S: Use RDX_LP for
length. Clear the upper 32 bits of RDX register.
* sysdeps/x86_64/multiarch/memcmp-sse4.S: Likewise.
* sysdeps/x86_64/multiarch/memcmp-ssse3.S: Likewise.
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcmp and
tst-size_t-wmemcmp.
* sysdeps/x86_64/x32/tst-size_t-memcmp.c: New file.
* sysdeps/x86_64/x32/tst-size_t-wmemcmp.c: Likewise.
---
sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 7 +-
sysdeps/x86_64/multiarch/memcmp-sse4.S | 9 ++-
sysdeps/x86_64/multiarch/memcmp-ssse3.S | 7 +-
sysdeps/x86_64/x32/Makefile | 4 +-
sysdeps/x86_64/x32/tst-size_t-memcmp.c | 76 ++++++++++++++++++++
sysdeps/x86_64/x32/tst-size_t-wmemcmp.c | 20 ++++++
6 files changed, 114 insertions(+), 9 deletions(-)
create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcmp.c
create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
Conflicts:
ChangeLog
(removed)
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
index 30f764c3..e3a35b89 100644
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
@@ -58,9 +58,12 @@
.section .text.avx,"ax",@progbits
ENTRY (MEMCMP)
# ifdef USE_AS_WMEMCMP
- shl $2, %rdx
+ shl $2, %RDX_LP
+# elif defined __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %edx, %edx
# endif
- cmpq $VEC_SIZE, %rdx
+ cmp $VEC_SIZE, %RDX_LP
jb L(less_vec)
/* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
index 8e164f2c..302900f5 100644
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
+++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S
@@ -42,13 +42,16 @@
.section .text.sse4.1,"ax",@progbits
ENTRY (MEMCMP)
# ifdef USE_AS_WMEMCMP
- shl $2, %rdx
+ shl $2, %RDX_LP
+# elif defined __ILP32__
+ /* Clear the upper 32 bits. */
+ mov %edx, %edx
# endif
pxor %xmm0, %xmm0
- cmp $79, %rdx
+ cmp $79, %RDX_LP
ja L(79bytesormore)
# ifndef USE_AS_WMEMCMP
- cmp $1, %rdx
+ cmp $1, %RDX_LP
je L(firstbyte)
# endif
add %rdx, %rsi
diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
index 6f76c641..69d030fc 100644
--- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
@@ -33,9 +33,12 @@
atom_text_section
ENTRY (MEMCMP)
# ifdef USE_AS_WMEMCMP
- shl $2, %rdx
- test %rdx, %rdx
+ shl $2, %RDX_LP
+ test %RDX_LP, %RDX_LP
jz L(equal)
+# elif defined __ILP32__
+ /* Clear the upper 32 bits. */
+ mov %edx, %edx
# endif
mov %rdx, %rcx
mov %rdi, %rdx
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
index 7d528889..ddec7f04 100644
--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
@@ -6,9 +6,9 @@ CFLAGS-s_llround.c += -fno-builtin-lround
endif
ifeq ($(subdir),string)
-tests += tst-size_t-memchr
+tests += tst-size_t-memchr tst-size_t-memcmp
endif
ifeq ($(subdir),wcsmbs)
-tests += tst-size_t-wmemchr
+tests += tst-size_t-wmemchr tst-size_t-wmemcmp
endif
diff --git a/sysdeps/x86_64/x32/tst-size_t-memcmp.c b/sysdeps/x86_64/x32/tst-size_t-memcmp.c
new file mode 100644
index 00000000..9bd6fdb4
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-memcmp.c
@@ -0,0 +1,76 @@
+/* Test memcmp with size_t in the lower 32 bits of 64-bit register.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define TEST_MAIN
+#ifdef WIDE
+# define TEST_NAME "wmemcmp"
+#else
+# define TEST_NAME "memcmp"
+#endif
+
+#include "test-size_t.h"
+
+#ifdef WIDE
+# include <inttypes.h>
+# include <wchar.h>
+
+# define MEMCMP wmemcmp
+# define CHAR wchar_t
+#else
+# define MEMCMP memcmp
+# define CHAR char
+#endif
+
+IMPL (MEMCMP, 1)
+
+typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
+
+static int
+__attribute__ ((noinline, noclone))
+do_memcmp (parameter_t a, parameter_t b)
+{
+ return CALL (&b, a.p, b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+ test_init ();
+
+ parameter_t dest = { { page_size / sizeof (CHAR) }, buf1 };
+ parameter_t src = { { 0 }, buf2 };
+
+ memcpy (buf1, buf2, page_size);
+
+ int ret = 0;
+ FOR_EACH_IMPL (impl, 0)
+ {
+ src.fn = impl->fn;
+ int res = do_memcmp (dest, src);
+ if (res)
+ {
+ error (0, 0, "Wrong result in function %s: %i != 0",
+ impl->name, res);
+ ret = 1;
+ }
+ }
+
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
new file mode 100644
index 00000000..e8b5ffd0
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
@@ -0,0 +1,20 @@
+/* Test wmemcmp with size_t in the lower 32 bits of 64-bit register.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define WIDE 1
+#include "tst-size_t-memcmp.c"
--
GitLab
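
The x32 bug class fixed above is easy to restate in C: under ILP32, size_t is 32 bits wide, but it arrives in a 64-bit register whose upper half is not guaranteed to be zero. The one-instruction fix, movl %edx, %edx, is simply a zero-extension. An illustrative sketch (RAW is a hypothetical model of the incoming register, not real glibc code):

#include <stdint.h>

/* Using the full 64-bit register directly (the bug) reads 32 bits of
   garbage; truncating first (the fix) does not.  */
static uint64_t
length_from_register (uint64_t raw)
{
  return (uint32_t) raw;   /* zero-extends, like movl %edx, %edx */
}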


@ -1,164 +0,0 @@
From e4fda4631017e49d4ee5a2755db34289b6860fa4 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Sun, 7 Mar 2021 09:45:23 -0800
Subject: [PATCH] x86-64: Use ZMM16-ZMM31 in AVX512 memmove family functions
Content-type: text/plain; charset=UTF-8
Update ifunc-memmove.h to select the function optimized with AVX512
instructions using ZMM16-ZMM31 registers to avoid RTM abort with usable
AVX512VL since VZEROUPPER isn't needed at function exit.
---
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 24 +++++++++---------
sysdeps/x86_64/multiarch/ifunc-memmove.h | 12 +++++----
.../multiarch/memmove-avx512-unaligned-erms.S | 25 +++++++++++++++++--
3 files changed, 42 insertions(+), 19 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index d969a156..fec384f6 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -83,10 +83,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
CPU_FEATURE_USABLE (AVX512F),
__memmove_chk_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__memmove_chk_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__memmove_chk_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
CPU_FEATURE_USABLE (AVX),
@@ -148,10 +148,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
CPU_FEATURE_USABLE (AVX512F),
__memmove_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, memmove,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__memmove_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, memmove,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__memmove_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
__memmove_ssse3_back)
@@ -733,10 +733,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
CPU_FEATURE_USABLE (AVX512F),
__memcpy_chk_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__memcpy_chk_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__memcpy_chk_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (AVX),
@@ -802,10 +802,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
CPU_FEATURE_USABLE (AVX512F),
__memcpy_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, memcpy,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__memcpy_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, memcpy,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__memcpy_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, memcpy, 1,
@@ -819,10 +819,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
CPU_FEATURE_USABLE (AVX512F),
__mempcpy_chk_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_chk_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_chk_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (AVX),
@@ -864,10 +864,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
CPU_FEATURE_USABLE (AVX512F),
__mempcpy_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, mempcpy,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, mempcpy,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, mempcpy,
CPU_FEATURE_USABLE (AVX),
diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
index fa09b9fb..014e95c7 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
@@ -56,13 +56,15 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
&& !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
{
- if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
- return OPTIMIZE (avx512_no_vzeroupper);
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ return OPTIMIZE (avx512_unaligned_erms);
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
- return OPTIMIZE (avx512_unaligned_erms);
+ return OPTIMIZE (avx512_unaligned);
+ }
- return OPTIMIZE (avx512_unaligned);
+ return OPTIMIZE (avx512_no_vzeroupper);
}
if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
index aac1515c..848848ab 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
@@ -1,11 +1,32 @@
#if IS_IN (libc)
# define VEC_SIZE 64
-# define VEC(i) zmm##i
+# define XMM0 xmm16
+# define XMM1 xmm17
+# define YMM0 ymm16
+# define YMM1 ymm17
+# define VEC0 zmm16
+# define VEC1 zmm17
+# define VEC2 zmm18
+# define VEC3 zmm19
+# define VEC4 zmm20
+# define VEC5 zmm21
+# define VEC6 zmm22
+# define VEC7 zmm23
+# define VEC8 zmm24
+# define VEC9 zmm25
+# define VEC10 zmm26
+# define VEC11 zmm27
+# define VEC12 zmm28
+# define VEC13 zmm29
+# define VEC14 zmm30
+# define VEC15 zmm31
+# define VEC(i) VEC##i
# define VMOVNT vmovntdq
# define VMOVU vmovdqu64
# define VMOVA vmovdqa64
+# define VZEROUPPER
-# define SECTION(p) p##.avx512
+# define SECTION(p) p##.evex512
# define MEMMOVE_SYMBOL(p,s) p##_avx512_##s
# include "memmove-vec-unaligned-erms.S"
--
GitLab


@ -1,71 +0,0 @@
From 595c22ecd8e87a27fd19270ed30fdbae9ad25426 Mon Sep 17 00:00:00 2001
From: Sunil K Pandey <skpgkp2@gmail.com>
Date: Thu, 1 Apr 2021 15:47:04 -0700
Subject: [PATCH] x86-64: Fix ifdef indentation in strlen-evex.S
Content-type: text/plain; charset=UTF-8
Fix the indentation of some ifdef directives in strlen-evex.S, which
are off by one and confusing to read.
---
sysdeps/x86_64/multiarch/strlen-evex.S | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
index cd022509..05838190 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex.S
@@ -276,10 +276,10 @@ L(last_2x_vec):
.p2align 4
L(first_vec_x0_check):
tzcntl %eax, %eax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
sall $2, %eax
-# endif
+# endif
/* Check the end of data. */
cmpq %rax, %rsi
jbe L(max)
@@ -293,10 +293,10 @@ L(first_vec_x0_check):
.p2align 4
L(first_vec_x1_check):
tzcntl %eax, %eax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
sall $2, %eax
-# endif
+# endif
/* Check the end of data. */
cmpq %rax, %rsi
jbe L(max)
@@ -311,10 +311,10 @@ L(first_vec_x1_check):
.p2align 4
L(first_vec_x2_check):
tzcntl %eax, %eax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
sall $2, %eax
-# endif
+# endif
/* Check the end of data. */
cmpq %rax, %rsi
jbe L(max)
@@ -329,10 +329,10 @@ L(first_vec_x2_check):
.p2align 4
L(first_vec_x3_check):
tzcntl %eax, %eax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
sall $2, %eax
-# endif
+# endif
/* Check the end of data. */
cmpq %rax, %rsi
jbe L(max)
--
GitLab
View file
@ -1,51 +0,0 @@
From 55bf411b451c13f0fb7ff3d3bf9a820020b45df1 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 19 Apr 2021 07:07:21 -0700
Subject: [PATCH] x86-64: Require BMI2 for __strlen_evex and __strnlen_evex
Content-type: text/plain; charset=UTF-8
Since __strlen_evex and __strnlen_evex added by
commit 1fd8c163a83d96ace1ff78fa6bac7aee084f6f77
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri Mar 5 06:24:52 2021 -0800
x86-64: Add ifunc-avx2.h functions with 256-bit EVEX
use sarx:
c4 e2 6a f7 c0 sarx %edx,%eax,%eax
require BMI2 for __strlen_evex and __strnlen_evex in ifunc-impl-list.c.
ifunc-avx2.h already requires BMI2 for EVEX implementation.
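For intuition, the requirement this patch adds can be sketched in plain C
with GCC's CPU-feature builtins (a sketch only; resolve_strlen and
strlen_evex_variant are illustrative names, not glibc interfaces):

    #include <string.h>

    /* Stand-in for the EVEX implementation; hypothetical name. */
    extern size_t strlen_evex_variant (const char *);

    /* ifunc-style resolver: pick the EVEX path only when every
       extension it uses is available.  sarx is a BMI2 instruction,
       so AVX512VL and AVX512BW alone are not sufficient.  */
    static size_t (*resolve_strlen (void)) (const char *)
    {
      __builtin_cpu_init ();
      if (__builtin_cpu_supports ("avx512vl")
          && __builtin_cpu_supports ("avx512bw")
          && __builtin_cpu_supports ("bmi2"))
        return strlen_evex_variant;
      return strlen;
    }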
---
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index fec384f6..cbfc1a5d 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -293,7 +293,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__strlen_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strlen,
(CPU_FEATURE_USABLE (AVX512VL)
- && CPU_FEATURE_USABLE (AVX512BW)),
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
__strlen_evex)
IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
@@ -308,7 +309,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__strnlen_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strnlen,
(CPU_FEATURE_USABLE (AVX512VL)
- && CPU_FEATURE_USABLE (AVX512BW)),
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
__strnlen_evex)
IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
--
GitLab
View file
@ -1,584 +0,0 @@
From acfd088a1963ba51cd83c78f95c0ab25ead79e04 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon, 3 May 2021 03:01:58 -0400
Subject: [PATCH] x86: Optimize memchr-avx2.S
Content-type: text/plain; charset=UTF-8
No bug. This commit optimizes memchr-avx2.S. The optimizations include
replacing some branches with cmovcc, avoiding some branches entirely
in the less_4x_vec case, making the page cross logic less strict,
and saving a few instructions in the loop return path. test-memchr,
test-rawmemchr, and test-wmemchr are all passing.
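The "less strict" page-cross logic reduces to a single offset test: a
VEC_SIZE load only needs the slow path when it would actually span a
4096-byte page. A minimal C model of the new check (illustrative, not
the patched code; may_cross_page is a hypothetical name):

    #include <stdbool.h>
    #include <stdint.h>

    #define PAGE_SIZE 4096
    #define VEC_SIZE  32

    /* Mirrors the new andl $(PAGE_SIZE - 1) / cmpl $(PAGE_SIZE -
       VEC_SIZE) / ja L(cross_page_boundary) sequence: a 32-byte load
       at p stays within one page iff the page offset of p is at most
       PAGE_SIZE - VEC_SIZE.  */
    static bool
    may_cross_page (const void *p)
    {
      return ((uintptr_t) p & (PAGE_SIZE - 1)) > PAGE_SIZE - VEC_SIZE;
    }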
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/memchr-avx2.S | 425 ++++++++++++++-----------
1 file changed, 247 insertions(+), 178 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index cf893e77..b377f22e 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -26,8 +26,22 @@
# ifdef USE_AS_WMEMCHR
# define VPCMPEQ vpcmpeqd
+# define VPBROADCAST vpbroadcastd
+# define CHAR_SIZE 4
# else
# define VPCMPEQ vpcmpeqb
+# define VPBROADCAST vpbroadcastb
+# define CHAR_SIZE 1
+# endif
+
+# ifdef USE_AS_RAWMEMCHR
+# define ERAW_PTR_REG ecx
+# define RRAW_PTR_REG rcx
+# define ALGN_PTR_REG rdi
+# else
+# define ERAW_PTR_REG edi
+# define RRAW_PTR_REG rdi
+# define ALGN_PTR_REG rcx
# endif
# ifndef VZEROUPPER
@@ -39,6 +53,7 @@
# endif
# define VEC_SIZE 32
+# define PAGE_SIZE 4096
.section SECTION(.text),"ax",@progbits
ENTRY (MEMCHR)
@@ -47,295 +62,349 @@ ENTRY (MEMCHR)
test %RDX_LP, %RDX_LP
jz L(null)
# endif
- movl %edi, %ecx
- /* Broadcast CHAR to YMM0. */
- vmovd %esi, %xmm0
# ifdef USE_AS_WMEMCHR
shl $2, %RDX_LP
- vpbroadcastd %xmm0, %ymm0
# else
# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
# endif
- vpbroadcastb %xmm0, %ymm0
# endif
+ /* Broadcast CHAR to YMMMATCH. */
+ vmovd %esi, %xmm0
+ VPBROADCAST %xmm0, %ymm0
/* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ movl %edi, %eax
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(cross_page_boundary)
/* Check the first VEC_SIZE bytes. */
- VPCMPEQ (%rdi), %ymm0, %ymm1
+ VPCMPEQ (%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
- testl %eax, %eax
-
# ifndef USE_AS_RAWMEMCHR
- jnz L(first_vec_x0_check)
- /* Adjust length and check the end of data. */
- subq $VEC_SIZE, %rdx
- jbe L(zero)
-# else
- jnz L(first_vec_x0)
+ /* If length < CHAR_PER_VEC handle special. */
+ cmpq $VEC_SIZE, %rdx
+ jbe L(first_vec_x0)
# endif
-
- /* Align data for aligned loads in the loop. */
- addq $VEC_SIZE, %rdi
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
+ testl %eax, %eax
+ jz L(aligned_more)
+ tzcntl %eax, %eax
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
# ifndef USE_AS_RAWMEMCHR
- /* Adjust length. */
- addq %rcx, %rdx
+ .p2align 5
+L(first_vec_x0):
+ /* Check if first match was before length. */
+ tzcntl %eax, %eax
+ xorl %ecx, %ecx
+ cmpl %eax, %edx
+ leaq (%rdi, %rax), %rax
+ cmovle %rcx, %rax
+ VZEROUPPER_RETURN
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
+L(null):
+ xorl %eax, %eax
+ ret
# endif
- jmp L(more_4x_vec)
-
.p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
- VPCMPEQ (%rdi), %ymm0, %ymm1
+L(cross_page_boundary):
+ /* Save pointer before aligning as its original value is necessary
+ for computer return address if byte is found or adjusting length
+ if it is not and this is memchr. */
+ movq %rdi, %rcx
+ /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
+ rdi for rawmemchr. */
+ orq $(VEC_SIZE - 1), %ALGN_PTR_REG
+ VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Calculate length until end of page (length checked for a
+ match). */
+ leaq 1(%ALGN_PTR_REG), %rsi
+ subq %RRAW_PTR_REG, %rsi
+# endif
/* Remove the leading bytes. */
- sarl %cl, %eax
- testl %eax, %eax
- jz L(aligned_more)
- tzcntl %eax, %eax
+ sarxl %ERAW_PTR_REG, %eax, %eax
# ifndef USE_AS_RAWMEMCHR
/* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
+ cmpq %rsi, %rdx
+ jbe L(first_vec_x0)
# endif
- addq %rdi, %rax
- addq %rcx, %rax
+ testl %eax, %eax
+ jz L(cross_page_continue)
+ tzcntl %eax, %eax
+ addq %RRAW_PTR_REG, %rax
L(return_vzeroupper):
ZERO_UPPER_VEC_REGISTERS_RETURN
.p2align 4
-L(aligned_more):
-# ifndef USE_AS_RAWMEMCHR
- /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
- instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
- overflow. */
- negq %rcx
- addq $VEC_SIZE, %rcx
+L(first_vec_x1):
+ tzcntl %eax, %eax
+ incq %rdi
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
- /* Check the end of data. */
- subq %rcx, %rdx
- jbe L(zero)
-# endif
+ .p2align 4
+L(first_vec_x2):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE + 1), %rdi
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
+
+ .p2align 4
+L(first_vec_x3):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE * 2 + 1), %rdi
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
- addq $VEC_SIZE, %rdi
-# ifndef USE_AS_RAWMEMCHR
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-# endif
+ .p2align 4
+L(first_vec_x4):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE * 3 + 1), %rdi
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
-L(more_4x_vec):
+ .p2align 4
+L(aligned_more):
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
since data is only aligned to VEC_SIZE. */
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+# ifndef USE_AS_RAWMEMCHR
+L(cross_page_continue):
+ /* Align data to VEC_SIZE - 1. */
+ xorl %ecx, %ecx
+ subl %edi, %ecx
+ orq $(VEC_SIZE - 1), %rdi
+ /* esi is for adjusting length to see if near the end. */
+ leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
+# else
+ orq $(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
+# endif
+ /* Load first VEC regardless. */
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Adjust length. If near end handle specially. */
+ subq %rsi, %rdx
+ jbe L(last_4x_vec_or_less)
+# endif
testl %eax, %eax
jnz L(first_vec_x1)
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x2)
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x3)
- addq $(VEC_SIZE * 4), %rdi
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x4)
# ifndef USE_AS_RAWMEMCHR
+ /* Check if at last VEC_SIZE * 4 length. */
subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-# endif
-
- /* Align data to 4 * VEC_SIZE. */
- movq %rdi, %rcx
- andl $(4 * VEC_SIZE - 1), %ecx
- andq $-(4 * VEC_SIZE), %rdi
-
-# ifndef USE_AS_RAWMEMCHR
- /* Adjust length. */
+ jbe L(last_4x_vec_or_less_cmpeq)
+ /* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
+ length. */
+ incq %rdi
+ movl %edi, %ecx
+ orq $(VEC_SIZE * 4 - 1), %rdi
+ andl $(VEC_SIZE * 4 - 1), %ecx
addq %rcx, %rdx
+# else
+ /* Align data to VEC_SIZE * 4 - 1 for loop. */
+ incq %rdi
+ orq $(VEC_SIZE * 4 - 1), %rdi
# endif
+ /* Compare 4 * VEC at a time forward. */
.p2align 4
L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- VPCMPEQ (%rdi), %ymm0, %ymm1
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
-
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
vpor %ymm1, %ymm2, %ymm5
vpor %ymm3, %ymm4, %ymm6
vpor %ymm5, %ymm6, %ymm5
- vpmovmskb %ymm5, %eax
- testl %eax, %eax
- jnz L(4x_vec_end)
-
- addq $(VEC_SIZE * 4), %rdi
-
+ vpmovmskb %ymm5, %ecx
# ifdef USE_AS_RAWMEMCHR
- jmp L(loop_4x_vec)
+ subq $-(VEC_SIZE * 4), %rdi
+ testl %ecx, %ecx
+ jz L(loop_4x_vec)
# else
- subq $(VEC_SIZE * 4), %rdx
- ja L(loop_4x_vec)
+ testl %ecx, %ecx
+ jnz L(loop_4x_vec_end)
-L(last_4x_vec_or_less):
- /* Less than 4 * VEC and aligned to VEC_SIZE. */
- addl $(VEC_SIZE * 2), %edx
- jle L(last_2x_vec)
+ subq $-(VEC_SIZE * 4), %rdi
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
+ subq $(VEC_SIZE * 4), %rdx
+ ja L(loop_4x_vec)
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+ /* Fall through into less than 4 remaining vectors of length case.
+ */
+ VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
+ .p2align 4
+L(last_4x_vec_or_less):
+ /* Check if first VEC contained match. */
testl %eax, %eax
- jnz L(first_vec_x1)
+ jnz L(first_vec_x1_check)
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
+ /* If remaining length > VEC_SIZE * 2. */
+ addl $(VEC_SIZE * 2), %edx
+ jg L(last_4x_vec)
- jnz L(first_vec_x2_check)
- subl $VEC_SIZE, %edx
- jle L(zero)
+L(last_2x_vec):
+ /* If remaining length < VEC_SIZE. */
+ addl $VEC_SIZE, %edx
+ jle L(zero_end)
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ /* Check VEC2 and compare any match with remaining length. */
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
- testl %eax, %eax
-
- jnz L(first_vec_x3_check)
- xorl %eax, %eax
+ tzcntl %eax, %eax
+ cmpl %eax, %edx
+ jbe L(set_zero_end)
+ addq $(VEC_SIZE + 1), %rdi
+ addq %rdi, %rax
+L(zero_end):
VZEROUPPER_RETURN
.p2align 4
-L(last_2x_vec):
- addl $(VEC_SIZE * 2), %edx
- VPCMPEQ (%rdi), %ymm0, %ymm1
+L(loop_4x_vec_end):
+# endif
+ /* rawmemchr will fall through into this if match was found in
+ loop. */
+
vpmovmskb %ymm1, %eax
testl %eax, %eax
+ jnz L(last_vec_x1_return)
- jnz L(first_vec_x0_check)
- subl $VEC_SIZE, %edx
- jle L(zero)
-
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm2, %eax
testl %eax, %eax
- jnz L(first_vec_x1_check)
- xorl %eax, %eax
- VZEROUPPER_RETURN
+ jnz L(last_vec_x2_return)
- .p2align 4
-L(first_vec_x0_check):
- tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
+ vpmovmskb %ymm3, %eax
+ /* Combine VEC3 matches (eax) with VEC4 matches (ecx). */
+ salq $32, %rcx
+ orq %rcx, %rax
+ tzcntq %rax, %rax
+# ifdef USE_AS_RAWMEMCHR
+ subq $(VEC_SIZE * 2 - 1), %rdi
+# else
+ subq $-(VEC_SIZE * 2 + 1), %rdi
+# endif
addq %rdi, %rax
VZEROUPPER_RETURN
+# ifndef USE_AS_RAWMEMCHR
.p2align 4
L(first_vec_x1_check):
tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $VEC_SIZE, %rax
+ /* Adjust length. */
+ subl $-(VEC_SIZE * 4), %edx
+ /* Check if match within remaining length. */
+ cmpl %eax, %edx
+ jbe L(set_zero_end)
+ incq %rdi
addq %rdi, %rax
VZEROUPPER_RETURN
+ .p2align 4
+L(set_zero_end):
+ xorl %eax, %eax
+ VZEROUPPER_RETURN
+# endif
.p2align 4
-L(first_vec_x2_check):
+L(last_vec_x1_return):
tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $(VEC_SIZE * 2), %rax
+# ifdef USE_AS_RAWMEMCHR
+ subq $(VEC_SIZE * 4 - 1), %rdi
+# else
+ incq %rdi
+# endif
addq %rdi, %rax
VZEROUPPER_RETURN
.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x2_return):
tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $(VEC_SIZE * 3), %rax
+# ifdef USE_AS_RAWMEMCHR
+ subq $(VEC_SIZE * 3 - 1), %rdi
+# else
+ subq $-(VEC_SIZE + 1), %rdi
+# endif
addq %rdi, %rax
VZEROUPPER_RETURN
+# ifndef USE_AS_RAWMEMCHR
.p2align 4
-L(zero):
- xorl %eax, %eax
- jmp L(return_vzeroupper)
+L(last_4x_vec_or_less_cmpeq):
+ VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ subq $-(VEC_SIZE * 4), %rdi
+ /* Check first VEC regardless. */
+ testl %eax, %eax
+ jnz L(first_vec_x1_check)
+ /* If remaining length <= CHAR_PER_VEC * 2. */
+ addl $(VEC_SIZE * 2), %edx
+ jle L(last_2x_vec)
.p2align 4
-L(null):
- xorl %eax, %eax
- ret
-# endif
+L(last_4x_vec):
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2_return)
- .p2align 4
-L(first_vec_x0):
- tzcntl %eax, %eax
- addq %rdi, %rax
- VZEROUPPER_RETURN
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
- .p2align 4
-L(first_vec_x1):
- tzcntl %eax, %eax
- addq $VEC_SIZE, %rax
- addq %rdi, %rax
- VZEROUPPER_RETURN
+ /* Create mask for possible matches within remaining length. */
+ movq $-1, %rcx
+ bzhiq %rdx, %rcx, %rcx
- .p2align 4
-L(first_vec_x2):
+ /* Test matches in data against length match. */
+ andl %ecx, %eax
+ jnz L(last_vec_x3)
+
+ /* if remaining length <= VEC_SIZE * 3 (Note this is after
+ remaining length was found to be > VEC_SIZE * 2. */
+ subl $VEC_SIZE, %edx
+ jbe L(zero_end2)
+
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Shift remaining length mask for last VEC. */
+ shrq $32, %rcx
+ andl %ecx, %eax
+ jz L(zero_end2)
tzcntl %eax, %eax
- addq $(VEC_SIZE * 2), %rax
+ addq $(VEC_SIZE * 3 + 1), %rdi
addq %rdi, %rax
+L(zero_end2):
VZEROUPPER_RETURN
.p2align 4
-L(4x_vec_end):
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
- vpmovmskb %ymm2, %eax
- testl %eax, %eax
- jnz L(first_vec_x1)
- vpmovmskb %ymm3, %eax
- testl %eax, %eax
- jnz L(first_vec_x2)
- vpmovmskb %ymm4, %eax
- testl %eax, %eax
-L(first_vec_x3):
+L(last_vec_x3):
tzcntl %eax, %eax
- addq $(VEC_SIZE * 3), %rax
+ subq $-(VEC_SIZE * 2 + 1), %rdi
addq %rdi, %rax
VZEROUPPER_RETURN
+# endif
END (MEMCHR)
#endif
--
GitLab
View file
@ -1,388 +0,0 @@
From 645a158978f9520e74074e8c14047503be4db0f0 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed, 9 Jun 2021 16:25:32 -0400
Subject: [PATCH] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 [BZ
#27974]
Content-type: text/plain; charset=UTF-8
This commit fixes the bug mentioned in the previous commit.
The previous implementations of wmemchr in these files relied
on n * sizeof(wchar_t) not overflowing, which the standard does not guarantee.
The new overflow tests added in the previous commit now
pass (as well as all the other tests).
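The overflow is easy to reproduce in plain C (a sketch assuming LP64
Linux, where wchar_t is 4 bytes; not glibc code):

    #include <stddef.h>
    #include <stdio.h>

    int
    main (void)
    {
      /* Any n above SIZE_MAX / sizeof (wchar_t) overflows the byte
         conversion: 2^62 * 4 wraps to 0, so the old shl $2 prologue
         turned a huge element count into an empty search.  The fix
         keeps the length in characters (CHAR_PER_VEC) instead.  */
      size_t n = (size_t) 1 << 62;
      size_t bytes = n * sizeof (wchar_t);  /* wraps to 0 */
      printf ("n = %zu, bytes = %zu\n", n, bytes);
      return 0;
    }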
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/memchr.S | 77 +++++++++++++++++++-------
sysdeps/x86_64/multiarch/memchr-avx2.S | 58 +++++++++++++------
2 files changed, 98 insertions(+), 37 deletions(-)
diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
index cb320257..24f9a0c5 100644
--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
@@ -21,9 +21,11 @@
#ifdef USE_AS_WMEMCHR
# define MEMCHR wmemchr
# define PCMPEQ pcmpeqd
+# define CHAR_PER_VEC 4
#else
# define MEMCHR memchr
# define PCMPEQ pcmpeqb
+# define CHAR_PER_VEC 16
#endif
/* fast SSE2 version with using pmaxub and 64 byte loop */
@@ -33,15 +35,14 @@ ENTRY(MEMCHR)
movd %esi, %xmm1
mov %edi, %ecx
+#ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %edx, %edx
+#endif
#ifdef USE_AS_WMEMCHR
test %RDX_LP, %RDX_LP
jz L(return_null)
- shl $2, %RDX_LP
#else
-# ifdef __ILP32__
- /* Clear the upper 32 bits. */
- movl %edx, %edx
-# endif
punpcklbw %xmm1, %xmm1
test %RDX_LP, %RDX_LP
jz L(return_null)
@@ -60,13 +61,16 @@ ENTRY(MEMCHR)
test %eax, %eax
jnz L(matches_1)
- sub $16, %rdx
+ sub $CHAR_PER_VEC, %rdx
jbe L(return_null)
add $16, %rdi
and $15, %ecx
and $-16, %rdi
+#ifdef USE_AS_WMEMCHR
+ shr $2, %ecx
+#endif
add %rcx, %rdx
- sub $64, %rdx
+ sub $(CHAR_PER_VEC * 4), %rdx
jbe L(exit_loop)
jmp L(loop_prolog)
@@ -77,16 +81,21 @@ L(crosscache):
movdqa (%rdi), %xmm0
PCMPEQ %xmm1, %xmm0
-/* Check if there is a match. */
+ /* Check if there is a match. */
pmovmskb %xmm0, %eax
-/* Remove the leading bytes. */
+ /* Remove the leading bytes. */
sar %cl, %eax
test %eax, %eax
je L(unaligned_no_match)
-/* Check which byte is a match. */
+ /* Check which byte is a match. */
bsf %eax, %eax
-
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
add %rdi, %rax
add %rcx, %rax
@@ -94,15 +103,18 @@ L(crosscache):
.p2align 4
L(unaligned_no_match):
- /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using
+ /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using
"rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void
possible addition overflow. */
neg %rcx
add $16, %rcx
+#ifdef USE_AS_WMEMCHR
+ shr $2, %ecx
+#endif
sub %rcx, %rdx
jbe L(return_null)
add $16, %rdi
- sub $64, %rdx
+ sub $(CHAR_PER_VEC * 4), %rdx
jbe L(exit_loop)
.p2align 4
@@ -135,7 +147,7 @@ L(loop_prolog):
test $0x3f, %rdi
jz L(align64_loop)
- sub $64, %rdx
+ sub $(CHAR_PER_VEC * 4), %rdx
jbe L(exit_loop)
movdqa (%rdi), %xmm0
@@ -167,11 +179,14 @@ L(loop_prolog):
mov %rdi, %rcx
and $-64, %rdi
and $63, %ecx
+#ifdef USE_AS_WMEMCHR
+ shr $2, %ecx
+#endif
add %rcx, %rdx
.p2align 4
L(align64_loop):
- sub $64, %rdx
+ sub $(CHAR_PER_VEC * 4), %rdx
jbe L(exit_loop)
movdqa (%rdi), %xmm0
movdqa 16(%rdi), %xmm2
@@ -218,7 +233,7 @@ L(align64_loop):
.p2align 4
L(exit_loop):
- add $32, %edx
+ add $(CHAR_PER_VEC * 2), %edx
jle L(exit_loop_32)
movdqa (%rdi), %xmm0
@@ -238,7 +253,7 @@ L(exit_loop):
pmovmskb %xmm3, %eax
test %eax, %eax
jnz L(matches32_1)
- sub $16, %edx
+ sub $CHAR_PER_VEC, %edx
jle L(return_null)
PCMPEQ 48(%rdi), %xmm1
@@ -250,13 +265,13 @@ L(exit_loop):
.p2align 4
L(exit_loop_32):
- add $32, %edx
+ add $(CHAR_PER_VEC * 2), %edx
movdqa (%rdi), %xmm0
PCMPEQ %xmm1, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
jnz L(matches_1)
- sub $16, %edx
+ sub $CHAR_PER_VEC, %edx
jbe L(return_null)
PCMPEQ 16(%rdi), %xmm1
@@ -293,7 +308,13 @@ L(matches32):
.p2align 4
L(matches_1):
bsf %eax, %eax
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
add %rdi, %rax
ret
@@ -301,7 +322,13 @@ L(matches_1):
.p2align 4
L(matches16_1):
bsf %eax, %eax
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
lea 16(%rdi, %rax), %rax
ret
@@ -309,7 +336,13 @@ L(matches16_1):
.p2align 4
L(matches32_1):
bsf %eax, %eax
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
lea 32(%rdi, %rax), %rax
ret
@@ -317,7 +350,13 @@ L(matches32_1):
.p2align 4
L(matches48_1):
bsf %eax, %eax
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
lea 48(%rdi, %rax), %rax
ret
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index b377f22e..16027abb 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -54,21 +54,19 @@
# define VEC_SIZE 32
# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
.section SECTION(.text),"ax",@progbits
ENTRY (MEMCHR)
# ifndef USE_AS_RAWMEMCHR
/* Check for zero length. */
- test %RDX_LP, %RDX_LP
- jz L(null)
-# endif
-# ifdef USE_AS_WMEMCHR
- shl $2, %RDX_LP
-# else
# ifdef __ILP32__
- /* Clear the upper 32 bits. */
- movl %edx, %edx
+ /* Clear upper bits. */
+ and %RDX_LP, %RDX_LP
+# else
+ test %RDX_LP, %RDX_LP
# endif
+ jz L(null)
# endif
/* Broadcast CHAR to YMMMATCH. */
vmovd %esi, %xmm0
@@ -84,7 +82,7 @@ ENTRY (MEMCHR)
vpmovmskb %ymm1, %eax
# ifndef USE_AS_RAWMEMCHR
/* If length < CHAR_PER_VEC handle special. */
- cmpq $VEC_SIZE, %rdx
+ cmpq $CHAR_PER_VEC, %rdx
jbe L(first_vec_x0)
# endif
testl %eax, %eax
@@ -98,6 +96,10 @@ ENTRY (MEMCHR)
L(first_vec_x0):
/* Check if first match was before length. */
tzcntl %eax, %eax
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %edx
+# endif
xorl %ecx, %ecx
cmpl %eax, %edx
leaq (%rdi, %rax), %rax
@@ -110,12 +112,12 @@ L(null):
# endif
.p2align 4
L(cross_page_boundary):
- /* Save pointer before aligning as its original value is necessary
- for computer return address if byte is found or adjusting length
- if it is not and this is memchr. */
+ /* Save pointer before aligning as its original value is
+ necessary for computer return address if byte is found or
+ adjusting length if it is not and this is memchr. */
movq %rdi, %rcx
- /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
- rdi for rawmemchr. */
+ /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
+ and rdi for rawmemchr. */
orq $(VEC_SIZE - 1), %ALGN_PTR_REG
VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
@@ -124,6 +126,10 @@ L(cross_page_boundary):
match). */
leaq 1(%ALGN_PTR_REG), %rsi
subq %RRAW_PTR_REG, %rsi
+# ifdef USE_AS_WMEMCHR
+ /* NB: Divide bytes by 4 to get wchar_t count. */
+ shrl $2, %esi
+# endif
# endif
/* Remove the leading bytes. */
sarxl %ERAW_PTR_REG, %eax, %eax
@@ -181,6 +187,10 @@ L(cross_page_continue):
orq $(VEC_SIZE - 1), %rdi
/* esi is for adjusting length to see if near the end. */
leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
+# ifdef USE_AS_WMEMCHR
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %esi
+# endif
# else
orq $(VEC_SIZE - 1), %rdi
L(cross_page_continue):
@@ -213,7 +223,7 @@ L(cross_page_continue):
# ifndef USE_AS_RAWMEMCHR
/* Check if at last VEC_SIZE * 4 length. */
- subq $(VEC_SIZE * 4), %rdx
+ subq $(CHAR_PER_VEC * 4), %rdx
jbe L(last_4x_vec_or_less_cmpeq)
/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
length. */
@@ -221,6 +231,10 @@ L(cross_page_continue):
movl %edi, %ecx
orq $(VEC_SIZE * 4 - 1), %rdi
andl $(VEC_SIZE * 4 - 1), %ecx
+# ifdef USE_AS_WMEMCHR
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %ecx
+# endif
addq %rcx, %rdx
# else
/* Align data to VEC_SIZE * 4 - 1 for loop. */
@@ -250,15 +264,19 @@ L(loop_4x_vec):
subq $-(VEC_SIZE * 4), %rdi
- subq $(VEC_SIZE * 4), %rdx
+ subq $(CHAR_PER_VEC * 4), %rdx
ja L(loop_4x_vec)
- /* Fall through into less than 4 remaining vectors of length case.
- */
+ /* Fall through into less than 4 remaining vectors of length
+ case. */
VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
.p2align 4
L(last_4x_vec_or_less):
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %edx
+# endif
/* Check if first VEC contained match. */
testl %eax, %eax
jnz L(first_vec_x1_check)
@@ -355,6 +373,10 @@ L(last_vec_x2_return):
L(last_4x_vec_or_less_cmpeq):
VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %edx
+# endif
subq $-(VEC_SIZE * 4), %rdi
/* Check first VEC regardless. */
testl %eax, %eax
--
GitLab
View file
@ -1,767 +0,0 @@
From aaa23c35071537e2dcf5807e956802ed215210aa Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon, 19 Apr 2021 19:36:07 -0400
Subject: [PATCH] x86: Optimize strlen-avx2.S
Content-type: text/plain; charset=UTF-8
No bug. This commit optimizes strlen-avx2.S. The optimizations are
mostly small things but they add up to roughly 10-30% performance
improvement for strlen. The results for strnlen are bit more
ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen
are all passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 +-
sysdeps/x86_64/multiarch/strlen-avx2.S | 532 +++++++++++++--------
2 files changed, 334 insertions(+), 214 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index cbfc1a5d..f1a6460a 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -285,10 +285,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strlen.c. */
IFUNC_IMPL (i, name, strlen,
IFUNC_IMPL_ADD (array, i, strlen,
- CPU_FEATURE_USABLE (AVX2),
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)),
__strlen_avx2)
IFUNC_IMPL_ADD (array, i, strlen,
(CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__strlen_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strlen,
@@ -301,10 +303,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strnlen.c. */
IFUNC_IMPL (i, name, strnlen,
IFUNC_IMPL_ADD (array, i, strnlen,
- CPU_FEATURE_USABLE (AVX2),
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)),
__strnlen_avx2)
IFUNC_IMPL_ADD (array, i, strnlen,
(CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__strnlen_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strnlen,
@@ -640,10 +644,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/wcslen.c. */
IFUNC_IMPL (i, name, wcslen,
IFUNC_IMPL_ADD (array, i, wcslen,
- CPU_FEATURE_USABLE (AVX2),
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)),
__wcslen_avx2)
IFUNC_IMPL_ADD (array, i, wcslen,
(CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__wcslen_avx2_rtm)
IFUNC_IMPL_ADD (array, i, wcslen,
@@ -656,10 +662,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/wcsnlen.c. */
IFUNC_IMPL (i, name, wcsnlen,
IFUNC_IMPL_ADD (array, i, wcsnlen,
- CPU_FEATURE_USABLE (AVX2),
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)),
__wcsnlen_avx2)
IFUNC_IMPL_ADD (array, i, wcsnlen,
(CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__wcsnlen_avx2_rtm)
IFUNC_IMPL_ADD (array, i, wcsnlen,
diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
index 82826e10..be8a5db5 100644
--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
@@ -27,9 +27,11 @@
# ifdef USE_AS_WCSLEN
# define VPCMPEQ vpcmpeqd
# define VPMINU vpminud
+# define CHAR_SIZE 4
# else
# define VPCMPEQ vpcmpeqb
# define VPMINU vpminub
+# define CHAR_SIZE 1
# endif
# ifndef VZEROUPPER
@@ -41,349 +43,459 @@
# endif
# define VEC_SIZE 32
+# define PAGE_SIZE 4096
.section SECTION(.text),"ax",@progbits
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
- /* Check for zero length. */
+ /* Check zero length. */
test %RSI_LP, %RSI_LP
jz L(zero)
+ /* Store max len in R8_LP before adjusting if using WCSLEN. */
+ mov %RSI_LP, %R8_LP
# ifdef USE_AS_WCSLEN
shl $2, %RSI_LP
# elif defined __ILP32__
/* Clear the upper 32 bits. */
movl %esi, %esi
# endif
- mov %RSI_LP, %R8_LP
# endif
- movl %edi, %ecx
+ movl %edi, %eax
movq %rdi, %rdx
vpxor %xmm0, %xmm0, %xmm0
-
+ /* Clear high bits from edi. Only keeping bits relevant to page
+ cross check. */
+ andl $(PAGE_SIZE - 1), %eax
/* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(cross_page_boundary)
/* Check the first VEC_SIZE bytes. */
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
-
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
# ifdef USE_AS_STRNLEN
- jnz L(first_vec_x0_check)
- /* Adjust length and check the end of data. */
- subq $VEC_SIZE, %rsi
- jbe L(max)
-# else
- jnz L(first_vec_x0)
+ /* If length < VEC_SIZE handle special. */
+ cmpq $VEC_SIZE, %rsi
+ jbe L(first_vec_x0)
# endif
-
- /* Align data for aligned loads in the loop. */
- addq $VEC_SIZE, %rdi
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
+ /* If empty continue to aligned_more. Otherwise return bit
+ position of first match. */
+ testl %eax, %eax
+ jz L(aligned_more)
+ tzcntl %eax, %eax
+# ifdef USE_AS_WCSLEN
+ shrl $2, %eax
+# endif
+ VZEROUPPER_RETURN
# ifdef USE_AS_STRNLEN
- /* Adjust length. */
- addq %rcx, %rsi
+L(zero):
+ xorl %eax, %eax
+ ret
- subq $(VEC_SIZE * 4), %rsi
- jbe L(last_4x_vec_or_less)
+ .p2align 4
+L(first_vec_x0):
+ /* Set bit for max len so that tzcnt will return min of max len
+ and position of first match. */
+ btsq %rsi, %rax
+ tzcntl %eax, %eax
+# ifdef USE_AS_WCSLEN
+ shrl $2, %eax
+# endif
+ VZEROUPPER_RETURN
# endif
- jmp L(more_4x_vec)
.p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- /* Remove the leading bytes. */
- sarl %cl, %eax
- testl %eax, %eax
- jz L(aligned_more)
+L(first_vec_x1):
tzcntl %eax, %eax
+ /* Safe to use 32 bit instructions as these are only called for
+ size = [1, 159]. */
# ifdef USE_AS_STRNLEN
- /* Check the end of data. */
- cmpq %rax, %rsi
- jbe L(max)
+ /* Use ecx which was computed earlier to compute correct value.
+ */
+ subl $(VEC_SIZE * 4 + 1), %ecx
+ addl %ecx, %eax
+# else
+ subl %edx, %edi
+ incl %edi
+ addl %edi, %eax
# endif
- addq %rdi, %rax
- addq %rcx, %rax
- subq %rdx, %rax
# ifdef USE_AS_WCSLEN
- shrq $2, %rax
+ shrl $2, %eax
# endif
-L(return_vzeroupper):
- ZERO_UPPER_VEC_REGISTERS_RETURN
+ VZEROUPPER_RETURN
.p2align 4
-L(aligned_more):
+L(first_vec_x2):
+ tzcntl %eax, %eax
+ /* Safe to use 32 bit instructions as these are only called for
+ size = [1, 159]. */
# ifdef USE_AS_STRNLEN
- /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE"
- with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
- to void possible addition overflow. */
- negq %rcx
- addq $VEC_SIZE, %rcx
-
- /* Check the end of data. */
- subq %rcx, %rsi
- jbe L(max)
+ /* Use ecx which was computed earlier to compute correct value.
+ */
+ subl $(VEC_SIZE * 3 + 1), %ecx
+ addl %ecx, %eax
+# else
+ subl %edx, %edi
+ addl $(VEC_SIZE + 1), %edi
+ addl %edi, %eax
# endif
+# ifdef USE_AS_WCSLEN
+ shrl $2, %eax
+# endif
+ VZEROUPPER_RETURN
- addq $VEC_SIZE, %rdi
+ .p2align 4
+L(first_vec_x3):
+ tzcntl %eax, %eax
+ /* Safe to use 32 bit instructions as these are only called for
+ size = [1, 159]. */
+# ifdef USE_AS_STRNLEN
+ /* Use ecx which was computed earlier to compute correct value.
+ */
+ subl $(VEC_SIZE * 2 + 1), %ecx
+ addl %ecx, %eax
+# else
+ subl %edx, %edi
+ addl $(VEC_SIZE * 2 + 1), %edi
+ addl %edi, %eax
+# endif
+# ifdef USE_AS_WCSLEN
+ shrl $2, %eax
+# endif
+ VZEROUPPER_RETURN
+ .p2align 4
+L(first_vec_x4):
+ tzcntl %eax, %eax
+ /* Safe to use 32 bit instructions as these are only called for
+ size = [1, 159]. */
# ifdef USE_AS_STRNLEN
- subq $(VEC_SIZE * 4), %rsi
- jbe L(last_4x_vec_or_less)
+ /* Use ecx which was computed earlier to compute correct value.
+ */
+ subl $(VEC_SIZE + 1), %ecx
+ addl %ecx, %eax
+# else
+ subl %edx, %edi
+ addl $(VEC_SIZE * 3 + 1), %edi
+ addl %edi, %eax
# endif
+# ifdef USE_AS_WCSLEN
+ shrl $2, %eax
+# endif
+ VZEROUPPER_RETURN
-L(more_4x_vec):
+ .p2align 5
+L(aligned_more):
+ /* Align data to VEC_SIZE - 1. This is the same number of
+ instructions as using andq with -VEC_SIZE but saves 4 bytes of
+ code on the x4 check. */
+ orq $(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
since data is only aligned to VEC_SIZE. */
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
-
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+# ifdef USE_AS_STRNLEN
+ /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
+ it simplies the logic in last_4x_vec_or_less. */
+ leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
+ subq %rdx, %rcx
+# endif
+ /* Load first VEC regardless. */
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
+# ifdef USE_AS_STRNLEN
+ /* Adjust length. If near end handle specially. */
+ subq %rcx, %rsi
+ jb L(last_4x_vec_or_less)
+# endif
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x1)
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x2)
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x3)
- addq $(VEC_SIZE * 4), %rdi
-
-# ifdef USE_AS_STRNLEN
- subq $(VEC_SIZE * 4), %rsi
- jbe L(last_4x_vec_or_less)
-# endif
-
- /* Align data to 4 * VEC_SIZE. */
- movq %rdi, %rcx
- andl $(4 * VEC_SIZE - 1), %ecx
- andq $-(4 * VEC_SIZE), %rdi
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x4)
+ /* Align data to VEC_SIZE * 4 - 1. */
# ifdef USE_AS_STRNLEN
- /* Adjust length. */
+ /* Before adjusting length check if at last VEC_SIZE * 4. */
+ cmpq $(VEC_SIZE * 4 - 1), %rsi
+ jbe L(last_4x_vec_or_less_load)
+ incq %rdi
+ movl %edi, %ecx
+ orq $(VEC_SIZE * 4 - 1), %rdi
+ andl $(VEC_SIZE * 4 - 1), %ecx
+ /* Readjust length. */
addq %rcx, %rsi
+# else
+ incq %rdi
+ orq $(VEC_SIZE * 4 - 1), %rdi
# endif
-
+ /* Compare 4 * VEC at a time forward. */
.p2align 4
L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- vmovdqa (%rdi), %ymm1
- vmovdqa VEC_SIZE(%rdi), %ymm2
- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3
- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4
- VPMINU %ymm1, %ymm2, %ymm5
- VPMINU %ymm3, %ymm4, %ymm6
- VPMINU %ymm5, %ymm6, %ymm5
-
- VPCMPEQ %ymm5, %ymm0, %ymm5
- vpmovmskb %ymm5, %eax
- testl %eax, %eax
- jnz L(4x_vec_end)
-
- addq $(VEC_SIZE * 4), %rdi
-
-# ifndef USE_AS_STRNLEN
- jmp L(loop_4x_vec)
-# else
+# ifdef USE_AS_STRNLEN
+ /* Break if at end of length. */
subq $(VEC_SIZE * 4), %rsi
- ja L(loop_4x_vec)
-
-L(last_4x_vec_or_less):
- /* Less than 4 * VEC and aligned to VEC_SIZE. */
- addl $(VEC_SIZE * 2), %esi
- jle L(last_2x_vec)
+ jb L(last_4x_vec_or_less_cmpeq)
+# endif
+ /* Save some code size by microfusing VPMINU with the load. Since
+ the matches in ymm2/ymm4 can only be returned if there where no
+ matches in ymm1/ymm3 respectively there is no issue with overlap.
+ */
+ vmovdqa 1(%rdi), %ymm1
+ VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
+ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
+ VPMINU (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4
+
+ VPMINU %ymm2, %ymm4, %ymm5
+ VPCMPEQ %ymm5, %ymm0, %ymm5
+ vpmovmskb %ymm5, %ecx
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
+ subq $-(VEC_SIZE * 4), %rdi
+ testl %ecx, %ecx
+ jz L(loop_4x_vec)
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x1)
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ VPCMPEQ %ymm1, %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ subq %rdx, %rdi
testl %eax, %eax
+ jnz L(last_vec_return_x0)
- jnz L(first_vec_x2_check)
- subl $VEC_SIZE, %esi
- jle L(max)
-
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ VPCMPEQ %ymm2, %ymm0, %ymm2
+ vpmovmskb %ymm2, %eax
testl %eax, %eax
-
- jnz L(first_vec_x3_check)
- movq %r8, %rax
-# ifdef USE_AS_WCSLEN
+ jnz L(last_vec_return_x1)
+
+ /* Combine last 2 VEC. */
+ VPCMPEQ %ymm3, %ymm0, %ymm3
+ vpmovmskb %ymm3, %eax
+ /* rcx has combined result from all 4 VEC. It will only be used if
+ the first 3 other VEC all did not contain a match. */
+ salq $32, %rcx
+ orq %rcx, %rax
+ tzcntq %rax, %rax
+ subq $(VEC_SIZE * 2 - 1), %rdi
+ addq %rdi, %rax
+# ifdef USE_AS_WCSLEN
shrq $2, %rax
-# endif
+# endif
VZEROUPPER_RETURN
+
+# ifdef USE_AS_STRNLEN
.p2align 4
-L(last_2x_vec):
- addl $(VEC_SIZE * 2), %esi
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
+L(last_4x_vec_or_less_load):
+ /* Depending on entry adjust rdi / prepare first VEC in ymm1. */
+ subq $-(VEC_SIZE * 4), %rdi
+L(last_4x_vec_or_less_cmpeq):
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
+L(last_4x_vec_or_less):
- jnz L(first_vec_x0_check)
- subl $VEC_SIZE, %esi
- jle L(max)
+ vpmovmskb %ymm1, %eax
+ /* If remaining length > VEC_SIZE * 2. This works if esi is off by
+ VEC_SIZE * 4. */
+ testl $(VEC_SIZE * 2), %esi
+ jnz L(last_4x_vec)
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ /* length may have been negative or positive by an offset of
+ VEC_SIZE * 4 depending on where this was called from. This fixes
+ that. */
+ andl $(VEC_SIZE * 4 - 1), %esi
testl %eax, %eax
- jnz L(first_vec_x1_check)
- movq %r8, %rax
-# ifdef USE_AS_WCSLEN
- shrq $2, %rax
-# endif
- VZEROUPPER_RETURN
+ jnz L(last_vec_x1_check)
- .p2align 4
-L(first_vec_x0_check):
+ subl $VEC_SIZE, %esi
+ jb L(max)
+
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
tzcntl %eax, %eax
/* Check the end of data. */
- cmpq %rax, %rsi
- jbe L(max)
+ cmpl %eax, %esi
+ jb L(max)
+ subq %rdx, %rdi
+ addl $(VEC_SIZE + 1), %eax
addq %rdi, %rax
- subq %rdx, %rax
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
VZEROUPPER_RETURN
+# endif
.p2align 4
-L(first_vec_x1_check):
+L(last_vec_return_x0):
tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rsi
- jbe L(max)
- addq $VEC_SIZE, %rax
+ subq $(VEC_SIZE * 4 - 1), %rdi
addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
shrq $2, %rax
-# endif
+# endif
VZEROUPPER_RETURN
.p2align 4
-L(first_vec_x2_check):
+L(last_vec_return_x1):
tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rsi
- jbe L(max)
- addq $(VEC_SIZE * 2), %rax
+ subq $(VEC_SIZE * 3 - 1), %rdi
addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
shrq $2, %rax
-# endif
+# endif
VZEROUPPER_RETURN
+# ifdef USE_AS_STRNLEN
.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x1_check):
+
tzcntl %eax, %eax
/* Check the end of data. */
- cmpq %rax, %rsi
- jbe L(max)
- addq $(VEC_SIZE * 3), %rax
+ cmpl %eax, %esi
+ jb L(max)
+ subq %rdx, %rdi
+ incl %eax
addq %rdi, %rax
- subq %rdx, %rax
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
VZEROUPPER_RETURN
- .p2align 4
L(max):
movq %r8, %rax
+ VZEROUPPER_RETURN
+
+ .p2align 4
+L(last_4x_vec):
+ /* Test first 2x VEC normally. */
+ testl %eax, %eax
+ jnz L(last_vec_x1)
+
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2)
+
+ /* Normalize length. */
+ andl $(VEC_SIZE * 4 - 1), %esi
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x3)
+
+ subl $(VEC_SIZE * 3), %esi
+ jb L(max)
+
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ tzcntl %eax, %eax
+ /* Check the end of data. */
+ cmpl %eax, %esi
+ jb L(max)
+ subq %rdx, %rdi
+ addl $(VEC_SIZE * 3 + 1), %eax
+ addq %rdi, %rax
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
VZEROUPPER_RETURN
- .p2align 4
-L(zero):
- xorl %eax, %eax
- ret
-# endif
.p2align 4
-L(first_vec_x0):
+L(last_vec_x1):
+ /* essentially duplicates of first_vec_x1 but use 64 bit
+ instructions. */
tzcntl %eax, %eax
+ subq %rdx, %rdi
+ incl %eax
addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
shrq $2, %rax
-# endif
+# endif
VZEROUPPER_RETURN
.p2align 4
-L(first_vec_x1):
+L(last_vec_x2):
+ /* essentially duplicates of first_vec_x1 but use 64 bit
+ instructions. */
tzcntl %eax, %eax
- addq $VEC_SIZE, %rax
+ subq %rdx, %rdi
+ addl $(VEC_SIZE + 1), %eax
addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
shrq $2, %rax
-# endif
+# endif
VZEROUPPER_RETURN
.p2align 4
-L(first_vec_x2):
+L(last_vec_x3):
tzcntl %eax, %eax
- addq $(VEC_SIZE * 2), %rax
+ subl $(VEC_SIZE * 2), %esi
+ /* Check the end of data. */
+ cmpl %eax, %esi
+ jb L(max_end)
+ subq %rdx, %rdi
+ addl $(VEC_SIZE * 2 + 1), %eax
addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
shrq $2, %rax
-# endif
+# endif
+ VZEROUPPER_RETURN
+L(max_end):
+ movq %r8, %rax
VZEROUPPER_RETURN
+# endif
+ /* Cold case for crossing page with first load. */
.p2align 4
-L(4x_vec_end):
- VPCMPEQ %ymm1, %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
- VPCMPEQ %ymm2, %ymm0, %ymm2
- vpmovmskb %ymm2, %eax
+L(cross_page_boundary):
+ /* Align data to VEC_SIZE - 1. */
+ orq $(VEC_SIZE - 1), %rdi
+ VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
+ so no need to manually mod rdx. */
+ sarxl %edx, %eax, %eax
+# ifdef USE_AS_STRNLEN
testl %eax, %eax
- jnz L(first_vec_x1)
- VPCMPEQ %ymm3, %ymm0, %ymm3
- vpmovmskb %ymm3, %eax
+ jnz L(cross_page_less_vec)
+ leaq 1(%rdi), %rcx
+ subq %rdx, %rcx
+ /* Check length. */
+ cmpq %rsi, %rcx
+ jb L(cross_page_continue)
+ movq %r8, %rax
+# else
testl %eax, %eax
- jnz L(first_vec_x2)
- VPCMPEQ %ymm4, %ymm0, %ymm4
- vpmovmskb %ymm4, %eax
-L(first_vec_x3):
+ jz L(cross_page_continue)
tzcntl %eax, %eax
- addq $(VEC_SIZE * 3), %rax
- addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
- shrq $2, %rax
+# ifdef USE_AS_WCSLEN
+ shrl $2, %eax
+# endif
# endif
+L(return_vzeroupper):
+ ZERO_UPPER_VEC_REGISTERS_RETURN
+
+# ifdef USE_AS_STRNLEN
+ .p2align 4
+L(cross_page_less_vec):
+ tzcntl %eax, %eax
+ cmpq %rax, %rsi
+ cmovb %esi, %eax
+# ifdef USE_AS_WCSLEN
+ shrl $2, %eax
+# endif
VZEROUPPER_RETURN
+# endif
END (STRLEN)
#endif
--
GitLab
View file
@ -1,701 +0,0 @@
From 2a76821c3081d2c0231ecd2618f52662cb48fccd Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon, 3 May 2021 03:03:19 -0400
Subject: [PATCH] x86: Optimize memchr-evex.S
Content-type: text/plain; charset=UTF-8
No bug. This commit optimizes memchr-evex.S. The optimizations include
replacing some branches with cmovcc, avoiding some branches entirely
in the less_4x_vec case, making the page cross logic less strict,
saving some ALU in the alignment process, and most importantly
increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and
test-wmemchr are all passing.
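The ILP gain in the 4x loop can be modeled in scalar C: compute all four
per-vector match tests independently, combine them, and branch once per
iteration instead of testing each vector in turn (an illustrative model,
not the patched code; BLOCK and block_has_match stand in for VEC_SIZE
and the VPCMP/VPMINU reduction):

    #include <stddef.h>
    #include <string.h>

    #define BLOCK 32

    static int
    block_has_match (const unsigned char *p, unsigned char c)
    {
      return memchr (p, c, BLOCK) != NULL;
    }

    /* Four independent tests per iteration keep the execution units
       busy; the single combined branch is what the rewritten loop's
       min/kortest reduction achieves with vector registers.  */
    static const unsigned char *
    find_block_4x (const unsigned char *p, unsigned char c, size_t iters)
    {
      for (size_t i = 0; i < iters; i++, p += 4 * BLOCK)
        {
          int m0 = block_has_match (p, c);
          int m1 = block_has_match (p + BLOCK, c);
          int m2 = block_has_match (p + 2 * BLOCK, c);
          int m3 = block_has_match (p + 3 * BLOCK, c);
          if (m0 | m1 | m2 | m3)
            return p;  /* caller re-scans these four blocks */
        }
      return NULL;
    }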
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/memchr-evex.S | 547 +++++++++++++++----------
1 file changed, 322 insertions(+), 225 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
index 6dd5d67b..81d5cd64 100644
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -26,14 +26,28 @@
# ifdef USE_AS_WMEMCHR
# define VPBROADCAST vpbroadcastd
-# define VPCMP vpcmpd
-# define SHIFT_REG r8d
+# define VPMINU vpminud
+# define VPCMP vpcmpd
+# define VPCMPEQ vpcmpeqd
+# define CHAR_SIZE 4
# else
# define VPBROADCAST vpbroadcastb
-# define VPCMP vpcmpb
-# define SHIFT_REG ecx
+# define VPMINU vpminub
+# define VPCMP vpcmpb
+# define VPCMPEQ vpcmpeqb
+# define CHAR_SIZE 1
# endif
+# ifdef USE_AS_RAWMEMCHR
+# define RAW_PTR_REG rcx
+# define ALGN_PTR_REG rdi
+# else
+# define RAW_PTR_REG rdi
+# define ALGN_PTR_REG rcx
+# endif
+
+# define XMMZERO xmm23
+# define YMMZERO ymm23
# define XMMMATCH xmm16
# define YMMMATCH ymm16
# define YMM1 ymm17
@@ -44,6 +58,8 @@
# define YMM6 ymm22
# define VEC_SIZE 32
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+# define PAGE_SIZE 4096
.section .text.evex,"ax",@progbits
ENTRY (MEMCHR)
@@ -51,11 +67,7 @@ ENTRY (MEMCHR)
/* Check for zero length. */
test %RDX_LP, %RDX_LP
jz L(zero)
-# endif
- movl %edi, %ecx
-# ifdef USE_AS_WMEMCHR
- shl $2, %RDX_LP
-# else
+
# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
@@ -64,318 +76,403 @@ ENTRY (MEMCHR)
/* Broadcast CHAR to YMMMATCH. */
VPBROADCAST %esi, %YMMMATCH
/* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ movl %edi, %eax
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(cross_page_boundary)
/* Check the first VEC_SIZE bytes. */
- VPCMP $0, (%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
-
+ VPCMP $0, (%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
# ifndef USE_AS_RAWMEMCHR
- jnz L(first_vec_x0_check)
- /* Adjust length and check the end of data. */
- subq $VEC_SIZE, %rdx
- jbe L(zero)
+ /* If length < CHAR_PER_VEC handle special. */
+ cmpq $CHAR_PER_VEC, %rdx
+ jbe L(first_vec_x0)
+# endif
+ testl %eax, %eax
+ jz L(aligned_more)
+ tzcntl %eax, %eax
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
- jnz L(first_vec_x0)
+ addq %rdi, %rax
# endif
-
- /* Align data for aligned loads in the loop. */
- addq $VEC_SIZE, %rdi
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
+ ret
# ifndef USE_AS_RAWMEMCHR
- /* Adjust length. */
- addq %rcx, %rdx
-
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-# endif
- jmp L(more_4x_vec)
+L(zero):
+ xorl %eax, %eax
+ ret
+ .p2align 5
+L(first_vec_x0):
+ /* Check if first match was before length. */
+ tzcntl %eax, %eax
+ xorl %ecx, %ecx
+ cmpl %eax, %edx
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+ cmovle %rcx, %rax
+ ret
+# else
+ /* NB: first_vec_x0 is 17 bytes which will leave
+ cross_page_boundary (which is relatively cold) close enough
+ to ideal alignment. So only realign L(cross_page_boundary) if
+ rawmemchr. */
.p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
+# endif
+L(cross_page_boundary):
+ /* Save pointer before aligning as its original value is
+ necessary for computer return address if byte is found or
+ adjusting length if it is not and this is memchr. */
+ movq %rdi, %rcx
+ /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi
+ for rawmemchr. */
+ andq $-VEC_SIZE, %ALGN_PTR_REG
+ VPCMP $0, (%ALGN_PTR_REG), %YMMMATCH, %k0
+ kmovd %k0, %r8d
# ifdef USE_AS_WMEMCHR
- /* NB: Divide shift count by 4 since each bit in K1 represent 4
+ /* NB: Divide shift count by 4 since each bit in K0 represent 4
bytes. */
- movl %ecx, %SHIFT_REG
- sarl $2, %SHIFT_REG
+ sarl $2, %eax
+# endif
+# ifndef USE_AS_RAWMEMCHR
+ movl $(PAGE_SIZE / CHAR_SIZE), %esi
+ subl %eax, %esi
# endif
- andq $-VEC_SIZE, %rdi
- VPCMP $0, (%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- /* Remove the leading bytes. */
- sarxl %SHIFT_REG, %eax, %eax
- testl %eax, %eax
- jz L(aligned_more)
- tzcntl %eax, %eax
# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
+ andl $(CHAR_PER_VEC - 1), %eax
# endif
+ /* Remove the leading bytes. */
+ sarxl %eax, %r8d, %eax
# ifndef USE_AS_RAWMEMCHR
/* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
+ cmpq %rsi, %rdx
+ jbe L(first_vec_x0)
+# endif
+ testl %eax, %eax
+ jz L(cross_page_continue)
+ tzcntl %eax, %eax
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
+# else
+ addq %RAW_PTR_REG, %rax
# endif
- addq %rdi, %rax
- addq %rcx, %rax
ret
.p2align 4
-L(aligned_more):
-# ifndef USE_AS_RAWMEMCHR
- /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
- instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
- overflow. */
- negq %rcx
- addq $VEC_SIZE, %rcx
+L(first_vec_x1):
+ tzcntl %eax, %eax
+ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+ ret
- /* Check the end of data. */
- subq %rcx, %rdx
- jbe L(zero)
-# endif
+ .p2align 4
+L(first_vec_x2):
+ tzcntl %eax, %eax
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
- addq $VEC_SIZE, %rdi
+ .p2align 4
+L(first_vec_x3):
+ tzcntl %eax, %eax
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
-# ifndef USE_AS_RAWMEMCHR
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-# endif
+ .p2align 4
+L(first_vec_x4):
+ tzcntl %eax, %eax
+ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
-L(more_4x_vec):
+ .p2align 5
+L(aligned_more):
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
since data is only aligned to VEC_SIZE. */
- VPCMP $0, (%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Align data to VEC_SIZE. */
+L(cross_page_continue):
+ xorl %ecx, %ecx
+ subl %edi, %ecx
+ andq $-VEC_SIZE, %rdi
+ /* esi is for adjusting length to see if near the end. */
+ leal (VEC_SIZE * 5)(%rdi, %rcx), %esi
+# ifdef USE_AS_WMEMCHR
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %esi
+# endif
+# else
+ andq $-VEC_SIZE, %rdi
+L(cross_page_continue):
+# endif
+ /* Load first VEC regardless. */
+ VPCMP $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Adjust length. If near end handle specially. */
+ subq %rsi, %rdx
+ jbe L(last_4x_vec_or_less)
+# endif
testl %eax, %eax
jnz L(first_vec_x1)
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x2)
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x3)
- addq $(VEC_SIZE * 4), %rdi
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x4)
+
# ifndef USE_AS_RAWMEMCHR
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-# endif
+ /* Check if at last CHAR_PER_VEC * 4 length. */
+ subq $(CHAR_PER_VEC * 4), %rdx
+ jbe L(last_4x_vec_or_less_cmpeq)
+ addq $VEC_SIZE, %rdi
- /* Align data to 4 * VEC_SIZE. */
- movq %rdi, %rcx
- andl $(4 * VEC_SIZE - 1), %ecx
+ /* Align data to VEC_SIZE * 4 for the loop and readjust length.
+ */
+# ifdef USE_AS_WMEMCHR
+ movl %edi, %ecx
andq $-(4 * VEC_SIZE), %rdi
-
-# ifndef USE_AS_RAWMEMCHR
- /* Adjust length. */
+ andl $(VEC_SIZE * 4 - 1), %ecx
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %ecx
addq %rcx, %rdx
+# else
+ addq %rdi, %rdx
+ andq $-(4 * VEC_SIZE), %rdi
+ subq %rdi, %rdx
+# endif
+# else
+ addq $VEC_SIZE, %rdi
+ andq $-(4 * VEC_SIZE), %rdi
# endif
+ vpxorq %XMMZERO, %XMMZERO, %XMMZERO
+
+ /* Compare 4 * VEC at a time forward. */
.p2align 4
L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- VPCMP $0, (%rdi), %YMMMATCH, %k1
- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k2
- kord %k1, %k2, %k5
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
-
- kord %k3, %k4, %k6
- kortestd %k5, %k6
- jnz L(4x_vec_end)
-
- addq $(VEC_SIZE * 4), %rdi
-
+ /* It would be possible to save some instructions using 4x VPCMP
+ but bottleneck on port 5 makes it not woth it. */
+ VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
+ /* xor will set bytes match esi to zero. */
+ vpxorq (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
+ vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
+ VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
+ /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */
+ VPMINU %YMM2, %YMM3, %YMM3 {%k1} {z}
+ VPCMP $0, %YMM3, %YMMZERO, %k2
# ifdef USE_AS_RAWMEMCHR
- jmp L(loop_4x_vec)
+ subq $-(VEC_SIZE * 4), %rdi
+ kortestd %k2, %k3
+ jz L(loop_4x_vec)
# else
- subq $(VEC_SIZE * 4), %rdx
+ kortestd %k2, %k3
+ jnz L(loop_4x_vec_end)
+
+ subq $-(VEC_SIZE * 4), %rdi
+
+ subq $(CHAR_PER_VEC * 4), %rdx
ja L(loop_4x_vec)
+ /* Fall through into less than 4 remaining vectors of length case.
+ */
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ addq $(VEC_SIZE * 3), %rdi
+ .p2align 4
L(last_4x_vec_or_less):
- /* Less than 4 * VEC and aligned to VEC_SIZE. */
- addl $(VEC_SIZE * 2), %edx
- jle L(last_2x_vec)
-
- VPCMP $0, (%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
+ /* Check if first VEC contained match. */
testl %eax, %eax
- jnz L(first_vec_x0)
+ jnz L(first_vec_x1_check)
- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(first_vec_x1)
+ /* If remaining length > CHAR_PER_VEC * 2. */
+ addl $(CHAR_PER_VEC * 2), %edx
+ jg L(last_4x_vec)
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
+L(last_2x_vec):
+ /* If remaining length < CHAR_PER_VEC. */
+ addl $CHAR_PER_VEC, %edx
+ jle L(zero_end)
- jnz L(first_vec_x2_check)
- subl $VEC_SIZE, %edx
- jle L(zero)
+ /* Check VEC2 and compare any match with remaining length. */
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ tzcntl %eax, %eax
+ cmpl %eax, %edx
+ jbe L(set_zero_end)
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+L(zero_end):
+ ret
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(first_vec_x3_check)
+ .p2align 4
+L(first_vec_x1_check):
+ tzcntl %eax, %eax
+ /* Adjust length. */
+ subl $-(CHAR_PER_VEC * 4), %edx
+ /* Check if match within remaining length. */
+ cmpl %eax, %edx
+ jbe L(set_zero_end)
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+L(set_zero_end):
xorl %eax, %eax
ret
.p2align 4
-L(last_2x_vec):
- addl $(VEC_SIZE * 2), %edx
- VPCMP $0, (%rdi), %YMMMATCH, %k1
+L(loop_4x_vec_end):
+# endif
+ /* rawmemchr will fall through into this if match was found in
+ loop. */
+
+ /* k1 has not of matches with VEC1. */
kmovd %k1, %eax
- testl %eax, %eax
+# ifdef USE_AS_WMEMCHR
+ subl $((1 << CHAR_PER_VEC) - 1), %eax
+# else
+ incl %eax
+# endif
+ jnz L(last_vec_x1_return)
- jnz L(first_vec_x0_check)
- subl $VEC_SIZE, %edx
- jle L(zero)
+ VPCMP $0, %YMM2, %YMMZERO, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2_return)
- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
+ kmovd %k2, %eax
testl %eax, %eax
- jnz L(first_vec_x1_check)
- xorl %eax, %eax
- ret
+ jnz L(last_vec_x3_return)
- .p2align 4
-L(first_vec_x0_check):
+ kmovd %k3, %eax
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
+# ifdef USE_AS_RAWMEMCHR
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+# else
+ leaq (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
# endif
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq %rdi, %rax
ret
.p2align 4
-L(first_vec_x1_check):
+L(last_vec_x1_return):
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
-# endif
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $VEC_SIZE, %rax
+# ifdef USE_AS_RAWMEMCHR
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+# else
addq %rdi, %rax
- ret
-
- .p2align 4
-L(first_vec_x2_check):
- tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
+# endif
+# else
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
# endif
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $(VEC_SIZE * 2), %rax
- addq %rdi, %rax
ret
.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x2_return):
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
+# ifdef USE_AS_RAWMEMCHR
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+# else
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
# endif
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $(VEC_SIZE * 3), %rax
- addq %rdi, %rax
ret
.p2align 4
-L(zero):
- xorl %eax, %eax
- ret
-# endif
-
- .p2align 4
-L(first_vec_x0):
+L(last_vec_x3_return):
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq (%rdi, %rax, 4), %rax
+# ifdef USE_AS_RAWMEMCHR
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
# else
- addq %rdi, %rax
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
# endif
ret
+
+# ifndef USE_AS_RAWMEMCHR
+L(last_4x_vec_or_less_cmpeq):
+ VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ subq $-(VEC_SIZE * 4), %rdi
+ /* Check first VEC regardless. */
+ testl %eax, %eax
+ jnz L(first_vec_x1_check)
+
+ /* If remaining length <= CHAR_PER_VEC * 2. */
+ addl $(CHAR_PER_VEC * 2), %edx
+ jle L(last_2x_vec)
+
.p2align 4
-L(first_vec_x1):
+L(last_4x_vec):
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2)
+
+
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ /* Create mask for possible matches within remaining length. */
+# ifdef USE_AS_WMEMCHR
+ movl $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
+ bzhil %edx, %ecx, %ecx
+# else
+ movq $-1, %rcx
+ bzhiq %rdx, %rcx, %rcx
+# endif
+ /* Test matches in data against length match. */
+ andl %ecx, %eax
+ jnz L(last_vec_x3)
+
+ /* If remaining length <= CHAR_PER_VEC * 3 (note this is after
+ remaining length was found to be > CHAR_PER_VEC * 2). */
+ subl $CHAR_PER_VEC, %edx
+ jbe L(zero_end2)
+
+
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ /* Shift remaining length mask for last VEC. */
+# ifdef USE_AS_WMEMCHR
+ shrl $CHAR_PER_VEC, %ecx
+# else
+ shrq $CHAR_PER_VEC, %rcx
+# endif
+ andl %ecx, %eax
+ jz L(zero_end2)
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq VEC_SIZE(%rdi, %rax, 4), %rax
-# else
- addq $VEC_SIZE, %rax
- addq %rdi, %rax
-# endif
+ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+L(zero_end2):
ret
- .p2align 4
-L(first_vec_x2):
+L(last_vec_x2):
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax
-# else
- addq $(VEC_SIZE * 2), %rax
- addq %rdi, %rax
-# endif
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
ret
.p2align 4
-L(4x_vec_end):
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
- kmovd %k2, %eax
- testl %eax, %eax
- jnz L(first_vec_x1)
- kmovd %k3, %eax
- testl %eax, %eax
- jnz L(first_vec_x2)
- kmovd %k4, %eax
- testl %eax, %eax
-L(first_vec_x3):
+L(last_vec_x3):
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax
-# else
- addq $(VEC_SIZE * 3), %rax
- addq %rdi, %rax
-# endif
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
ret
+# endif
END (MEMCHR)
#endif
--
GitLab

View file

@ -1,30 +0,0 @@
From 6ea916adfa0ab9af6e7dc6adcf6f977dfe017835 Mon Sep 17 00:00:00 2001
From: Alice Xu <alice.d.xu@gmail.com>
Date: Fri, 7 May 2021 19:03:21 -0700
Subject: [PATCH] x86-64: Fix an unknown vector operation in memchr-evex.S
Content-type: text/plain; charset=UTF-8
Commit 2a76821c308 introduced an instruction that the assembler
rejects with an "unknown vector operation" error. Fixed it by writing
the masked operand as "ymm{k1}{z}" rather than "ymm {k1} {z}".
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/memchr-evex.S | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
index 81d5cd64..f3fdad4f 100644
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -271,7 +271,7 @@ L(loop_4x_vec):
vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */
- VPMINU %YMM2, %YMM3, %YMM3 {%k1} {z}
+ VPMINU %YMM2, %YMM3, %YMM3{%k1}{z}
VPCMP $0, %YMM3, %YMMZERO, %k2
# ifdef USE_AS_RAWMEMCHR
subq $-(VEC_SIZE * 4), %rdi
--
GitLab

View file

@ -1,566 +0,0 @@
From a0db678071c60b6c47c468d231dd0b3694ba7a98 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Tue, 22 Jun 2021 20:42:10 -0700
Subject: [PATCH] x86-64: Move strlen.S to multiarch/strlen-vec.S
Content-type: text/plain; charset=UTF-8
Since strlen.S contains SSE2 version of strlen/strnlen and SSE4.1
version of wcslen/wcsnlen, move strlen.S to multiarch/strlen-vec.S
and include multiarch/strlen-vec.S from SSE2 and SSE4.1 variants.
This also removes the unused symbols __GI___strlen_sse2 and
__GI___wcsnlen_sse4_1.
---
sysdeps/x86_64/multiarch/strlen-sse2.S | 2 +-
sysdeps/x86_64/multiarch/strlen-vec.S | 257 ++++++++++++++++++++++
sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S | 2 +-
sysdeps/x86_64/strlen.S | 243 +-------------------
4 files changed, 262 insertions(+), 242 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/strlen-vec.S
Conflicts:
sysdeps/x86_64/strlen.S
(Copyright dates, URL)
diff --git a/sysdeps/x86_64/multiarch/strlen-sse2.S b/sysdeps/x86_64/multiarch/strlen-sse2.S
index 7bc57b8d..449c8a7f 100644
--- a/sysdeps/x86_64/multiarch/strlen-sse2.S
+++ b/sysdeps/x86_64/multiarch/strlen-sse2.S
@@ -20,4 +20,4 @@
# define strlen __strlen_sse2
#endif
-#include "../strlen.S"
+#include "strlen-vec.S"
diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
new file mode 100644
index 00000000..8f660bb9
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-vec.S
@@ -0,0 +1,257 @@
+/* SSE2 version of strlen and SSE4.1 version of wcslen.
+ Copyright (C) 2012-2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#ifdef AS_WCSLEN
+# define PMINU pminud
+# define PCMPEQ pcmpeqd
+# define SHIFT_RETURN shrq $2, %rax
+#else
+# define PMINU pminub
+# define PCMPEQ pcmpeqb
+# define SHIFT_RETURN
+#endif
+
+/* Long lived register in strlen(s), strnlen(s, n) are:
+
+ %xmm3 - zero
+ %rdi - s
+ %r10 (s+n) & (~(64-1))
+ %r11 s+n
+*/
+
+
+.text
+ENTRY(strlen)
+
+/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */
+#define FIND_ZERO \
+ PCMPEQ (%rax), %xmm0; \
+ PCMPEQ 16(%rax), %xmm1; \
+ PCMPEQ 32(%rax), %xmm2; \
+ PCMPEQ 48(%rax), %xmm3; \
+ pmovmskb %xmm0, %esi; \
+ pmovmskb %xmm1, %edx; \
+ pmovmskb %xmm2, %r8d; \
+ pmovmskb %xmm3, %ecx; \
+ salq $16, %rdx; \
+ salq $16, %rcx; \
+ orq %rsi, %rdx; \
+ orq %r8, %rcx; \
+ salq $32, %rcx; \
+ orq %rcx, %rdx;
+
+#ifdef AS_STRNLEN
+/* Do not read anything when n==0. */
+ test %RSI_LP, %RSI_LP
+ jne L(n_nonzero)
+ xor %rax, %rax
+ ret
+L(n_nonzero):
+# ifdef AS_WCSLEN
+ shl $2, %RSI_LP
+# endif
+
+/* Initialize long lived registers. */
+
+ add %RDI_LP, %RSI_LP
+ mov %RSI_LP, %R10_LP
+ and $-64, %R10_LP
+ mov %RSI_LP, %R11_LP
+#endif
+
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
+ movq %rdi, %rax
+ movq %rdi, %rcx
+ andq $4095, %rcx
+/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */
+ cmpq $4047, %rcx
+/* We cannot unify this branching as it would be ~6 cycles slower. */
+ ja L(cross_page)
+
+#ifdef AS_STRNLEN
+/* Test if end is among first 64 bytes. */
+# define STRNLEN_PROLOG \
+ mov %r11, %rsi; \
+ subq %rax, %rsi; \
+ andq $-64, %rax; \
+ testq $-64, %rsi; \
+ je L(strnlen_ret)
+#else
+# define STRNLEN_PROLOG andq $-64, %rax;
+#endif
+
+/* Ignore bits in mask that come before start of string. */
+#define PROLOG(lab) \
+ movq %rdi, %rcx; \
+ xorq %rax, %rcx; \
+ STRNLEN_PROLOG; \
+ sarq %cl, %rdx; \
+ test %rdx, %rdx; \
+ je L(lab); \
+ bsfq %rdx, %rax; \
+ SHIFT_RETURN; \
+ ret
+
+#ifdef AS_STRNLEN
+ andq $-16, %rax
+ FIND_ZERO
+#else
+ /* Test first 16 bytes unaligned. */
+ movdqu (%rax), %xmm4
+ PCMPEQ %xmm0, %xmm4
+ pmovmskb %xmm4, %edx
+ test %edx, %edx
+ je L(next48_bytes)
+ bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */
+ SHIFT_RETURN
+ ret
+
+L(next48_bytes):
+/* Same as FIND_ZERO except we do not check first 16 bytes. */
+ andq $-16, %rax
+ PCMPEQ 16(%rax), %xmm1
+ PCMPEQ 32(%rax), %xmm2
+ PCMPEQ 48(%rax), %xmm3
+ pmovmskb %xmm1, %edx
+ pmovmskb %xmm2, %r8d
+ pmovmskb %xmm3, %ecx
+ salq $16, %rdx
+ salq $16, %rcx
+ orq %r8, %rcx
+ salq $32, %rcx
+ orq %rcx, %rdx
+#endif
+
+ /* When no zero byte is found xmm1-3 are zero so we do not have to
+ zero them. */
+ PROLOG(loop)
+
+ .p2align 4
+L(cross_page):
+ andq $-64, %rax
+ FIND_ZERO
+ PROLOG(loop_init)
+
+#ifdef AS_STRNLEN
+/* We must do this check to correctly handle strnlen (s, -1). */
+L(strnlen_ret):
+ bts %rsi, %rdx
+ sarq %cl, %rdx
+ test %rdx, %rdx
+ je L(loop_init)
+ bsfq %rdx, %rax
+ SHIFT_RETURN
+ ret
+#endif
+ .p2align 4
+L(loop_init):
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
+#ifdef AS_STRNLEN
+ .p2align 4
+L(loop):
+
+ addq $64, %rax
+ cmpq %rax, %r10
+ je L(exit_end)
+
+ movdqa (%rax), %xmm0
+ PMINU 16(%rax), %xmm0
+ PMINU 32(%rax), %xmm0
+ PMINU 48(%rax), %xmm0
+ PCMPEQ %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ testl %edx, %edx
+ jne L(exit)
+ jmp L(loop)
+
+ .p2align 4
+L(exit_end):
+ cmp %rax, %r11
+ je L(first) /* Do not read when end is at page boundary. */
+ pxor %xmm0, %xmm0
+ FIND_ZERO
+
+L(first):
+ bts %r11, %rdx
+ bsfq %rdx, %rdx
+ addq %rdx, %rax
+ subq %rdi, %rax
+ SHIFT_RETURN
+ ret
+
+ .p2align 4
+L(exit):
+ pxor %xmm0, %xmm0
+ FIND_ZERO
+
+ bsfq %rdx, %rdx
+ addq %rdx, %rax
+ subq %rdi, %rax
+ SHIFT_RETURN
+ ret
+
+#else
+
+ /* Main loop. Unrolled twice to improve L2 cache performance on core2. */
+ .p2align 4
+L(loop):
+
+ movdqa 64(%rax), %xmm0
+ PMINU 80(%rax), %xmm0
+ PMINU 96(%rax), %xmm0
+ PMINU 112(%rax), %xmm0
+ PCMPEQ %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ testl %edx, %edx
+ jne L(exit64)
+
+ subq $-128, %rax
+
+ movdqa (%rax), %xmm0
+ PMINU 16(%rax), %xmm0
+ PMINU 32(%rax), %xmm0
+ PMINU 48(%rax), %xmm0
+ PCMPEQ %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ testl %edx, %edx
+ jne L(exit0)
+ jmp L(loop)
+
+ .p2align 4
+L(exit64):
+ addq $64, %rax
+L(exit0):
+ pxor %xmm0, %xmm0
+ FIND_ZERO
+
+ bsfq %rdx, %rdx
+ addq %rdx, %rax
+ subq %rdi, %rax
+ SHIFT_RETURN
+ ret
+
+#endif
+
+END(strlen)
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
index a8cab0cb..5fa51fe0 100644
--- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
+++ b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
@@ -2,4 +2,4 @@
#define AS_STRNLEN
#define strlen __wcsnlen_sse4_1
-#include "../strlen.S"
+#include "strlen-vec.S"
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index f845f3d4..ad047d84 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -1,5 +1,5 @@
-/* SSE2 version of strlen/wcslen.
- Copyright (C) 2012-2018 Free Software Foundation, Inc.
+/* SSE2 version of strlen.
+ Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -16,243 +16,6 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#include <sysdep.h>
+#include "multiarch/strlen-vec.S"
-#ifdef AS_WCSLEN
-# define PMINU pminud
-# define PCMPEQ pcmpeqd
-# define SHIFT_RETURN shrq $2, %rax
-#else
-# define PMINU pminub
-# define PCMPEQ pcmpeqb
-# define SHIFT_RETURN
-#endif
-
-/* Long lived register in strlen(s), strnlen(s, n) are:
-
- %xmm3 - zero
- %rdi - s
- %r10 (s+n) & (~(64-1))
- %r11 s+n
-*/
-
-
-.text
-ENTRY(strlen)
-
-/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */
-#define FIND_ZERO \
- PCMPEQ (%rax), %xmm0; \
- PCMPEQ 16(%rax), %xmm1; \
- PCMPEQ 32(%rax), %xmm2; \
- PCMPEQ 48(%rax), %xmm3; \
- pmovmskb %xmm0, %esi; \
- pmovmskb %xmm1, %edx; \
- pmovmskb %xmm2, %r8d; \
- pmovmskb %xmm3, %ecx; \
- salq $16, %rdx; \
- salq $16, %rcx; \
- orq %rsi, %rdx; \
- orq %r8, %rcx; \
- salq $32, %rcx; \
- orq %rcx, %rdx;
-
-#ifdef AS_STRNLEN
-/* Do not read anything when n==0. */
- test %RSI_LP, %RSI_LP
- jne L(n_nonzero)
- xor %rax, %rax
- ret
-L(n_nonzero):
-# ifdef AS_WCSLEN
- shl $2, %RSI_LP
-# endif
-
-/* Initialize long lived registers. */
-
- add %RDI_LP, %RSI_LP
- mov %RSI_LP, %R10_LP
- and $-64, %R10_LP
- mov %RSI_LP, %R11_LP
-#endif
-
- pxor %xmm0, %xmm0
- pxor %xmm1, %xmm1
- pxor %xmm2, %xmm2
- pxor %xmm3, %xmm3
- movq %rdi, %rax
- movq %rdi, %rcx
- andq $4095, %rcx
-/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */
- cmpq $4047, %rcx
-/* We cannot unify this branching as it would be ~6 cycles slower. */
- ja L(cross_page)
-
-#ifdef AS_STRNLEN
-/* Test if end is among first 64 bytes. */
-# define STRNLEN_PROLOG \
- mov %r11, %rsi; \
- subq %rax, %rsi; \
- andq $-64, %rax; \
- testq $-64, %rsi; \
- je L(strnlen_ret)
-#else
-# define STRNLEN_PROLOG andq $-64, %rax;
-#endif
-
-/* Ignore bits in mask that come before start of string. */
-#define PROLOG(lab) \
- movq %rdi, %rcx; \
- xorq %rax, %rcx; \
- STRNLEN_PROLOG; \
- sarq %cl, %rdx; \
- test %rdx, %rdx; \
- je L(lab); \
- bsfq %rdx, %rax; \
- SHIFT_RETURN; \
- ret
-
-#ifdef AS_STRNLEN
- andq $-16, %rax
- FIND_ZERO
-#else
- /* Test first 16 bytes unaligned. */
- movdqu (%rax), %xmm4
- PCMPEQ %xmm0, %xmm4
- pmovmskb %xmm4, %edx
- test %edx, %edx
- je L(next48_bytes)
- bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */
- SHIFT_RETURN
- ret
-
-L(next48_bytes):
-/* Same as FIND_ZERO except we do not check first 16 bytes. */
- andq $-16, %rax
- PCMPEQ 16(%rax), %xmm1
- PCMPEQ 32(%rax), %xmm2
- PCMPEQ 48(%rax), %xmm3
- pmovmskb %xmm1, %edx
- pmovmskb %xmm2, %r8d
- pmovmskb %xmm3, %ecx
- salq $16, %rdx
- salq $16, %rcx
- orq %r8, %rcx
- salq $32, %rcx
- orq %rcx, %rdx
-#endif
-
- /* When no zero byte is found xmm1-3 are zero so we do not have to
- zero them. */
- PROLOG(loop)
-
- .p2align 4
-L(cross_page):
- andq $-64, %rax
- FIND_ZERO
- PROLOG(loop_init)
-
-#ifdef AS_STRNLEN
-/* We must do this check to correctly handle strnlen (s, -1). */
-L(strnlen_ret):
- bts %rsi, %rdx
- sarq %cl, %rdx
- test %rdx, %rdx
- je L(loop_init)
- bsfq %rdx, %rax
- SHIFT_RETURN
- ret
-#endif
- .p2align 4
-L(loop_init):
- pxor %xmm1, %xmm1
- pxor %xmm2, %xmm2
- pxor %xmm3, %xmm3
-#ifdef AS_STRNLEN
- .p2align 4
-L(loop):
-
- addq $64, %rax
- cmpq %rax, %r10
- je L(exit_end)
-
- movdqa (%rax), %xmm0
- PMINU 16(%rax), %xmm0
- PMINU 32(%rax), %xmm0
- PMINU 48(%rax), %xmm0
- PCMPEQ %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- testl %edx, %edx
- jne L(exit)
- jmp L(loop)
-
- .p2align 4
-L(exit_end):
- cmp %rax, %r11
- je L(first) /* Do not read when end is at page boundary. */
- pxor %xmm0, %xmm0
- FIND_ZERO
-
-L(first):
- bts %r11, %rdx
- bsfq %rdx, %rdx
- addq %rdx, %rax
- subq %rdi, %rax
- SHIFT_RETURN
- ret
-
- .p2align 4
-L(exit):
- pxor %xmm0, %xmm0
- FIND_ZERO
-
- bsfq %rdx, %rdx
- addq %rdx, %rax
- subq %rdi, %rax
- SHIFT_RETURN
- ret
-
-#else
-
- /* Main loop. Unrolled twice to improve L2 cache performance on core2. */
- .p2align 4
-L(loop):
-
- movdqa 64(%rax), %xmm0
- PMINU 80(%rax), %xmm0
- PMINU 96(%rax), %xmm0
- PMINU 112(%rax), %xmm0
- PCMPEQ %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- testl %edx, %edx
- jne L(exit64)
-
- subq $-128, %rax
-
- movdqa (%rax), %xmm0
- PMINU 16(%rax), %xmm0
- PMINU 32(%rax), %xmm0
- PMINU 48(%rax), %xmm0
- PCMPEQ %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- testl %edx, %edx
- jne L(exit0)
- jmp L(loop)
-
- .p2align 4
-L(exit64):
- addq $64, %rax
-L(exit0):
- pxor %xmm0, %xmm0
- FIND_ZERO
-
- bsfq %rdx, %rdx
- addq %rdx, %rax
- subq %rdi, %rax
- SHIFT_RETURN
- ret
-
-#endif
-
-END(strlen)
libc_hidden_builtin_def (strlen)
--
GitLab

View file

@ -1,181 +0,0 @@
From 6f573a27b6c8b4236445810a44660612323f5a73 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed, 23 Jun 2021 01:19:34 -0400
Subject: [PATCH] x86-64: Add wcslen optimize for sse4.1
Content-type: text/plain; charset=UTF-8
No bug. This commit adds the ifunc / build infrastructure
necessary for wcslen to prefer the sse4.1 implementation
in strlen-vec.S. test-wcslen.c is passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
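
As context for what "ifunc / build infrastructure" means here, a rough
standalone sketch of the mechanism using the GNU ifunc attribute
directly (hypothetical names; glibc itself goes through its internal
libc_ifunc_redirected macro and __get_cpu_features, not this
simplified form):

#include <stddef.h>

/* Baseline implementation the resolver can always fall back to.  */
static size_t
my_wcslen_generic (const wchar_t *s)
{
  size_t n = 0;
  while (s[n] != L'\0')
    n++;
  return n;
}

/* Resolver: runs once at load time and returns the chosen function.
   glibc's IFUNC_SELECTOR would consult the CPU features here; this
   sketch unconditionally picks the generic version.  */
static size_t (*resolve_my_wcslen (void)) (const wchar_t *)
{
  return my_wcslen_generic;
}

/* Every later call to my_wcslen jumps straight to the resolved
   target.  */
size_t my_wcslen (const wchar_t *s)
     __attribute__ ((ifunc ("resolve_my_wcslen")));
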
---
sysdeps/x86_64/multiarch/Makefile | 4 +-
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 3 ++
sysdeps/x86_64/multiarch/ifunc-wcslen.h | 52 ++++++++++++++++++++++
sysdeps/x86_64/multiarch/wcslen-sse4_1.S | 4 ++
sysdeps/x86_64/multiarch/wcslen.c | 2 +-
sysdeps/x86_64/multiarch/wcsnlen.c | 34 +-------------
6 files changed, 63 insertions(+), 36 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/ifunc-wcslen.h
create mode 100644 sysdeps/x86_64/multiarch/wcslen-sse4_1.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 491c7698..65fde4eb 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -93,8 +93,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
wcscpy-ssse3 wcscpy-c \
wcschr-sse2 wcschr-avx2 \
wcsrchr-sse2 wcsrchr-avx2 \
- wcsnlen-sse4_1 wcsnlen-c \
- wcslen-sse2 wcslen-avx2 wcsnlen-avx2 \
+ wcslen-sse2 wcslen-sse4_1 wcslen-avx2 \
+ wcsnlen-c wcsnlen-sse4_1 wcsnlen-avx2 \
wcschr-avx2-rtm \
wcscmp-avx2-rtm \
wcslen-avx2-rtm \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index f1a6460a..580913ca 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -657,6 +657,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__wcslen_evex)
+ IFUNC_IMPL_ADD (array, i, wcsnlen,
+ CPU_FEATURE_USABLE (SSE4_1),
+ __wcsnlen_sse4_1)
IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2))
/* Support sysdeps/x86_64/multiarch/wcsnlen.c. */
diff --git a/sysdeps/x86_64/multiarch/ifunc-wcslen.h b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
new file mode 100644
index 00000000..39e33473
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
@@ -0,0 +1,52 @@
+/* Common definition for ifunc selections for wcslen and wcsnlen
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017-2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <init-arch.h>
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+ const struct cpu_features* cpu_features = __get_cpu_features ();
+
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+ return OPTIMIZE (evex);
+
+ if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+ return OPTIMIZE (avx2_rtm);
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ return OPTIMIZE (avx2);
+ }
+
+ if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
+ return OPTIMIZE (sse4_1);
+
+ return OPTIMIZE (sse2);
+}
diff --git a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
new file mode 100644
index 00000000..7e62621a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
@@ -0,0 +1,4 @@
+#define AS_WCSLEN
+#define strlen __wcslen_sse4_1
+
+#include "strlen-vec.S"
diff --git a/sysdeps/x86_64/multiarch/wcslen.c b/sysdeps/x86_64/multiarch/wcslen.c
index 6d06e47c..3b04b75b 100644
--- a/sysdeps/x86_64/multiarch/wcslen.c
+++ b/sysdeps/x86_64/multiarch/wcslen.c
@@ -24,7 +24,7 @@
# undef __wcslen
# define SYMBOL_NAME wcslen
-# include "ifunc-avx2.h"
+# include "ifunc-wcslen.h"
libc_ifunc_redirected (__redirect_wcslen, __wcslen, IFUNC_SELECTOR ());
weak_alias (__wcslen, wcslen);
diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c
index 20b731ae..06736410 100644
--- a/sysdeps/x86_64/multiarch/wcsnlen.c
+++ b/sysdeps/x86_64/multiarch/wcsnlen.c
@@ -24,39 +24,7 @@
# undef __wcsnlen
# define SYMBOL_NAME wcsnlen
-# include <init-arch.h>
-
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
-
-static inline void *
-IFUNC_SELECTOR (void)
-{
- const struct cpu_features* cpu_features = __get_cpu_features ();
-
- if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
- && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
- {
- if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
- && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
- return OPTIMIZE (evex);
-
- if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
- return OPTIMIZE (avx2_rtm);
-
- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
- return OPTIMIZE (avx2);
- }
-
- if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
- return OPTIMIZE (sse4_1);
-
- return OPTIMIZE (sse2);
-}
+# include "ifunc-wcslen.h"
libc_ifunc_redirected (__redirect_wcsnlen, __wcsnlen, IFUNC_SELECTOR ());
weak_alias (__wcsnlen, wcsnlen);
--
GitLab

View file

@ -1,396 +0,0 @@
From 231c56760c1e2ded21ad96bbb860b1f08c556c7a Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 21 Jan 2019 11:27:25 -0800
Subject: [PATCH] x86-64 memcpy: Properly handle the length parameter [BZ#
24097]
Content-type: text/plain; charset=UTF-8
On x32, the size_t parameter may be passed in the lower 32 bits of a
64-bit register with non-zero upper 32 bits. The string/memory
functions written in assembly must either use only the lower 32 bits
of a 64-bit register as the length or clear the upper 32 bits before
using the full 64-bit register for the length.
This patch fixes memcpy for x32. Tested on x86-64 and x32. On x86-64,
libc.so is the same with and without the fix.
[BZ# 24097]
CVE-2019-6488
* sysdeps/x86_64/multiarch/memcpy-ssse3-back.S: Use RDX_LP for
length. Clear the upper 32 bits of RDX register.
* sysdeps/x86_64/multiarch/memcpy-ssse3.S: Likewise.
* sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S:
Likewise.
* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:
Likewise.
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcpy.
* sysdeps/x86_64/x32/tst-size_t-memcpy.c: New file.
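
A rough C model of the failure mode the new test exercises (sketch
with hypothetical names, loosely following the test-size_t.h harness
pattern): on x32, size_t and pointers are 4 bytes each, so the 8-byte
struct below travels in a single 64-bit register, with the length in
the low half and unrelated pointer bits in the high half.

#include <stddef.h>
#include <string.h>

/* On x32 this struct is passed in one 64-bit register: len occupies
   bits 0-31 and p occupies bits 32-63.  On x86-64 proper it spans two
   registers, so the upper bits of the length are naturally zero.  */
struct arg_with_len
{
  size_t len;   /* The intended length: low 32 bits only.  */
  void *p;      /* Unrelated pointer bits sharing the register.  */
};

static void *
__attribute__ ((noinline))
do_copy (struct arg_with_len dst, struct arg_with_len src)
{
  /* A conforming memcpy must use only the low 32 bits of the length
     register here; reading all 64 bits would request a huge copy.  */
  return memcpy (dst.p, src.p, dst.len);
}
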
---
sysdeps/x86_64/multiarch/memcpy-ssse3-back.S | 17 ++++--
sysdeps/x86_64/multiarch/memcpy-ssse3.S | 17 ++++--
.../multiarch/memmove-avx512-no-vzeroupper.S | 16 +++--
.../multiarch/memmove-vec-unaligned-erms.S | 54 +++++++++--------
sysdeps/x86_64/x32/Makefile | 2 +-
sysdeps/x86_64/x32/tst-size_t-memcpy.c | 58 +++++++++++++++++++
6 files changed, 122 insertions(+), 42 deletions(-)
create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcpy.c
Conflicts:
ChangeLog
(removed)
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
index 3cd11233..568eebd3 100644
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
@@ -45,28 +45,33 @@
.section .text.ssse3,"ax",@progbits
#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
ENTRY (MEMPCPY_CHK)
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMPCPY_CHK)
ENTRY (MEMPCPY)
- movq %rdi, %rax
- addq %rdx, %rax
+ mov %RDI_LP, %RAX_LP
+ add %RDX_LP, %RAX_LP
jmp L(start)
END (MEMPCPY)
#endif
#if !defined USE_AS_BCOPY
ENTRY (MEMCPY_CHK)
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMCPY_CHK)
#endif
ENTRY (MEMCPY)
- mov %rdi, %rax
+ mov %RDI_LP, %RAX_LP
#ifdef USE_AS_MEMPCPY
- add %rdx, %rax
+ add %RDX_LP, %RAX_LP
+#endif
+
+#ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ mov %edx, %edx
#endif
#ifdef USE_AS_MEMMOVE
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
index 0240bfa3..0bd5ee99 100644
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
@@ -45,28 +45,33 @@
.section .text.ssse3,"ax",@progbits
#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
ENTRY (MEMPCPY_CHK)
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMPCPY_CHK)
ENTRY (MEMPCPY)
- movq %rdi, %rax
- addq %rdx, %rax
+ mov %RDI_LP, %RAX_LP
+ add %RDX_LP, %RAX_LP
jmp L(start)
END (MEMPCPY)
#endif
#if !defined USE_AS_BCOPY
ENTRY (MEMCPY_CHK)
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMCPY_CHK)
#endif
ENTRY (MEMCPY)
- mov %rdi, %rax
+ mov %RDI_LP, %RAX_LP
#ifdef USE_AS_MEMPCPY
- add %rdx, %rax
+ add %RDX_LP, %RAX_LP
+#endif
+
+#ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ mov %edx, %edx
#endif
#ifdef USE_AS_MEMMOVE
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
index effc3ac2..6ca2bbc9 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
@@ -24,27 +24,31 @@
.section .text.avx512,"ax",@progbits
ENTRY (__mempcpy_chk_avx512_no_vzeroupper)
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_avx512_no_vzeroupper)
ENTRY (__mempcpy_avx512_no_vzeroupper)
- movq %rdi, %rax
- addq %rdx, %rax
+ mov %RDI_LP, %RAX_LP
+ add %RDX_LP, %RAX_LP
jmp L(start)
END (__mempcpy_avx512_no_vzeroupper)
ENTRY (__memmove_chk_avx512_no_vzeroupper)
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_avx512_no_vzeroupper)
ENTRY (__memmove_avx512_no_vzeroupper)
- mov %rdi, %rax
+ mov %RDI_LP, %RAX_LP
# ifdef USE_AS_MEMPCPY
- add %rdx, %rax
+ add %RDX_LP, %RAX_LP
# endif
L(start):
+# ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ mov %edx, %edx
+# endif
lea (%rsi, %rdx), %rcx
lea (%rdi, %rdx), %r9
cmp $512, %rdx
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index c952576c..274aa1c7 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -95,20 +95,20 @@
.section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif
ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
- movq %rdi, %rax
- addq %rdx, %rax
+ mov %RDI_LP, %RAX_LP
+ add %RDX_LP, %RAX_LP
jmp L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif
@@ -116,9 +116,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
movq %rdi, %rax
L(start):
- cmpq $VEC_SIZE, %rdx
+# ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %edx, %edx
+# endif
+ cmp $VEC_SIZE, %RDX_LP
jb L(less_vec)
- cmpq $(VEC_SIZE * 2), %rdx
+ cmp $(VEC_SIZE * 2), %RDX_LP
ja L(more_2x_vec)
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(last_2x_vec):
@@ -138,38 +142,38 @@ END (MEMMOVE_SYMBOL (__memmove, unaligned))
# if VEC_SIZE == 16
ENTRY (__mempcpy_chk_erms)
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_erms)
/* Only used to measure performance of REP MOVSB. */
ENTRY (__mempcpy_erms)
- movq %rdi, %rax
+ mov %RDI_LP, %RAX_LP
/* Skip zero length. */
- testq %rdx, %rdx
+ test %RDX_LP, %RDX_LP
jz 2f
- addq %rdx, %rax
+ add %RDX_LP, %RAX_LP
jmp L(start_movsb)
END (__mempcpy_erms)
ENTRY (__memmove_chk_erms)
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_erms)
ENTRY (__memmove_erms)
movq %rdi, %rax
/* Skip zero length. */
- testq %rdx, %rdx
+ test %RDX_LP, %RDX_LP
jz 2f
L(start_movsb):
- movq %rdx, %rcx
- cmpq %rsi, %rdi
+ mov %RDX_LP, %RCX_LP
+ cmp %RSI_LP, %RDI_LP
jb 1f
/* Source == destination is less common. */
je 2f
- leaq (%rsi,%rcx), %rdx
- cmpq %rdx, %rdi
+ lea (%rsi,%rcx), %RDX_LP
+ cmp %RDX_LP, %RDI_LP
jb L(movsb_backward)
1:
rep movsb
@@ -189,20 +193,20 @@ strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
# endif
ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
- movq %rdi, %rax
- addq %rdx, %rax
+ mov %RDI_LP, %RAX_LP
+ add %RDX_LP, %RAX_LP
jmp L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif
@@ -210,9 +214,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
movq %rdi, %rax
L(start_erms):
- cmpq $VEC_SIZE, %rdx
+# ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %edx, %edx
+# endif
+ cmp $VEC_SIZE, %RDX_LP
jb L(less_vec)
- cmpq $(VEC_SIZE * 2), %rdx
+ cmp $(VEC_SIZE * 2), %RDX_LP
ja L(movsb_more_2x_vec)
L(last_2x_vec):
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
@@ -236,7 +244,7 @@ L(movsb):
/* Avoid slow backward REP MOVSB. */
jb L(more_8x_vec_backward)
1:
- movq %rdx, %rcx
+ mov %RDX_LP, %RCX_LP
rep movsb
L(nop):
ret
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
index ddec7f04..2fe1e5ac 100644
--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
@@ -6,7 +6,7 @@ CFLAGS-s_llround.c += -fno-builtin-lround
endif
ifeq ($(subdir),string)
-tests += tst-size_t-memchr tst-size_t-memcmp
+tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy
endif
ifeq ($(subdir),wcsmbs)
diff --git a/sysdeps/x86_64/x32/tst-size_t-memcpy.c b/sysdeps/x86_64/x32/tst-size_t-memcpy.c
new file mode 100644
index 00000000..66b71e17
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-memcpy.c
@@ -0,0 +1,58 @@
+/* Test memcpy with size_t in the lower 32 bits of 64-bit register.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define TEST_NAME "memcpy"
+#include "test-size_t.h"
+
+IMPL (memcpy, 1)
+
+typedef void *(*proto_t) (void *, const void *, size_t);
+
+static void *
+__attribute__ ((noinline, noclone))
+do_memcpy (parameter_t a, parameter_t b)
+{
+ return CALL (&b, a.p, b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+ test_init ();
+
+ parameter_t dest = { { page_size }, buf1 };
+ parameter_t src = { { 0 }, buf2 };
+
+ int ret = 0;
+ FOR_EACH_IMPL (impl, 0)
+ {
+ src.fn = impl->fn;
+ do_memcpy (dest, src);
+ int res = memcmp (dest.p, src.p, dest.len);
+ if (res)
+ {
+ error (0, 0, "Wrong result in function %s: %i != 0",
+ impl->name, res);
+ ret = 1;
+ }
+ }
+
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
--
GitLab

View file

@ -1,497 +0,0 @@
From a775a7a3eb1e85b54af0b4ee5ff4dcf66772a1fb Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed, 23 Jun 2021 01:56:29 -0400
Subject: [PATCH] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 [BZ
#27974]
Content-type: text/plain; charset=UTF-8
This commit fixes the bug mentioned in the previous commit.
The previous wcsnlen implementations in these files relied on
maxlen * sizeof(wchar_t) not overflowing, which is not guaranteed
by the standard. The new overflow tests added in the previous
commit now pass (as well as all the other tests).
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
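
In C terms the guard added to strlen-vec.S amounts to the dispatch
below (sketch with hypothetical names; the assembly realizes the test
as sar $62 on the length register, which is nonzero exactly when
maxlen >= 2^62, i.e. when scaling by glibc's 4-byte wchar_t cannot
fit in 64 bits):

#include <stdint.h>
#include <wchar.h>

/* If scaling maxlen to a byte count would wrap, the buffer cannot
   really hold maxlen wide characters, so a terminating L'\0' must lie
   in valid memory and the unbounded scan gives the same answer.  */
static size_t
wcsnlen_checked (const wchar_t *s, size_t maxlen)
{
  if (maxlen > SIZE_MAX / sizeof (wchar_t))
    return wcslen (s);           /* maxlen * sizeof (wchar_t) wraps.  */
  return wcsnlen (s, maxlen);    /* Byte bound is safe to form.  */
}
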
---
sysdeps/x86_64/multiarch/strlen-avx2.S | 130 ++++++++++++++++++-------
sysdeps/x86_64/multiarch/strlen-vec.S | 15 ++-
2 files changed, 107 insertions(+), 38 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
index be8a5db5..37688966 100644
--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
@@ -44,21 +44,21 @@
# define VEC_SIZE 32
# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
.section SECTION(.text),"ax",@progbits
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
/* Check zero length. */
+# ifdef __ILP32__
+ /* Clear upper bits. */
+ and %RSI_LP, %RSI_LP
+# else
test %RSI_LP, %RSI_LP
+# endif
jz L(zero)
/* Store max len in R8_LP before adjusting if using WCSLEN. */
mov %RSI_LP, %R8_LP
-# ifdef USE_AS_WCSLEN
- shl $2, %RSI_LP
-# elif defined __ILP32__
- /* Clear the upper 32 bits. */
- movl %esi, %esi
-# endif
# endif
movl %edi, %eax
movq %rdi, %rdx
@@ -72,10 +72,10 @@ ENTRY (STRLEN)
/* Check the first VEC_SIZE bytes. */
VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
# ifdef USE_AS_STRNLEN
/* If length < VEC_SIZE handle special. */
- cmpq $VEC_SIZE, %rsi
+ cmpq $CHAR_PER_VEC, %rsi
jbe L(first_vec_x0)
# endif
/* If empty continue to aligned_more. Otherwise return bit
@@ -84,6 +84,7 @@ ENTRY (STRLEN)
jz L(aligned_more)
tzcntl %eax, %eax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrl $2, %eax
# endif
VZEROUPPER_RETURN
@@ -97,9 +98,14 @@ L(zero):
L(first_vec_x0):
/* Set bit for max len so that tzcnt will return min of max len
and position of first match. */
+# ifdef USE_AS_WCSLEN
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %esi
+# endif
btsq %rsi, %rax
tzcntl %eax, %eax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrl $2, %eax
# endif
VZEROUPPER_RETURN
@@ -113,14 +119,19 @@ L(first_vec_x1):
# ifdef USE_AS_STRNLEN
/* Use ecx which was computed earlier to compute correct value.
*/
+# ifdef USE_AS_WCSLEN
+ leal -(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
+# else
subl $(VEC_SIZE * 4 + 1), %ecx
addl %ecx, %eax
+# endif
# else
subl %edx, %edi
incl %edi
addl %edi, %eax
# endif
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrl $2, %eax
# endif
VZEROUPPER_RETURN
@@ -133,14 +144,19 @@ L(first_vec_x2):
# ifdef USE_AS_STRNLEN
/* Use ecx which was computed earlier to compute correct value.
*/
+# ifdef USE_AS_WCSLEN
+ leal -(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
+# else
subl $(VEC_SIZE * 3 + 1), %ecx
addl %ecx, %eax
+# endif
# else
subl %edx, %edi
addl $(VEC_SIZE + 1), %edi
addl %edi, %eax
# endif
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrl $2, %eax
# endif
VZEROUPPER_RETURN
@@ -153,14 +169,19 @@ L(first_vec_x3):
# ifdef USE_AS_STRNLEN
/* Use ecx which was computed earlier to compute correct value.
*/
+# ifdef USE_AS_WCSLEN
+ leal -(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
+# else
subl $(VEC_SIZE * 2 + 1), %ecx
addl %ecx, %eax
+# endif
# else
subl %edx, %edi
addl $(VEC_SIZE * 2 + 1), %edi
addl %edi, %eax
# endif
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrl $2, %eax
# endif
VZEROUPPER_RETURN
@@ -173,14 +194,19 @@ L(first_vec_x4):
# ifdef USE_AS_STRNLEN
/* Use ecx which was computed earlier to compute correct value.
*/
+# ifdef USE_AS_WCSLEN
+ leal -(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
+# else
subl $(VEC_SIZE + 1), %ecx
addl %ecx, %eax
+# endif
# else
subl %edx, %edi
addl $(VEC_SIZE * 3 + 1), %edi
addl %edi, %eax
# endif
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrl $2, %eax
# endif
VZEROUPPER_RETURN
@@ -195,10 +221,14 @@ L(cross_page_continue):
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
since data is only aligned to VEC_SIZE. */
# ifdef USE_AS_STRNLEN
- /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
- it simplies the logic in last_4x_vec_or_less. */
+ /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
+ because it simplifies the logic in last_4x_vec_or_less. */
leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
subq %rdx, %rcx
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %ecx
+# endif
# endif
/* Load first VEC regardless. */
VPCMPEQ 1(%rdi), %ymm0, %ymm1
@@ -207,34 +237,38 @@ L(cross_page_continue):
subq %rcx, %rsi
jb L(last_4x_vec_or_less)
# endif
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x1)
VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x2)
VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x3)
VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x4)
/* Align data to VEC_SIZE * 4 - 1. */
# ifdef USE_AS_STRNLEN
/* Before adjusting length check if at last VEC_SIZE * 4. */
- cmpq $(VEC_SIZE * 4 - 1), %rsi
+ cmpq $(CHAR_PER_VEC * 4 - 1), %rsi
jbe L(last_4x_vec_or_less_load)
incq %rdi
movl %edi, %ecx
orq $(VEC_SIZE * 4 - 1), %rdi
andl $(VEC_SIZE * 4 - 1), %ecx
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %ecx
+# endif
/* Readjust length. */
addq %rcx, %rsi
# else
@@ -246,13 +280,13 @@ L(cross_page_continue):
L(loop_4x_vec):
# ifdef USE_AS_STRNLEN
/* Break if at end of length. */
- subq $(VEC_SIZE * 4), %rsi
+ subq $(CHAR_PER_VEC * 4), %rsi
jb L(last_4x_vec_or_less_cmpeq)
# endif
- /* Save some code size by microfusing VPMINU with the load. Since
- the matches in ymm2/ymm4 can only be returned if there where no
- matches in ymm1/ymm3 respectively there is no issue with overlap.
- */
+ /* Save some code size by microfusing VPMINU with the load.
+ Since the matches in ymm2/ymm4 can only be returned if there
+ were no matches in ymm1/ymm3 respectively there is no issue
+ with overlap. */
vmovdqa 1(%rdi), %ymm1
VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
@@ -260,7 +294,7 @@ L(loop_4x_vec):
VPMINU %ymm2, %ymm4, %ymm5
VPCMPEQ %ymm5, %ymm0, %ymm5
- vpmovmskb %ymm5, %ecx
+ vpmovmskb %ymm5, %ecx
subq $-(VEC_SIZE * 4), %rdi
testl %ecx, %ecx
@@ -268,27 +302,28 @@ L(loop_4x_vec):
VPCMPEQ %ymm1, %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
subq %rdx, %rdi
testl %eax, %eax
jnz L(last_vec_return_x0)
VPCMPEQ %ymm2, %ymm0, %ymm2
- vpmovmskb %ymm2, %eax
+ vpmovmskb %ymm2, %eax
testl %eax, %eax
jnz L(last_vec_return_x1)
/* Combine last 2 VEC. */
VPCMPEQ %ymm3, %ymm0, %ymm3
- vpmovmskb %ymm3, %eax
- /* rcx has combined result from all 4 VEC. It will only be used if
- the first 3 other VEC all did not contain a match. */
+ vpmovmskb %ymm3, %eax
+ /* rcx has combined result from all 4 VEC. It will only be used
+ if the first 3 other VEC all did not contain a match. */
salq $32, %rcx
orq %rcx, %rax
tzcntq %rax, %rax
subq $(VEC_SIZE * 2 - 1), %rdi
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -297,15 +332,19 @@ L(loop_4x_vec):
# ifdef USE_AS_STRNLEN
.p2align 4
L(last_4x_vec_or_less_load):
- /* Depending on entry adjust rdi / prepare first VEC in ymm1. */
+ /* Depending on entry adjust rdi / prepare first VEC in ymm1.
+ */
subq $-(VEC_SIZE * 4), %rdi
L(last_4x_vec_or_less_cmpeq):
VPCMPEQ 1(%rdi), %ymm0, %ymm1
L(last_4x_vec_or_less):
-
- vpmovmskb %ymm1, %eax
- /* If remaining length > VEC_SIZE * 2. This works if esi is off by
- VEC_SIZE * 4. */
+# ifdef USE_AS_WCSLEN
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %esi
+# endif
+ vpmovmskb %ymm1, %eax
+ /* If remaining length > VEC_SIZE * 2. This works if esi is off
+ by VEC_SIZE * 4. */
testl $(VEC_SIZE * 2), %esi
jnz L(last_4x_vec)
@@ -320,7 +359,7 @@ L(last_4x_vec_or_less):
jb L(max)
VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
tzcntl %eax, %eax
/* Check the end of data. */
cmpl %eax, %esi
@@ -329,6 +368,7 @@ L(last_4x_vec_or_less):
addl $(VEC_SIZE + 1), %eax
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -340,6 +380,7 @@ L(last_vec_return_x0):
subq $(VEC_SIZE * 4 - 1), %rdi
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -350,6 +391,7 @@ L(last_vec_return_x1):
subq $(VEC_SIZE * 3 - 1), %rdi
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -366,6 +408,7 @@ L(last_vec_x1_check):
incl %eax
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -381,14 +424,14 @@ L(last_4x_vec):
jnz L(last_vec_x1)
VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(last_vec_x2)
/* Normalize length. */
andl $(VEC_SIZE * 4 - 1), %esi
VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(last_vec_x3)
@@ -396,7 +439,7 @@ L(last_4x_vec):
jb L(max)
VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
tzcntl %eax, %eax
/* Check the end of data. */
cmpl %eax, %esi
@@ -405,6 +448,7 @@ L(last_4x_vec):
addl $(VEC_SIZE * 3 + 1), %eax
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -419,6 +463,7 @@ L(last_vec_x1):
incl %eax
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -432,6 +477,7 @@ L(last_vec_x2):
addl $(VEC_SIZE + 1), %eax
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -447,6 +493,7 @@ L(last_vec_x3):
addl $(VEC_SIZE * 2 + 1), %eax
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -455,13 +502,13 @@ L(max_end):
VZEROUPPER_RETURN
# endif
- /* Cold case for crossing page with first load. */
+ /* Cold case for crossing page with first load. */
.p2align 4
L(cross_page_boundary):
/* Align data to VEC_SIZE - 1. */
orq $(VEC_SIZE - 1), %rdi
VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
so no need to manually mod rdx. */
sarxl %edx, %eax, %eax
@@ -470,6 +517,10 @@ L(cross_page_boundary):
jnz L(cross_page_less_vec)
leaq 1(%rdi), %rcx
subq %rdx, %rcx
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
+ shrl $2, %ecx
+# endif
/* Check length. */
cmpq %rsi, %rcx
jb L(cross_page_continue)
@@ -479,6 +530,7 @@ L(cross_page_boundary):
jz L(cross_page_continue)
tzcntl %eax, %eax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide length by 4 to get wchar_t count. */
shrl $2, %eax
# endif
# endif
@@ -489,6 +541,10 @@ L(return_vzeroupper):
.p2align 4
L(cross_page_less_vec):
tzcntl %eax, %eax
+# ifdef USE_AS_WCSLEN
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %esi
+# endif
cmpq %rax, %rsi
cmovb %esi, %eax
# ifdef USE_AS_WCSLEN
diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
index 8f660bb9..439e486a 100644
--- a/sysdeps/x86_64/multiarch/strlen-vec.S
+++ b/sysdeps/x86_64/multiarch/strlen-vec.S
@@ -65,12 +65,25 @@ ENTRY(strlen)
ret
L(n_nonzero):
# ifdef AS_WCSLEN
- shl $2, %RSI_LP
+/* Check for overflow from maxlen * sizeof(wchar_t). If it would
+ overflow, the only way this program doesn't have undefined behavior
+ is if there is a null terminator in valid memory, so wcslen will
+ suffice. */
+ mov %RSI_LP, %R10_LP
+ sar $62, %R10_LP
+ test %R10_LP, %R10_LP
+ jnz __wcslen_sse4_1
+ sal $2, %RSI_LP
# endif
+
/* Initialize long lived registers. */
add %RDI_LP, %RSI_LP
+# ifdef AS_WCSLEN
+/* Check for overflow again from s + maxlen * sizeof(wchar_t). */
+ jbe __wcslen_sse4_1
+# endif
mov %RSI_LP, %R10_LP
and $-64, %R10_LP
mov %RSI_LP, %R11_LP
--
GitLab

View file

@ -1,745 +0,0 @@
From 4ba65586847751372520a36757c17f114588794e Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon, 19 Apr 2021 19:36:06 -0400
Subject: [PATCH] x86: Optimize strlen-evex.S
Content-type: text/plain; charset=UTF-8
No bug. This commit optimizes strlen-evex.S. The
optimizations are mostly small things but they add up to roughly
10-30% performance improvement for strlen. The results for strnlen are
a bit more ambiguous. test-strlen, test-strnlen, test-wcslen, and
test-wcsnlen are all passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
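
One of those small things is visible in the new first_vec_x0 path:
setting the bit at index maxlen in the match mask before tzcnt makes a
single instruction return min(maxlen, position of first match). A C
sketch of the idea (hypothetical helper; assumes maxlen < 64):

#include <stdint.h>

/* btsq %rsi, %rax; tzcnt -- with the bit at index maxlen forced on,
   the lowest set bit is either a real match or the length limit,
   whichever comes first.  */
static unsigned int
first_match_or_limit (uint64_t match_mask, unsigned int maxlen)
{
  match_mask |= UINT64_C (1) << maxlen;
  return (unsigned int) __builtin_ctzll (match_mask);
}
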
---
sysdeps/x86_64/multiarch/strlen-evex.S | 581 ++++++++++++++-----------
1 file changed, 317 insertions(+), 264 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
index 05838190..4bf6874b 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex.S
@@ -29,11 +29,13 @@
# ifdef USE_AS_WCSLEN
# define VPCMP vpcmpd
# define VPMINU vpminud
-# define SHIFT_REG r9d
+# define SHIFT_REG ecx
+# define CHAR_SIZE 4
# else
# define VPCMP vpcmpb
# define VPMINU vpminub
-# define SHIFT_REG ecx
+# define SHIFT_REG edx
+# define CHAR_SIZE 1
# endif
# define XMMZERO xmm16
@@ -46,132 +48,165 @@
# define YMM6 ymm22
# define VEC_SIZE 32
+# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
.section .text.evex,"ax",@progbits
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
- /* Check for zero length. */
+ /* Check zero length. */
test %RSI_LP, %RSI_LP
jz L(zero)
-# ifdef USE_AS_WCSLEN
- shl $2, %RSI_LP
-# elif defined __ILP32__
+# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %esi, %esi
# endif
mov %RSI_LP, %R8_LP
# endif
- movl %edi, %ecx
- movq %rdi, %rdx
+ movl %edi, %eax
vpxorq %XMMZERO, %XMMZERO, %XMMZERO
-
+ /* Clear high bits from edi. Only keeping bits relevant to page
+ cross check. */
+ andl $(PAGE_SIZE - 1), %eax
/* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(cross_page_boundary)
/* Check the first VEC_SIZE bytes. Each bit in K0 represents a
null byte. */
VPCMP $0, (%rdi), %YMMZERO, %k0
kmovd %k0, %eax
- testl %eax, %eax
-
# ifdef USE_AS_STRNLEN
- jnz L(first_vec_x0_check)
- /* Adjust length and check the end of data. */
- subq $VEC_SIZE, %rsi
- jbe L(max)
-# else
- jnz L(first_vec_x0)
+ /* If length < CHAR_PER_VEC handle special. */
+ cmpq $CHAR_PER_VEC, %rsi
+ jbe L(first_vec_x0)
# endif
-
- /* Align data for aligned loads in the loop. */
- addq $VEC_SIZE, %rdi
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
-
+ testl %eax, %eax
+ jz L(aligned_more)
+ tzcntl %eax, %eax
+ ret
# ifdef USE_AS_STRNLEN
- /* Adjust length. */
- addq %rcx, %rsi
+L(zero):
+ xorl %eax, %eax
+ ret
- subq $(VEC_SIZE * 4), %rsi
- jbe L(last_4x_vec_or_less)
+ .p2align 4
+L(first_vec_x0):
+ /* Set bit for max len so that tzcnt will return min of max len
+ and position of first match. */
+ btsq %rsi, %rax
+ tzcntl %eax, %eax
+ ret
# endif
- jmp L(more_4x_vec)
.p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
-
-# ifdef USE_AS_WCSLEN
- /* NB: Divide shift count by 4 since each bit in K0 represent 4
- bytes. */
- movl %ecx, %SHIFT_REG
- sarl $2, %SHIFT_REG
+L(first_vec_x1):
+ tzcntl %eax, %eax
+ /* Safe to use 32 bit instructions as these are only called for
+ size = [1, 159]. */
+# ifdef USE_AS_STRNLEN
+ /* Use ecx which was computed earlier to compute correct value.
+ */
+ leal -(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax
+# else
+ subl %edx, %edi
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %edi
+# endif
+ leal CHAR_PER_VEC(%rdi, %rax), %eax
# endif
- VPCMP $0, (%rdi), %YMMZERO, %k0
- kmovd %k0, %eax
+ ret
- /* Remove the leading bytes. */
- sarxl %SHIFT_REG, %eax, %eax
- testl %eax, %eax
- jz L(aligned_more)
+ .p2align 4
+L(first_vec_x2):
tzcntl %eax, %eax
-# ifdef USE_AS_WCSLEN
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
-# endif
+ /* Safe to use 32 bit instructions as these are only called for
+ size = [1, 159]. */
# ifdef USE_AS_STRNLEN
- /* Check the end of data. */
- cmpq %rax, %rsi
- jbe L(max)
-# endif
- addq %rdi, %rax
- addq %rcx, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
- shrq $2, %rax
+ /* Use ecx which was computed earlier to compute correct value.
+ */
+ leal -(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax
+# else
+ subl %edx, %edi
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %edi
+# endif
+ leal (CHAR_PER_VEC * 2)(%rdi, %rax), %eax
# endif
ret
.p2align 4
-L(aligned_more):
+L(first_vec_x3):
+ tzcntl %eax, %eax
+ /* Safe to use 32 bit instructions as these are only called for
+ size = [1, 159]. */
# ifdef USE_AS_STRNLEN
- /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE"
- with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
- to void possible addition overflow. */
- negq %rcx
- addq $VEC_SIZE, %rcx
-
- /* Check the end of data. */
- subq %rcx, %rsi
- jbe L(max)
+ /* Use ecx which was computed earlier to compute correct value.
+ */
+ leal -(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax
+# else
+ subl %edx, %edi
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %edi
+# endif
+ leal (CHAR_PER_VEC * 3)(%rdi, %rax), %eax
# endif
+ ret
- addq $VEC_SIZE, %rdi
-
+ .p2align 4
+L(first_vec_x4):
+ tzcntl %eax, %eax
+ /* Safe to use 32 bit instructions as these are only called for
+ size = [1, 159]. */
# ifdef USE_AS_STRNLEN
- subq $(VEC_SIZE * 4), %rsi
- jbe L(last_4x_vec_or_less)
+ /* Use ecx which was computed earlier to compute correct value.
+ */
+ leal -(CHAR_PER_VEC + 1)(%rcx, %rax), %eax
+# else
+ subl %edx, %edi
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %edi
+# endif
+ leal (CHAR_PER_VEC * 4)(%rdi, %rax), %eax
# endif
+ ret
-L(more_4x_vec):
+ .p2align 5
+L(aligned_more):
+ movq %rdi, %rdx
+ /* Align data to VEC_SIZE. */
+ andq $-(VEC_SIZE), %rdi
+L(cross_page_continue):
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
since data is only aligned to VEC_SIZE. */
- VPCMP $0, (%rdi), %YMMZERO, %k0
- kmovd %k0, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
-
+# ifdef USE_AS_STRNLEN
+ /* + CHAR_SIZE because it simplifies the logic in
+ last_4x_vec_or_less. */
+ leaq (VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx
+ subq %rdx, %rcx
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %ecx
+# endif
+# endif
+ /* Load first VEC regardless. */
VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0
+# ifdef USE_AS_STRNLEN
+ /* Adjust length. If near end handle specially. */
+ subq %rcx, %rsi
+ jb L(last_4x_vec_or_less)
+# endif
kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x1)
VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
kmovd %k0, %eax
- testl %eax, %eax
+ test %eax, %eax
jnz L(first_vec_x2)
VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
@@ -179,258 +214,276 @@ L(more_4x_vec):
testl %eax, %eax
jnz L(first_vec_x3)
- addq $(VEC_SIZE * 4), %rdi
-
-# ifdef USE_AS_STRNLEN
- subq $(VEC_SIZE * 4), %rsi
- jbe L(last_4x_vec_or_less)
-# endif
-
- /* Align data to 4 * VEC_SIZE. */
- movq %rdi, %rcx
- andl $(4 * VEC_SIZE - 1), %ecx
- andq $-(4 * VEC_SIZE), %rdi
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x4)
+ addq $VEC_SIZE, %rdi
# ifdef USE_AS_STRNLEN
- /* Adjust length. */
+ /* Check if at last VEC_SIZE * 4 length. */
+ cmpq $(CHAR_PER_VEC * 4 - 1), %rsi
+ jbe L(last_4x_vec_or_less_load)
+ movl %edi, %ecx
+ andl $(VEC_SIZE * 4 - 1), %ecx
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %ecx
+# endif
+ /* Readjust length. */
addq %rcx, %rsi
# endif
+ /* Align data to VEC_SIZE * 4. */
+ andq $-(VEC_SIZE * 4), %rdi
+ /* Compare 4 * VEC at a time forward. */
.p2align 4
L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- VMOVA (%rdi), %YMM1
- VMOVA VEC_SIZE(%rdi), %YMM2
- VMOVA (VEC_SIZE * 2)(%rdi), %YMM3
- VMOVA (VEC_SIZE * 3)(%rdi), %YMM4
-
- VPMINU %YMM1, %YMM2, %YMM5
- VPMINU %YMM3, %YMM4, %YMM6
+ /* Load first VEC regardless. */
+ VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
+# ifdef USE_AS_STRNLEN
+ /* Break if at end of length. */
+ subq $(CHAR_PER_VEC * 4), %rsi
+ jb L(last_4x_vec_or_less_cmpeq)
+# endif
+ /* Save some code size by microfusing VPMINU with the load. Since
+ the matches in ymm2/ymm4 can only be returned if there were no
+ matches in ymm1/ymm3 respectively there is no issue with overlap.
+ */
+ VPMINU (VEC_SIZE * 5)(%rdi), %YMM1, %YMM2
+ VMOVA (VEC_SIZE * 6)(%rdi), %YMM3
+ VPMINU (VEC_SIZE * 7)(%rdi), %YMM3, %YMM4
+
+ VPCMP $0, %YMM2, %YMMZERO, %k0
+ VPCMP $0, %YMM4, %YMMZERO, %k1
+ subq $-(VEC_SIZE * 4), %rdi
+ kortestd %k0, %k1
+ jz L(loop_4x_vec)
+
+ /* Check if end was in first half. */
+ kmovd %k0, %eax
+ subq %rdx, %rdi
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rdi
+# endif
+ testl %eax, %eax
+ jz L(second_vec_return)
- VPMINU %YMM5, %YMM6, %YMM5
- VPCMP $0, %YMM5, %YMMZERO, %k0
- ktestd %k0, %k0
- jnz L(4x_vec_end)
+ VPCMP $0, %YMM1, %YMMZERO, %k2
+ kmovd %k2, %edx
+ /* Combine VEC1 matches (edx) with VEC2 matches (eax). */
+# ifdef USE_AS_WCSLEN
+ sall $CHAR_PER_VEC, %eax
+ orl %edx, %eax
+ tzcntl %eax, %eax
+# else
+ salq $CHAR_PER_VEC, %rax
+ orq %rdx, %rax
+ tzcntq %rax, %rax
+# endif
+ addq %rdi, %rax
+ ret
- addq $(VEC_SIZE * 4), %rdi
-# ifndef USE_AS_STRNLEN
- jmp L(loop_4x_vec)
-# else
- subq $(VEC_SIZE * 4), %rsi
- ja L(loop_4x_vec)
+# ifdef USE_AS_STRNLEN
+L(last_4x_vec_or_less_load):
+ /* Depending on entry adjust rdi / prepare first VEC in YMM1. */
+ VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
+L(last_4x_vec_or_less_cmpeq):
+ VPCMP $0, %YMM1, %YMMZERO, %k0
+ addq $(VEC_SIZE * 3), %rdi
L(last_4x_vec_or_less):
- /* Less than 4 * VEC and aligned to VEC_SIZE. */
- addl $(VEC_SIZE * 2), %esi
- jle L(last_2x_vec)
-
- VPCMP $0, (%rdi), %YMMZERO, %k0
kmovd %k0, %eax
+ /* If remaining length > VEC_SIZE * 2. This works if esi is off by
+ VEC_SIZE * 4. */
+ testl $(CHAR_PER_VEC * 2), %esi
+ jnz L(last_4x_vec)
+
+	/* The length may be off by CHAR_PER_VEC * 4 in either direction,
+	   depending on where this was called from.  Masking it to
+	   CHAR_PER_VEC * 4 - 1 fixes that.  */
+ andl $(CHAR_PER_VEC * 4 - 1), %esi
testl %eax, %eax
- jnz L(first_vec_x0)
+ jnz L(last_vec_x1_check)
- VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0
- kmovd %k0, %eax
- testl %eax, %eax
- jnz L(first_vec_x1)
+ /* Check the end of data. */
+ subl $CHAR_PER_VEC, %esi
+ jb L(max)
VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
kmovd %k0, %eax
- testl %eax, %eax
- jnz L(first_vec_x2_check)
- subl $VEC_SIZE, %esi
- jle L(max)
+ tzcntl %eax, %eax
+ /* Check the end of data. */
+ cmpl %eax, %esi
+ jb L(max)
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
- kmovd %k0, %eax
- testl %eax, %eax
- jnz L(first_vec_x3_check)
+ subq %rdx, %rdi
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarq $2, %rdi
+# endif
+ leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
+ ret
+L(max):
movq %r8, %rax
+ ret
+# endif
+
+	/* Placed here in strnlen so that the jcc to
+	   L(last_4x_vec_or_less) in the 4x VEC loop can use a 2-byte
+	   encoding.  */
+ .p2align 4
+L(second_vec_return):
+ VPCMP $0, %YMM3, %YMMZERO, %k0
+ /* Combine YMM3 matches (k0) with YMM4 matches (k1). */
+# ifdef USE_AS_WCSLEN
+ kunpckbw %k0, %k1, %k0
+ kmovd %k0, %eax
+ tzcntl %eax, %eax
+# else
+ kunpckdq %k0, %k1, %k0
+ kmovq %k0, %rax
+ tzcntq %rax, %rax
+# endif
+ leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
+ ret
+
+
+# ifdef USE_AS_STRNLEN
+L(last_vec_x1_check):
+ tzcntl %eax, %eax
+ /* Check the end of data. */
+ cmpl %eax, %esi
+ jb L(max)
+ subq %rdx, %rdi
# ifdef USE_AS_WCSLEN
- shrq $2, %rax
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarq $2, %rdi
# endif
+ leaq (CHAR_PER_VEC)(%rdi, %rax), %rax
ret
.p2align 4
-L(last_2x_vec):
- addl $(VEC_SIZE * 2), %esi
+L(last_4x_vec):
+ /* Test first 2x VEC normally. */
+ testl %eax, %eax
+ jnz L(last_vec_x1)
- VPCMP $0, (%rdi), %YMMZERO, %k0
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
kmovd %k0, %eax
testl %eax, %eax
- jnz L(first_vec_x0_check)
- subl $VEC_SIZE, %esi
- jle L(max)
+ jnz L(last_vec_x2)
- VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0
+ /* Normalize length. */
+ andl $(CHAR_PER_VEC * 4 - 1), %esi
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
kmovd %k0, %eax
testl %eax, %eax
- jnz L(first_vec_x1_check)
- movq %r8, %rax
-# ifdef USE_AS_WCSLEN
- shrq $2, %rax
-# endif
- ret
+ jnz L(last_vec_x3)
- .p2align 4
-L(first_vec_x0_check):
+ /* Check the end of data. */
+ subl $(CHAR_PER_VEC * 3), %esi
+ jb L(max)
+
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
+ kmovd %k0, %eax
tzcntl %eax, %eax
-# ifdef USE_AS_WCSLEN
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
-# endif
/* Check the end of data. */
- cmpq %rax, %rsi
- jbe L(max)
- addq %rdi, %rax
- subq %rdx, %rax
+ cmpl %eax, %esi
+ jb L(max_end)
+
+ subq %rdx, %rdi
# ifdef USE_AS_WCSLEN
- shrq $2, %rax
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarq $2, %rdi
# endif
+ leaq (CHAR_PER_VEC * 4)(%rdi, %rax), %rax
ret
.p2align 4
-L(first_vec_x1_check):
+L(last_vec_x1):
tzcntl %eax, %eax
+ subq %rdx, %rdi
# ifdef USE_AS_WCSLEN
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
-# endif
- /* Check the end of data. */
- cmpq %rax, %rsi
- jbe L(max)
- addq $VEC_SIZE, %rax
- addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
- shrq $2, %rax
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarq $2, %rdi
# endif
+ leaq (CHAR_PER_VEC)(%rdi, %rax), %rax
ret
.p2align 4
-L(first_vec_x2_check):
+L(last_vec_x2):
tzcntl %eax, %eax
+ subq %rdx, %rdi
# ifdef USE_AS_WCSLEN
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
-# endif
- /* Check the end of data. */
- cmpq %rax, %rsi
- jbe L(max)
- addq $(VEC_SIZE * 2), %rax
- addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
- shrq $2, %rax
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarq $2, %rdi
# endif
+ leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
ret
.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x3):
tzcntl %eax, %eax
-# ifdef USE_AS_WCSLEN
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
-# endif
+ subl $(CHAR_PER_VEC * 2), %esi
/* Check the end of data. */
- cmpq %rax, %rsi
- jbe L(max)
- addq $(VEC_SIZE * 3), %rax
- addq %rdi, %rax
- subq %rdx, %rax
+ cmpl %eax, %esi
+ jb L(max_end)
+ subq %rdx, %rdi
# ifdef USE_AS_WCSLEN
- shrq $2, %rax
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarq $2, %rdi
# endif
+ leaq (CHAR_PER_VEC * 3)(%rdi, %rax), %rax
ret
-
- .p2align 4
-L(max):
+L(max_end):
movq %r8, %rax
-# ifdef USE_AS_WCSLEN
- shrq $2, %rax
-# endif
- ret
-
- .p2align 4
-L(zero):
- xorl %eax, %eax
ret
# endif
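A note on the subq %rdx, %rdi / sarq $2, %rdi pattern repeated in the return paths above and below: rdi is first turned into the byte offset of the current VEC group from the start of the string, divided by 4 for wcslen (wchar_t is 4 bytes on this target), and the leaq then adds the vector's character base plus the tzcnt bit index. A hedged C sketch of that computation; CHAR_PER_VEC's value and all names here are illustrative:

#include <stddef.h>

#define CHAR_PER_VEC 16	/* example; the real value depends on VEC_SIZE.  */

/* Result of the L(last_vec_xN) paths: convert the byte offset for the
   wide case ("sarq $2, %rdi"), then add the vector base and the index
   of the first null bit ("leaq (CHAR_PER_VEC * N)(%rdi, %rax)").  */
static size_t
last_vec_result (size_t byte_offset, unsigned vec_no,
		 unsigned first_null_bit, int wide)
{
  size_t base = wide ? byte_offset >> 2 : byte_offset;
  return base + (size_t) CHAR_PER_VEC * vec_no + first_null_bit;
}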
+	/* Cold case: the first load crosses a page boundary.  */
.p2align 4
-L(first_vec_x0):
- tzcntl %eax, %eax
-# ifdef USE_AS_WCSLEN
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
-# endif
- addq %rdi, %rax
- subq %rdx, %rax
+L(cross_page_boundary):
+ movq %rdi, %rdx
+ /* Align data to VEC_SIZE. */
+ andq $-VEC_SIZE, %rdi
+ VPCMP $0, (%rdi), %YMMZERO, %k0
+ kmovd %k0, %eax
+ /* Remove the leading bytes. */
# ifdef USE_AS_WCSLEN
- shrq $2, %rax
+	/* NB: Divide the shift count by 4 since each bit in K0 represents
+	   4 bytes.  */
+ movl %edx, %ecx
+ shrl $2, %ecx
+ andl $(CHAR_PER_VEC - 1), %ecx
# endif
- ret
-
- .p2align 4
-L(first_vec_x1):
+ /* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise. */
+ sarxl %SHIFT_REG, %eax, %eax
+ testl %eax, %eax
+# ifndef USE_AS_STRNLEN
+ jz L(cross_page_continue)
tzcntl %eax, %eax
-# ifdef USE_AS_WCSLEN
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
-# endif
- addq $VEC_SIZE, %rax
- addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
- shrq $2, %rax
-# endif
ret
-
- .p2align 4
-L(first_vec_x2):
- tzcntl %eax, %eax
-# ifdef USE_AS_WCSLEN
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
-# endif
- addq $(VEC_SIZE * 2), %rax
- addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
- shrq $2, %rax
-# endif
+# else
+ jnz L(cross_page_less_vec)
+# ifndef USE_AS_WCSLEN
+ movl %edx, %ecx
+ andl $(CHAR_PER_VEC - 1), %ecx
+# endif
+ movl $CHAR_PER_VEC, %eax
+ subl %ecx, %eax
+ /* Check the end of data. */
+ cmpq %rax, %rsi
+ ja L(cross_page_continue)
+ movl %esi, %eax
ret
-
- .p2align 4
-L(4x_vec_end):
- VPCMP $0, %YMM1, %YMMZERO, %k0
- kmovd %k0, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
- VPCMP $0, %YMM2, %YMMZERO, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(first_vec_x1)
- VPCMP $0, %YMM3, %YMMZERO, %k2
- kmovd %k2, %eax
- testl %eax, %eax
- jnz L(first_vec_x2)
- VPCMP $0, %YMM4, %YMMZERO, %k3
- kmovd %k3, %eax
-L(first_vec_x3):
+L(cross_page_less_vec):
tzcntl %eax, %eax
-# ifdef USE_AS_WCSLEN
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
-# endif
- addq $(VEC_SIZE * 3), %rax
- addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
- shrq $2, %rax
-# endif
+ /* Select min of length and position of first null. */
+ cmpq %rax, %rsi
+ cmovb %esi, %eax
ret
+# endif
END (STRLEN)
#endif
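For L(cross_page_boundary) above: an aligned VEC_SIZE load can never fault across a page boundary, so the code aligns the pointer down, compares the full vector against zero, and shifts the match mask right (sarxl) to drop the lanes that precede the real start. A scalar C emulation of the idea, with illustrative names; the deliberately out-of-bounds-looking read mirrors what the aligned vector load does and is safe only at page granularity, a guarantee standard C cannot express:

#include <stddef.h>
#include <stdint.h>

#define VEC_SIZE 32

/* Bit i of the result is set when byte i of the aligned block is 0.  */
static uint32_t
zero_mask (const unsigned char *aligned)
{
  uint32_t m = 0;
  for (size_t i = 0; i < VEC_SIZE; i++)
    m |= (uint32_t) (aligned[i] == 0) << i;
  return m;
}

/* Null mask for the bytes at or after S, computed without touching
   the next page: align down ("andq $-VEC_SIZE, %rdi"), scan the whole
   block, then shift out the leading lanes ("sarxl %SHIFT_REG").  */
static uint32_t
safe_head_mask (const unsigned char *s)
{
  uintptr_t misalign = (uintptr_t) s & (VEC_SIZE - 1);
  const unsigned char *aligned
    = (const unsigned char *) ((uintptr_t) s - misalign);
  return zero_mask (aligned) >> misalign;
}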
--
GitLab
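The core of the rewritten 4x VEC loop in this patch is the VPMINU fold: a zero lane survives an unsigned minimum, so min-combining a freshly loaded vector into another lets a single VPCMP/kortestd test both at once. A small C emulation on byte lanes; the lane count and names are illustrative:

#include <stdbool.h>
#include <stddef.h>

#define LANES 4	/* example; the real code uses 32/64-byte EVEX vectors.  */

/* VPCMP $0 against zero: does any lane equal 0?  */
static bool
has_zero_lane (const unsigned char *v)
{
  for (size_t i = 0; i < LANES; i++)
    if (v[i] == 0)
      return true;
  return false;
}

/* min (A, B) has a zero lane iff A or B does, so one zero test covers
   both vectors (VPMINU ...; VPCMP $0 ...; kortestd).  */
static bool
any_zero (const unsigned char *a, const unsigned char *b)
{
  unsigned char m[LANES];
  for (size_t i = 0; i < LANES; i++)
    m[i] = a[i] < b[i] ? a[i] : b[i];
  return has_zero_lane (m);
}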
Some files were not shown because too many files have changed in this diff.