95 lines
3.5 KiB
Diff
95 lines
3.5 KiB
Diff
From 799859f6635d68487ea2472bd79d96a7639a1ab1 Mon Sep 17 00:00:00 2001
|
|
From: "H.J. Lu" <hjl.tools@gmail.com>
|
|
Date: Sun, 6 Aug 2017 10:44:30 -0700
|
|
Subject: [PATCH 04] x86-64: Use _dl_runtime_resolve_opt only with AVX512F
|
|
[BZ #21871]
|
|
|
|
On AVX machines with XGETBV (ECX == 1) like Skylake processors,
|
|
|
|
(gdb) disass _dl_runtime_resolve_avx_opt
|
|
Dump of assembler code for function _dl_runtime_resolve_avx_opt:
|
|
0x0000000000015890 <+0>: push %rax
|
|
0x0000000000015891 <+1>: push %rcx
|
|
0x0000000000015892 <+2>: push %rdx
|
|
0x0000000000015893 <+3>: mov $0x1,%ecx
|
|
0x0000000000015898 <+8>: xgetbv
|
|
0x000000000001589b <+11>: mov %eax,%r11d
|
|
0x000000000001589e <+14>: pop %rdx
|
|
0x000000000001589f <+15>: pop %rcx
|
|
0x00000000000158a0 <+16>: pop %rax
|
|
0x00000000000158a1 <+17>: and $0x4,%r11d
|
|
0x00000000000158a5 <+21>: bnd je 0x16200 <_dl_runtime_resolve_sse_vex>
|
|
End of assembler dump.
|
|
|
|
is slower than:
|
|
|
|
(gdb) disass _dl_runtime_resolve_avx_slow
|
|
Dump of assembler code for function _dl_runtime_resolve_avx_slow:
|
|
0x0000000000015850 <+0>: vorpd %ymm0,%ymm1,%ymm8
|
|
0x0000000000015854 <+4>: vorpd %ymm2,%ymm3,%ymm9
|
|
0x0000000000015858 <+8>: vorpd %ymm4,%ymm5,%ymm10
|
|
0x000000000001585c <+12>: vorpd %ymm6,%ymm7,%ymm11
|
|
0x0000000000015860 <+16>: vorpd %ymm8,%ymm9,%ymm9
|
|
0x0000000000015865 <+21>: vorpd %ymm10,%ymm11,%ymm10
|
|
0x000000000001586a <+26>: vpcmpeqd %xmm8,%xmm8,%xmm8
|
|
0x000000000001586f <+31>: vorpd %ymm9,%ymm10,%ymm10
|
|
0x0000000000015874 <+36>: vptest %ymm10,%ymm8
|
|
0x0000000000015879 <+41>: bnd jae 0x158b0 <_dl_runtime_resolve_avx>
|
|
0x000000000001587c <+44>: vzeroupper
|
|
0x000000000001587f <+47>: bnd jmpq 0x16200 <_dl_runtime_resolve_sse_vex>
|
|
End of assembler dump.
|
|
(gdb)
|
|
|
|
since xgetbv takes many more cycles than single-cycle operations like
|
|
vorpd/vpcmpeqd/vptest. _dl_runtime_resolve_opt should be used only with
|
|
AVX512 where AVX512 instructions lead to lower CPU frequency on Skylake
|
|
server.
|
|
|
|
[BZ #21871]
|
|
* sysdeps/x86/cpu-features.c (init_cpu_features): Set
|
|
bit_arch_Use_dl_runtime_resolve_opt only with AVX512F.
|
|
|
|
(cherry picked from commit d2cf37c0a2a375cf2fde69f1afbcc49e45368fc4)
|
|
---
|
|
ChangeLog | 6 ++++++
|
|
sysdeps/x86/cpu-features.c | 7 +++++--
|
|
2 files changed, 11 insertions(+), 2 deletions(-)
|
|
|
|
diff --git a/ChangeLog b/ChangeLog
|
|
index 4357ad1..764c827 100644
|
|
--- a/ChangeLog
|
|
+++ b/ChangeLog
|
|
@@ -1,3 +1,9 @@
|
|
+2017-08-06 H.J. Lu <hongjiu.lu@intel.com>
|
|
+
|
|
+ [BZ #21871]
|
|
+ * sysdeps/x86/cpu-features.c (init_cpu_features): Set
|
|
+ bit_arch_Use_dl_runtime_resolve_opt only with AVX512F.
|
|
+
|
|
2017-08-03 Aurelien Jarno <aurelien@aurel32.net>
|
|
|
|
* stdlib/getentropy.c (getentropy): Change return type to int.
|
|
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
|
|
index 1d087ea..6f90084 100644
|
|
--- a/sysdeps/x86/cpu-features.c
|
|
+++ b/sysdeps/x86/cpu-features.c
|
|
@@ -244,10 +244,13 @@ init_cpu_features (struct cpu_features *cpu_features)
|
|
|= bit_arch_Prefer_No_AVX512;
|
|
|
|
/* To avoid SSE transition penalty, use _dl_runtime_resolve_slow.
|
|
- If XGETBV suports ECX == 1, use _dl_runtime_resolve_opt. */
|
|
+ If XGETBV supports ECX == 1, use _dl_runtime_resolve_opt.
|
|
+ Use _dl_runtime_resolve_opt only with AVX512F since it is
|
|
+ slower than _dl_runtime_resolve_slow with AVX. */
|
|
cpu_features->feature[index_arch_Use_dl_runtime_resolve_slow]
|
|
|= bit_arch_Use_dl_runtime_resolve_slow;
|
|
- if (cpu_features->max_cpuid >= 0xd)
|
|
+ if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable)
|
|
+ && cpu_features->max_cpuid >= 0xd)
|
|
{
|
|
unsigned int eax;
|
|
|
|
--
|
|
2.7.4.GIT
|
|
|
|
|