# define EFLAGS_ID (1u << 21)
# define CPUID1D_SSE2 (1u << 26)
# define CPUID1D_FXSR (1u << 24)
+# define CPUID1C_PCLMUL (1u << 1)
+# define CPUID1C_SSSE3 (1u << 9)
# define CPUID1C_AESNI (1u << 25)
+# define CPUID1C_AVX (1u << 28)
+# define CPUID1C_RDRAND (1u << 30)
struct cpuid { unsigned a, b, c, d; };
{
unsigned ff;
__asm__ ("pushf; pushl %1; popf; pushf; popl %0; popf"
- : "=g" (ff)
- : "g" (f));
+ : "=r" (ff)
+ : "r" (f));
return (ff);
}
# else
{
unsigned long ff;
__asm__ ("pushf; pushq %1; popf; pushf; popq %0; popf"
- : "=g" (ff)
- : "g" (f));
+ : "=r" (ff)
+ : "r" (f));
return (ff);
}
# endif
#endif
}
+/* --- @rdrand_works_p@ --- *
+ *
+ * Arguments: ---
+ *
+ * Returns: Nonzero if the `rdrand' instruction actually works. Assumes
+ * that it's already been verified to be safe to issue.
+ */
+
+#ifdef __GNUC__
+/* Fetch one 32-bit random word with `rdrand'.
+ *
+ * Arguments: @unsigned *x@ = where to store the word
+ *
+ * Returns: Zero on success; -1 if `rdrand' failed on all 16 attempts.
+ */
+static int rdrand(unsigned *x)
+{
+  int i, rc;
+  unsigned _t;
+
+  i = 16;
+  /* Dummy asm marks `_t' as initialized, silencing the warning the `+r'
+   * operand below would otherwise provoke.
+   */
+  __asm__ ("" : "=g" (_t));
+  /* Retry `rdrand' up to 16 times; the instruction sets the carry flag
+   * on success.  The `memory' clobber is required because the asm stores
+   * through (%3) -- i.e., it writes *x behind the compiler's back.
+   */
+  __asm__ ("0: rdrand %2; jc 1f; decl %1; jnz 0b\n"
+       "mov $-1, %0; jmp 9f\n"
+       "1: movl %2, (%3); xorl %0, %0\n"
+       "9:"
+       : "=r" (rc), "+r" (i), "+r" (_t)
+       : "r" (x)
+       : "memory", "cc");
+  return (rc);
+}
+#endif
+
+static int rdrand_works_p(void)
+{
+  unsigned ref, x, i;
+
+  /* Check that it doesn't always give the same answer. Try four times: this
+   * will fail with probability %$2^{-128}$% with a truly random generator,
+   * which seems fair enough.
+   */
+  if (rdrand(&ref)) goto fail;
+  for (i = 0; i < 4; i++) {
+    if (rdrand(&x)) goto fail;
+    if (x != ref) goto not_stuck;
+  }
+
+  /* All four samples matched `ref': report the generator as stuck. */
+  dispatch_debug("RDRAND always returns 0x%08x!", ref);
+  return (0);
+
+not_stuck:
+  /* At least one sample differed from `ref': looks like a live generator. */
+  dispatch_debug("RDRAND instruction looks plausible");
+  return (1);
+
+fail:
+  /* `rdrand' itself reported failure (nonzero return) while sampling. */
+  dispatch_debug("RDRAND instruction fails too often");
+  return (0);
+}
+
#endif
/*----- General feature probing using auxiliary vectors -------------------*/
/* Try to find the system's definitions for auxiliary vector entries. */
#ifdef HAVE_SYS_AUXV_H
# include <sys/auxv.h>
-#else
-# ifdef HAVE_LINUX_AUXVEC_H
-# include <linux/auxvec.h>
-# endif
-# ifdef HAVE_ASM_HWCAP_H
-# include <asm/hwcap.h>
-# endif
+#endif
+#ifdef HAVE_LINUX_AUXVEC_H
+# include <linux/auxvec.h>
+#endif
+#ifdef HAVE_ASM_HWCAP_H
+# include <asm/hwcap.h>
#endif
/* The type of entries in the auxiliary vector. I'm assuming that `unsigned
# define WANT_AT_HWCAP(_) _(AT_HWCAP, u, hwcap)
#endif
+#if defined(AT_HWCAP) && CPUFAM_ARM64
+# define WANT_ANY 1
+# define WANT_AT_HWCAP(_) _(AT_HWCAP, u, hwcap)
+#endif
+
+#if defined(AT_HWCAP2) && CPUFAM_ARMEL
+# define WANT_ANY 1
+# define WANT_AT_HWCAP2(_) _(AT_HWCAP2, u, hwcap2)
+#endif
+
/* If we couldn't find any interesting entries then we can switch all of this
* machinery off. Also do that if we have no means for atomic updates.
*/
#ifndef WANT_AT_HWCAP
# define WANT_AT_HWCAP(_)
#endif
+#ifndef WANT_AT_HWCAP2
+# define WANT_AT_HWCAP2(_)
+#endif
/* For each CPU family, define two lists.
*
*/
#if CPUFAM_ARMEL
# define WANTAUX(_) \
- WANT_AT_HWCAP(_)
+ WANT_AT_HWCAP(_) \
+ WANT_AT_HWCAP2(_)
# define CAPMAP(_) \
_(ARM_VFP, "arm:vfp") \
_(ARM_NEON, "arm:neon") \
_(ARM_V4, "arm:v4") \
- _(ARM_D32, "arm:d32")
+ _(ARM_D32, "arm:d32") \
+ _(ARM_AES, "arm:aes") \
+ _(ARM_PMULL, "arm:pmull")
+#endif
+#if CPUFAM_ARM64
+# define WANTAUX(_) \
+ WANT_AT_HWCAP(_)
+# define CAPMAP(_) \
+ _(ARM_NEON, "arm:neon") \
+ _(ARM_AES, "arm:aes") \
+ _(ARM_PMULL, "arm:pmull")
#endif
/* Build the bitmask for `hwcaps' from the `CAPMAP' list. */
/* Shiny new libc lets us request individual entry types. This is almost
* too easy.
*/
-# define CAP__GET(type, slot, ubranch) \
- probed.slot.ubranch = (AUXUTYPE_##ubranch)getauxval(type);
+# define CAP__GET(type, ubranch, slot) \
+ probed.slot = (AUXUTYPE_##ubranch)getauxval(type);
WANTAUX(CAP__GET)
#else
/* Otherwise we're a bit stuck, really. Modern Linux kernels make a copy
if (probed.hwcap & HWCAP_NEON) hw |= HF_ARM_NEON;
if (probed.hwcap & HWCAP_VFPD32) hw |= HF_ARM_D32;
if (probed.hwcap & HWCAP_VFPv4) hw |= HF_ARM_V4;
+# ifdef HWCAP2_AES
+ if (probed.hwcap2 & HWCAP2_AES) hw |= HF_ARM_AES;
+# endif
+# ifdef HWCAP2_PMULL
+ if (probed.hwcap2 & HWCAP2_PMULL) hw |= HF_ARM_PMULL;
+# endif
+#endif
+#if CPUFAM_ARM64
+ if (probed.hwcap & HWCAP_ASIMD) hw |= HF_ARM_NEON;
+ if (probed.hwcap & HWCAP_AES) hw |= HF_ARM_AES;
+ if (probed.hwcap & HWCAP_PMULL) hw |= HF_ARM_PMULL;
#endif
/* Store the bitmask of features we probed for everyone to see. */
switch (feat) {
#if CPUFAM_X86 || CPUFAM_AMD64
CASE_CPUFEAT(X86_SSE2, "x86:sse2",
- xmm_registers_available_p() &&
- cpuid_features_p(CPUID1D_SSE2, 0));
+ cpuid_features_p(CPUID1D_SSE2, 0) &&
+ xmm_registers_available_p());
CASE_CPUFEAT(X86_AESNI, "x86:aesni",
- xmm_registers_available_p() &&
- cpuid_features_p(CPUID1D_SSE2, CPUID1C_AESNI));
+ cpuid_features_p(CPUID1D_SSE2, CPUID1C_AESNI) &&
+ xmm_registers_available_p());
+ CASE_CPUFEAT(X86_RDRAND, "x86:rdrand",
+ cpuid_features_p(0, CPUID1C_RDRAND) && rdrand_works_p());
+ CASE_CPUFEAT(X86_AVX, "x86:avx",
+ cpuid_features_p(0, CPUID1C_AVX) &&
+ xmm_registers_available_p());
+ CASE_CPUFEAT(X86_SSSE3, "x86:ssse3",
+ cpuid_features_p(0, CPUID1C_SSSE3) &&
+ xmm_registers_available_p());
+ CASE_CPUFEAT(X86_PCLMUL, "x86:pclmul",
+ cpuid_features_p(0, CPUID1C_PCLMUL) &&
+ xmm_registers_available_p());
#endif
#ifdef CAPMAP
# define FEATP__CASE(feat, tok) \