Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

From: H. Peter Anvin
Date: Thu Apr 30 2015 - 17:40:01 EST


This is the microbenchmark I used.

For the record, Intel's intention going forward is that 0F 1F (NOPL) will
always be as fast as or faster than any other alternative.

-hpa

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <stdbool.h>
#include <sys/time.h>

/*
 * Execute 1000 back-to-back copies of the 5-byte "P6" NOP:
 * 0F 1F 44 00 00, i.e. NOPL with a ModRM + SIB + disp8 operand.
 *
 * The instruction bytes are emitted with .byte so the exact encoding is
 * fixed regardless of what the assembler would pick for a mnemonic.
 *
 * __asm__/__volatile__ are used rather than the plain "asm" keyword so
 * that the file also builds in strict ISO modes (-std=c99, -std=c11),
 * where GCC does not recognise "asm".
 */
static void nop_p6(void)
{
	__asm__ __volatile__(
		".rept 1000\n"
		".byte 0x0f,0x1f,0x44,0x00,0x00\n"
		".endr");
}

/*
 * Execute 1000 back-to-back copies of the 5-byte "K8" NOP:
 * the one-byte NOP (90) preceded by four 66 operand-size prefixes.
 *
 * __asm__/__volatile__ are used rather than the plain "asm" keyword so
 * that the file also builds in strict ISO modes (-std=c99, -std=c11),
 * where GCC does not recognise "asm".
 */
static void nop_k8(void)
{
	__asm__ __volatile__(
		".rept 1000\n"
		".byte 0x66,0x66,0x66,0x66,0x90\n"
		".endr");
}

/*
 * Execute 1000 back-to-back copies of a 5-byte LEA that loads the
 * source-index register with its own address expression plus a zero
 * displacement (8D 74 26 00).
 *
 * The first byte differs by target so the instruction stays 5 bytes:
 *   - x86-64: 48 (REX.W), operating on the full 64-bit register;
 *   - 32-bit: 3E (DS segment override) used as padding.
 * The register ends up with the value it already had and LEA does not
 * touch the flags, so the absence of a clobber list is harmless.
 *
 * __asm__/__volatile__ are used rather than the plain "asm" keyword so
 * that the file also builds in strict ISO modes (-std=c99, -std=c11),
 * where GCC does not recognise "asm".
 */
static void nop_lea(void)
{
#ifdef __x86_64__
	__asm__ __volatile__(
		".rept 1000\n"
		".byte 0x48,0x8d,0x74,0x26,0x00\n"
		".endr");
#else
	__asm__ __volatile__(
		".rept 1000\n"
		".byte 0x3e,0x8d,0x74,0x26,0x00\n"
		".endr");
#endif
}

/*
 * Execute 1000 back-to-back copies of a 5-byte near jump with a zero
 * 32-bit displacement (E9 00 00 00 00), i.e. a jump to the instruction
 * that immediately follows it.
 *
 * __asm__/__volatile__ are used rather than the plain "asm" keyword so
 * that the file also builds in strict ISO modes (-std=c99, -std=c11),
 * where GCC does not recognise "asm".
 */
static void nop_jmp5(void)
{
	__asm__ __volatile__(
		".rept 1000\n"
		".byte 0xe9,0,0,0,0\n"
		".endr");
}

/*
 * Execute 1000 back-to-back copies of a 2-byte short jump (EB 03)
 * followed by three one-byte NOPs (90) that the jump skips over, for a
 * total of 5 bytes per repetition.
 *
 * __asm__/__volatile__ are used rather than the plain "asm" keyword so
 * that the file also builds in strict ISO modes (-std=c99, -std=c11),
 * where GCC does not recognise "asm".
 */
static void nop_jmp2(void)
{
	__asm__ __volatile__(
		".rept 1000\n"
		".byte 0xeb,3,0x90,0x90,0x90\n"
		".endr");
}

/*
 * Execute 1000 back-to-back copies of a 5-byte register self-exchange:
 * XCHG (87 C0) on the 16-bit accumulator with itself, preceded by three
 * 66 operand-size prefixes.  The register value is unchanged.
 *
 * __asm__/__volatile__ are used rather than the plain "asm" keyword so
 * that the file also builds in strict ISO modes (-std=c99, -std=c11),
 * where GCC does not recognise "asm".
 */
static void nop_xchg(void)
{
	__asm__ __volatile__(
		".rept 1000\n"
		".byte 0x66,0x66,0x66,0x87,0xc0\n"
		".endr");
}

/*
 * Execute 1000 back-to-back copies of a 5-byte register self-move:
 * MOV (89 C0) of the 16-bit accumulator onto itself, preceded by three
 * 66 operand-size prefixes.  The register value is unchanged.
 *
 * __asm__/__volatile__ are used rather than the plain "asm" keyword so
 * that the file also builds in strict ISO modes (-std=c99, -std=c11),
 * where GCC does not recognise "asm".
 */
static void nop_mov(void)
{
	__asm__ __volatile__(
		".rept 1000\n"
		".byte 0x66,0x66,0x66,0x89,0xc0\n"
		".endr");
}

/*
 * Execute 1000 back-to-back copies of the x87 opcode DB E1 (FDISI, per
 * this function's name) preceded by three 66 prefixes, 5 bytes in all.
 *
 * NOTE(review): the benchmark relies on this opcode having no effect on
 * the processors it is run on -- confirm against the CPU documentation
 * before using it on an unfamiliar part.
 *
 * __asm__/__volatile__ are used rather than the plain "asm" keyword so
 * that the file also builds in strict ISO modes (-std=c99, -std=c11),
 * where GCC does not recognise "asm".
 */
static void nop_fdisi(void)
{
	__asm__ __volatile__(
		".rept 1000\n"
		".byte 0x66,0x66,0x66,0xdb,0xe1\n"
		".endr");
}

/*
 * Execute 1000 back-to-back copies of the x87 opcode DB E0 (FENI, per
 * this function's name) preceded by three 66 prefixes, 5 bytes in all.
 *
 * NOTE(review): the benchmark relies on this opcode having no effect on
 * the processors it is run on -- confirm against the CPU documentation
 * before using it on an unfamiliar part.
 *
 * __asm__/__volatile__ are used rather than the plain "asm" keyword so
 * that the file also builds in strict ISO modes (-std=c99, -std=c11),
 * where GCC does not recognise "asm".
 */
static void nop_feni(void)
{
	__asm__ __volatile__(
		".rept 1000\n"
		".byte 0x66,0x66,0x66,0xdb,0xe0\n"
		".endr");
}

/* One entry of the benchmark table: a display name plus the routine to time. */
struct test_list {
/* Label printed at the start of the result line. */
const char *name;
/* Routine under test; a NULL value marks the end of the tests[] table. */
void (*func)(void);
};

/*
 * Table of NOP candidates, visited by main() in the order listed.
 * The final entry, whose func pointer is NULL, terminates the table.
 */
static const struct test_list tests[] = {
	{ .name = "P6 NOPs (NOPL)",  .func = nop_p6    },
	{ .name = "K8 NOPs (66 90)", .func = nop_k8    },
	{ .name = "LEA",             .func = nop_lea   },
	{ .name = "XCHG",            .func = nop_xchg  },
	{ .name = "MOV",             .func = nop_mov   },
	{ .name = "FDISI",           .func = nop_fdisi },
	{ .name = "FENI",            .func = nop_feni  },
	{ .name = "E9 JMP",          .func = nop_jmp5  },
	{ .name = "EB JMP",          .func = nop_jmp2  },
	{ .name = NULL,              .func = NULL      },
};

/*
 * Time one entry of the test table.
 *
 * Calls test->func() a fixed 100000 times between two gettimeofday()
 * samples and converts the difference to microseconds.
 *
 * @test:   table entry supplying the routine to call and its label.
 * @warmup: when true the loop is still run and timed, but nothing is
 *          printed; when false one result line is written to stdout.
 *
 * Note: gettimeofday() reports wall-clock time, so a clock adjustment
 * during the run would distort the reported figure.
 */
static void benchmark(const struct test_list *test, bool warmup)
{
	const int reps = 100000;
	struct timeval start, stop;
	unsigned long long us;
	int left;

	gettimeofday(&start, NULL);
	for (left = reps; left > 0; left--) {
		test->func();
	}
	gettimeofday(&stop, NULL);

	/* Whole seconds, scaled to microseconds ... */
	us = (stop.tv_sec - start.tv_sec) * 1000000ULL;
	/*
	 * ... plus the sub-second part.  That difference can be negative on
	 * its own; the unsigned wrap-around cancels against the seconds term.
	 */
	us += (int)stop.tv_usec - (int)start.tv_usec;

	if (warmup) {
		return;
	}

	printf("%s: %d repetitions at %llu us\n", test->name, reps, us);
}

int main(void)
{
const struct test_list *test;

for (test = tests; test->func; test++) {
benchmark(test, true);
benchmark(test, false);
}

return 0;
}