Re: [PATCH v2] radeon: Deinline indirect register accessor functions

From: Denys Vlasenko
Date: Wed May 20 2015 - 06:47:55 EST


On 05/19/2015 01:06 AM, Deucher, Alexander wrote:
>> -----Original Message-----
>> From: Denys Vlasenko [mailto:vda.linux@xxxxxxxxxxxxxx]
>> Sent: Monday, May 18, 2015 6:50 PM
>> To: Koenig, Christian
>> Cc: Denys Vlasenko; Deucher, Alexander; Linux Kernel Mailing List
>> Subject: Re: [PATCH v2] radeon: Deinline indirect register accessor functions
>>
>> On Mon, May 18, 2015 at 9:09 PM, Christian König
>> <christian.koenig@xxxxxxx> wrote:
>>>> r600_uvd_ctx_rreg: 111 bytes, 4 callsites
>>>> r600_uvd_ctx_wreg: 113 bytes, 5 callsites
>>>> eg_pif_phy0_rreg: 106 bytes, 13 callsites
>>>> eg_pif_phy0_wreg: 108 bytes, 13 callsites
>>>> eg_pif_phy1_rreg: 107 bytes, 13 callsites
>>>> eg_pif_phy1_wreg: 108 bytes, 13 callsites
>>>> rv370_pcie_rreg: 111 bytes, 21 callsites
>>>> rv370_pcie_wreg: 113 bytes, 24 callsites
>>>> r600_rcu_rreg: 111 bytes, 16 callsites
>>>> r600_rcu_wreg: 113 bytes, 25 callsites
>>>> cik_didt_rreg: 106 bytes, 10 callsites
>>>> cik_didt_wreg: 107 bytes, 10 callsites
>>>> tn_smc_rreg: 106 bytes, 126 callsites
>>>> tn_smc_wreg: 107 bytes, 116 callsites
>>>> eg_cg_rreg: 107 bytes, 20 callsites
>>>> eg_cg_wreg: 108 bytes, 52 callsites
>>
>>> Sorry haven't noticed that before:
>>>
>>> radeon_device.c is most likely not the right place for the non-inlined
>>> functions. Please move them into the appropriate files for each
>>> generation.
>>
>> Will do (probably tomorrow, not today).
>
> Is this whole exercise really worthwhile?
> This will be the 3rd or 4th time these have been inlined/uninlined.

When inlining grows the code by 65000 bytes, there ought to be a good
reason for it, and I don't see one.

Let's take a look at what these functions actually do. cik_didt_wreg() is:

spin_lock_irqsave(&rdev->didt_idx_lock, flags);
WREG32(CIK_DIDT_IND_INDEX, (reg));
WREG32(CIK_DIDT_IND_DATA, (v));
spin_unlock_irqrestore(&rdev->didt_idx_lock, flags);
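
Spelled out, the whole accessor is just that body wrapped in the usual
signature plus the flags local (a sketch; the signature follows the other
radeon indirect accessors):

void cik_didt_wreg(struct radeon_device *rdev, u32 reg, u32 v)
{
	unsigned long flags;

	spin_lock_irqsave(&rdev->didt_idx_lock, flags);
	WREG32(CIK_DIDT_IND_INDEX, (reg));
	WREG32(CIK_DIDT_IND_DATA, (v));
	spin_unlock_irqrestore(&rdev->didt_idx_lock, flags);
}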

this compiles to (on defconfig + radeon enabled):

55 push %rbp
48 89 e5 mov %rsp,%rbp
48 83 ec 20 sub $0x20,%rsp
4c 89 65 e8 mov %r12,-0x18(%rbp)
4c 8d a7 cc 01 00 00 lea 0x1cc(%rdi),%r12
48 89 5d e0 mov %rbx,-0x20(%rbp)
48 89 fb mov %rdi,%rbx
4c 89 6d f0 mov %r13,-0x10(%rbp)
4c 89 75 f8 mov %r14,-0x8(%rbp)
4c 89 e7 mov %r12,%rdi
41 89 d6 mov %edx,%r14d
41 89 f5 mov %esi,%r13d
e8 20 6b 4d 00 callq <_raw_spin_lock_irqsave> //spin_lock_irqsave
48 8b 93 d0 01 00 00 mov 0x1d0(%rbx),%rdx
44 89 aa 00 ca 00 00 mov %r13d,0xca00(%rdx) //WREG32
48 8b 93 d0 01 00 00 mov 0x1d0(%rbx),%rdx
44 89 b2 04 ca 00 00 mov %r14d,0xca04(%rdx) //WREG32
4c 89 e7 mov %r12,%rdi
48 89 c6 mov %rax,%rsi
e8 b9 69 4d 00 callq <_raw_spin_unlock_irqrestore> //spin_unlock_irqrestore
48 8b 5d e0 mov -0x20(%rbp),%rbx
4c 8b 65 e8 mov -0x18(%rbp),%r12
4c 8b 6d f0 mov -0x10(%rbp),%r13
4c 8b 75 f8 mov -0x8(%rbp),%r14
c9 leaveq
c3 retq

<_raw_spin_lock_irqsave>:
55 push %rbp
48 89 e5 mov %rsp,%rbp
9c pushfq
58 pop %rax
fa cli
ba 00 01 00 00 mov $0x100,%edx
f0 66 0f c1 17 lock xadd %dx,(%rdi) // expensive
0f b6 ce movzbl %dh,%ecx
38 d1 cmp %dl,%cl
75 04 jne <_raw_spin_lock_irqsave+0x1c>
5d pop %rbp
c3 retq
f3 90 pause
0f b6 17 movzbl (%rdi),%edx
38 ca cmp %cl,%dl
75 f7 jne <_raw_spin_lock_irqsave+0x1a>
5d pop %rbp
c3 retq

<_raw_spin_unlock_irqrestore>:
55 push %rbp
48 89 e5 mov %rsp,%rbp
80 07 01 addb $0x1,(%rdi)
56 push %rsi
9d popfq //expensive
5d pop %rbp
c3 retq

Now, using the attached test program, I measured how long
a call+ret pair takes:

# ./timing_test64 callret
400000000 loops in 0.71467s = 1.79 nsec/loop for callret

Unlocked read-modify-write memory operation:

# ./timing_test64 or
400000000 loops in 0.86119s = 2.15 nsec/loop for or

Locked read-modify-write memory operations:

# ./timing_test64 lock_or
100000000 loops in 0.68902s = 6.89 nsec/loop for lock_or
# ./timing_test64 lock_xadd
100000000 loops in 0.68582s = 6.86 nsec/loop for lock_xadd

And POPF:

# ./timing_test64 popf
100000000 loops in 0.68861s = 6.89 nsec/loop for popf

This is on a Sandy Bridge CPU with a cycle time of about 0.30 ns:

# ./timing_test64 nothing
2000000000 loops in 0.59716s = 0.30 nsec/loop for nothing
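
For reference, in the attached program each of these modes is just a tight
loop around one operation:

  nothing:    empty loop body (just the loop overhead)
  callret:    call to an empty function ("call ret__", where ret__ is a bare ret)
  or:         orl $1,memvar
  lock_or:    lock orl $1,memvar
  lock_xadd:  lock xaddl of a zero register into memvar
  popf:       push of the saved flags followed by popf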


So, what do we see?

call+ret takes 5 cycles. This is cheaper than one unlocked
RMW memory operation, which is 7 cycles.

A locked RMW is 21 cycles in the ideal (uncontended) case; this is what
spin_lock_irqsave does. POPF is also 21 cycles; spin_unlock_irqrestore
does this. Add to that two MMIO accesses (easily 50+ cycles) and all
the other operations visible in the assembly above: 5 memory stores,
7 memory loads, and two call+ret pairs.
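
(The cycle figures above are simply the measured loop times divided by the
~0.30 ns cycle time, give or take the cycle the empty loop itself costs:
1.79/0.30 is about 6 per iteration, i.e. ~5 cycles for the call+ret;
6.89/0.30 is about 23 per iteration, i.e. low twenties for the locked op
or POPF.)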

I expect the overhead of the call+ret added by deinlining to be in the
1-4% range, and that is in a microbenchmark which does nothing but one
of these operations.
--
vda
// To be unaffected by random cacheline placement, use generous "align":
//
// i686-gcc -O2 -Wall -falign-loops=32 -falign-jumps=32 -falign-labels=32 -static
// x86_64-gcc -O2 -Wall -falign-loops=32 -falign-jumps=32 -falign-labels=32 -static

#include <inttypes.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <time.h>
#include <sys/time.h>
#include <sys/syscall.h>
#include <stdio.h>

#if !defined(__i386__)
#define get_sysenter_addr() 0
#else
#include <elf.h>
long sysenter_addr;
long get_sysenter_addr(char **envp)
{
	Elf32_auxv_t *auxv;
	while (*envp++ != NULL)
		continue;
	for (auxv = (void *)envp; auxv->a_type != AT_NULL; auxv++)
		if (auxv->a_type == AT_SYSINFO)
			return (sysenter_addr = auxv->a_un.a_val);
	fprintf(stderr, "AT_SYSINFO not supplied, can't test\n");
	exit(0); /* this is not a failure */
}

void sysenter_getpid(void)
{
	asm volatile(
		"\n" " mov $20,%eax" // GETPID
		"\n" " call *sysenter_addr"
	);
}
#endif

#if defined(__i386__)
#define L_or_Q "l"
#define E_or_R "e"
#else
#define L_or_Q "q"
#define E_or_R "r"
#endif

static int memvar;

asm (
"\n" " .text"
"\n" "ret__: ret"
);

int main(int argc, char **argv, char **envp)
{
	struct timespec start, end;
	unsigned long long duration;
	size_t loops, i;
	const char *mode;

	if (argc < 2) {
		printf("Usage: timing_test [MILLIONS_OF_ITERATIONS] MODE\n");
		return 1;
	}
	mode = argv[2];
	if (!mode) {
		mode = argv[1];
		loops = 10*1000;
	} else {
		loops = (size_t)atol(argv[1]) * 1000000;
	}

again:
	if (!strcmp(mode, "nothing")) {
		clock_gettime(CLOCK_MONOTONIC, &start);
		for (i = loops; i != 0; i--) {
			asm volatile ("# nothing");
		}
	} else if (!strcmp(mode, "nop")) {
		clock_gettime(CLOCK_MONOTONIC, &start);
		for (i = loops; i != 0; i--) {
			asm volatile ("nop");
		}
	} else if (!strcmp(mode, "rdtsc")) {
		clock_gettime(CLOCK_MONOTONIC, &start);
		for (i = loops; i != 0; i--) {
			unsigned int a, d;
			asm volatile ("rdtsc" : "=a" (a), "=d" (d));
		}
	} else if (!strcmp(mode, "lfence_rdtsc")) {
		clock_gettime(CLOCK_MONOTONIC, &start);
		for (i = loops; i != 0; i--) {
			unsigned int a, d;
			asm volatile ("lfence;rdtsc" : "=a" (a), "=d" (d));
		}
	} else if (!strcmp(mode, "lfence_rdtsc_lfence")) {
		clock_gettime(CLOCK_MONOTONIC, &start);
		for (i = loops; i != 0; i--) {
			unsigned int a, d;
			asm volatile ("");
			asm volatile ("lfence;rdtsc;lfence" : "=a" (a), "=d" (d));
		}
	} else if (!strcmp(mode, "mfence_rdtsc_mfence")) {
		clock_gettime(CLOCK_MONOTONIC, &start);
		for (i = loops; i != 0; i--) {
			unsigned int a, d;
			asm volatile ("mfence;rdtsc;mfence" : "=a" (a), "=d" (d));
		}
	} else if (!strcmp(mode, "rdtscp")) {
		clock_gettime(CLOCK_MONOTONIC, &start);
		for (i = loops; i != 0; i--) {
			unsigned int a, c, d;
			asm volatile ("rdtscp" : "=a" (a), "=c" (c), "=d" (d));
		}
	} else if (!strcmp(mode, "gettimeofday")) {
		struct timeval tv;
		clock_gettime(CLOCK_MONOTONIC, &start);
		for (i = loops; i != 0; i--)
			gettimeofday(&tv, 0);
	} else if (!strcmp(mode, "getpid")) {
		clock_gettime(CLOCK_MONOTONIC, &start);
		for (i = loops; i != 0; i--)
			syscall(SYS_getpid);
#if defined(__i386__)
	} else if (!strcmp(mode, "sysenter_getpid")) {
		get_sysenter_addr(envp);
		clock_gettime(CLOCK_MONOTONIC, &start);
		for (i = loops; i != 0; i--)
			sysenter_getpid();
	} else if (!strcmp(mode, "iret")) {
		/* "push cs" is itself a bit expensive, moving it out of loop */
		long saved_cs;
		asm volatile ("mov %%cs,%0" : "=r" (saved_cs));
		clock_gettime(CLOCK_MONOTONIC, &start);
		for (i = loops; i != 0; i--) {
			asm volatile (
				"\n" " push $0" // flags
				"\n" " push %0" // cs
				"\n" " push $1f" // ip
				"\n" " iret"
				"\n" "1:"
				:
				: "r" (saved_cs)
			);
		}
#endif
#if defined(__x86_64__)
	} else if (!strcmp(mode, "iret")) {
		long saved_cs;
		long saved_ss;
		asm volatile ("mov %%cs,%0" : "=r" (saved_cs));
		asm volatile ("mov %%ss,%0" : "=r" (saved_ss));
		clock_gettime(CLOCK_MONOTONIC, &start);
		for (i = loops; i != 0; i--) {
			asm volatile (
				"\n" " mov %%rsp,%%rax"
				"\n" " push %0" // ss
				"\n" " push %%rax" // sp
				"\n" " push $0" // flags
				"\n" " push %1" // cs
				"\n" " push $1f" // ip
				"\n" " iretq"
				"\n" "1:"
				:
				: "r" (saved_ss), "r" (saved_cs)
				: "ax"
			);
		}
#endif
} else if (!strcmp(mode, "lret")) {
/* "push cs" is itself a bit expensive, moving it out of loop */
long saved_cs;
asm volatile ("mov %%cs,%0" : "=r" (saved_cs));
clock_gettime(CLOCK_MONOTONIC, &start);
for (i = loops; i != 0; i--) {
asm volatile (
"\n" " push %0"
"\n" " push $1f"
"\n" " lret"L_or_Q
"\n" "1:"
:
: "r" (saved_cs)
);
}
} else if (!strcmp(mode, "callret")) {
clock_gettime(CLOCK_MONOTONIC, &start);
for (i = loops; i != 0; i--) {
asm volatile ("call ret__");
}
} else if (!strcmp(mode, "ret")) {
/* This is useful to measure delays due to
* return stack branch prediction not working
* (we aren't using paired call/rets here, as CPU expects).
* I observed "callret" test above being 4 times faster than this:
*/
clock_gettime(CLOCK_MONOTONIC, &start);
for (i = loops; i != 0; i--) {
asm volatile (
"\n" " push $1f"
"\n" " ret"
"\n" "1:"
);
}
} else if (!strcmp(mode, "loadss")) {
long saved_ss;
asm volatile ("mov %%ss,%0" : "=r" (saved_ss));
clock_gettime(CLOCK_MONOTONIC, &start);
for (i = loops; i != 0; i--) {
asm volatile ("mov %0,%%ss" : : "r" (saved_ss));
}
} else if (!strcmp(mode, "readss")) {
long saved_ss;
clock_gettime(CLOCK_MONOTONIC, &start);
for (i = loops; i != 0; i--) {
asm volatile ("mov %%ss,%0" : "=r" (saved_ss));
}
} else if (!strcmp(mode, "leave")) {
clock_gettime(CLOCK_MONOTONIC, &start);
for (i = loops; i != 0; i--) {
asm volatile (
"\n" " push %"E_or_R"bp"
"\n" " mov %"E_or_R"sp,%"E_or_R"bp"
"\n" " leave"
);
}
} else if (!strcmp(mode, "noleave")) {
clock_gettime(CLOCK_MONOTONIC, &start);
for (i = loops; i != 0; i--) {
asm volatile (
"\n" " push %"E_or_R"bp"
"\n" " mov %"E_or_R"sp,%"E_or_R"bp"
"\n" " mov %"E_or_R"bp,%"E_or_R"sp"
"\n" " pop %"E_or_R"bp"
);
}
} else if (!strcmp(mode, "pushf")) {
clock_gettime(CLOCK_MONOTONIC, &start);
for (i = loops; i != 0; i--) {
asm volatile (
"\n" " pushf"
"\n" " pop %%"E_or_R"ax"
:
:
: "ax"
);
}
} else if (!strcmp(mode, "popf")) {
long flags;
asm volatile (
"\n" " pushf"
"\n" " pop %0"
: "=r" (flags)
);
clock_gettime(CLOCK_MONOTONIC, &start);
for (i = loops; i != 0; i--) {
asm volatile (
"\n" " push %0"
"\n" " popf"
:
: "r" (flags)
: "ax"
);
}
} else if (!strcmp(mode, "or")) {
clock_gettime(CLOCK_MONOTONIC, &start);
for (i = loops; i != 0; i--) {
asm volatile (
"\n" " orl $1,%0"
:
: "m" (memvar)
);
}
} else if (!strcmp(mode, "lock_or")) {
clock_gettime(CLOCK_MONOTONIC, &start);
for (i = loops; i != 0; i--) {
asm volatile (
"\n" " lock orl $1,%0"
:
: "m" (memvar)
);
}
} else if (!strcmp(mode, "lock_xadd")) {
clock_gettime(CLOCK_MONOTONIC, &start);
for (i = loops; i != 0; i--) {
asm volatile (
"\n" " lock xaddl %0,%1"
:
: "r" (0), "m" (memvar)
);
}
} else if (!strcmp(mode, "rdpmc")) {
// Unlikely to work.
unsigned int eax, edx;
unsigned int ecx = 0;
clock_gettime(CLOCK_MONOTONIC, &start);
for (i = loops; i != 0; i--)
asm volatile ("rdpmc" : "=a" (eax), "=d" (edx) : "c" (ecx));
} else {
printf("Unknown mode %s\n", mode);
return 1;
}

	clock_gettime(CLOCK_MONOTONIC, &end);
	duration = (1000*1000*1000ULL * end.tv_sec + end.tv_nsec)
		 - (1000*1000*1000ULL * start.tv_sec + start.tv_nsec);
	printf("%lu loops in %.5fs = %.2f nsec/loop for %s\n",
		(unsigned long)loops, (double)duration * 1e-9,
		(double)duration / loops,
		mode
	);
	if (!argv[2]) {
		if (duration < 90*1000*1000) {
			loops *= 10;
			goto again;
		}
		if (duration < 490*1000*1000) {
			loops *= 2;
			goto again;
		}
	}
	return 0;
}