RE: x86/csum: Remove unnecessary odd handling

From: David Laight
Date: Sat Jan 06 2024 - 17:09:20 EST


From: Linus Torvalds
> Sent: 05 January 2024 18:06
>
> On Fri, 5 Jan 2024 at 02:41, David Laight <David.Laight@xxxxxxxxxx> wrote:
> >
> > Interesting, I'm pretty sure trying to get two blocks of
> > 'adc' scheduled in parallel like that doesn't work.
>
> You should check out the benchmark at
>
> https://github.com/fenrus75/csum_partial
>
> and see if you can improve on it. I'm including the patch (on top of
> that code by Arjan) to implement the actual current kernel version as
> "New version".

Annoyingly (for me) you are partially right...

I found where my IP checksum perf code was hiding and revisited it.
Although I found comments elsewhere that the 'jecxz, adc, adc, lea, jmp'
loop managed an adc every clock, that isn't happening for me now.

I'm only measuring the inner loop, for multiples of 64 bytes.
The code for lengths below 8 bytes and for the partial final words is a
separate problem.
The less unrolled the main loop is, the less overhead there'll
be for 'normal' sizes.
So I've changed your '80 byte' block to 64 bytes for consistency.

I'm ignoring pre-Sandy Bridge CPUs (no split flags) and pre-Broadwell ones
(adc takes two clocks - although adc to alternate registers is one clock
on Sandy Bridge).
My test system is an i7-7700; I think anything from Broadwell onwards
will be at least as good.
I don't have a modern AMD CPU.

The best loop for 256+ bytes is an adcx/adox one.
However that requires run-time patching (the instructions aren't
available on older CPUs).
Next best is the new kernel version (two blocks of 4 adc).
The surprising one is:
	xor	sum, sum
1:	adc	(buff), sum
	adc	8(buff), sum
	lea	16(buff), buff
	dec	count
	jnz	1b
	adc	$0, sum
For 256 bytes it is only a couple of clocks slower.
Maybe 10% slower for 512+ bytes.
But it needs almost no extra code for 'normal' buffer sizes.
By comparison the adcx/adox one is 20% faster.
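
For reference, a minimal user-space check for the ADX feature bit
(CPUID leaf 7, EBX bit 19), which a test harness could use to skip the
adcx/adox case on CPUs that lack it.  This helper is only a sketch and
is not part of the attached program:

#include <cpuid.h>

/* Sketch: returns non-zero if adcx/adox (ADX) are available.
 * ADX is advertised in CPUID.(EAX=7,ECX=0):EBX bit 19. */
static int have_adx(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
		return 0;
	return (ebx >> 19) & 1;
}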

The code is doing:
	old = rdpmc
	mfence
	csum = do_csum(buf, len);
	mfence
	clocks = rdpmc - old
(That is directly reading the PMC register.)
With a 'no-op' function it takes 160 clocks (I-cache resident).
Without the mfence it is 40 - but then pretty much everything can execute
after the 2nd rdpmc.

I've attached my (horrid) test program.

David

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>

#include <linux/perf_event.h>
#include <sys/mman.h>
#include <sys/syscall.h>

#ifndef PASSES
#define PASSES 10
#endif

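/* Open a pinned hardware cycle counter and map its perf mmap page.
 * The mmap page reports the rdpmc counter number + 1, so pc->index - 1
 * is the value to pass to the rdpmc instruction below. */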
static int init_pmc(void)
{
	static struct perf_event_attr perf_attr = {
		.type = PERF_TYPE_HARDWARE,
		.config = PERF_COUNT_HW_CPU_CYCLES,
		.pinned = 1,
	};
	struct perf_event_mmap_page *pc;

	int perf_fd;
	perf_fd = syscall(__NR_perf_event_open, &perf_attr, 0, -1, -1, 0);
	if (perf_fd < 0) {
		fprintf(stderr, "perf_event_open failed: errno %d\n", errno);
		exit(1);
	}
	pc = mmap(NULL, 4096, PROT_READ, MAP_SHARED, perf_fd, 0);
	if (pc == MAP_FAILED) {
		fprintf(stderr, "perf_event mmap() failed: errno %d\n", errno);
		exit(1);
	}
	return pc->index - 1;
}

static inline unsigned int rdpmc(unsigned int counter)
{
	unsigned int low, high;

#ifndef NOFENCE
	asm volatile("mfence");
#endif
	asm volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (counter));
#ifndef NOFENCE
	asm volatile("mfence");
#endif

	// Return the low bits; the counter might be 32 or 40 bits wide.
	return low;
}

__attribute__((noinline))
unsigned long overhead(const unsigned char *buff, unsigned long len)
{
	return 0;
}

// Existing kernel loop.
// Simple 'adc' loop, abs max 8 bytes/clock.
// Will be very slow on old CPUs (pre Ivy Bridge) where 'dec' has a
// false dependency against the carry flag,
// and also pre-Broadwell where adc takes two clocks.
__attribute__((noinline))
unsigned long adc_dec_8(const unsigned char *buff, unsigned long len)
{
	unsigned long csum = 0;

	long count = len/64;
	asm("1: addq 0(%[buff]), %[csum]\n"
	    " adcq 8(%[buff]), %[csum]\n"
	    " adcq 16(%[buff]), %[csum]\n"
	    " adcq 24(%[buff]), %[csum]\n"
	    " adcq 32(%[buff]), %[csum]\n"
	    " adcq 40(%[buff]), %[csum]\n"
	    " adcq 48(%[buff]), %[csum]\n"
	    " adcq 56(%[buff]), %[csum]\n"
	    " adcq $0, %[csum]\n"
	    " lea 64(%[buff]), %[buff]\n"
	    " dec %[count]\n"
	    " jnz 1b"
	    : [csum] "+&r" (csum), [len] "+&r" (len), [count] "+&r" (count)
	    : [buff] "r" (buff)
	    : "memory");

	return csum;
}

// As above but only 4 adc
__attribute__((noinline))
unsigned long adc_dec_4(const unsigned char *buff, unsigned long len)
{
	unsigned long csum = 0;

	long count = len/32;
	asm(" clc\n"
	    "1: adcq (%[buff]), %[csum]\n"
	    " adcq 8(%[buff]), %[csum]\n"
	    " adcq 16(%[buff]), %[csum]\n"
	    " adcq 24(%[buff]), %[csum]\n"
	    " lea 32(%[buff]), %[buff]\n"
	    " dec %[count]\n"
	    " jnz 1b\n"
	    " adcq $0, %[csum]\n"
	    : [csum] "+&r" (csum), [len] "+&r" (len), [count] "+&r" (count)
	    : [buff] "r" (buff)
	    : "memory");

	return csum;
}

// As above but only 2 adc
__attribute__((noinline))
unsigned long adc_dec_2(const unsigned char *buff, unsigned long len)
{
	unsigned long csum = 0;

	long count = len/16;
	asm(" clc\n"
	    "1: adcq (%[buff]), %[csum]\n"
	    " adcq 8(%[buff]), %[csum]\n"
	    " lea 16(%[buff]), %[buff]\n"
	    " dec %[count]\n"
	    " jnz 1b\n"
	    " adcq $0, %[csum]\n"
	    : [csum] "+&r" (csum), [len] "+&r" (len), [count] "+&r" (count)
	    : [buff] "r" (buff)
	    : "memory");

	return csum;
}

// Alternate 'adc' loop that avoids using the zero flag.
__attribute__((noinline))
unsigned long adc_jcxz_4(const unsigned char *buff, unsigned long len)
{
	unsigned long csum = 0;

	buff += len;
	len = -len;
	asm(" clc\n"
	    "1: jrcxz 2f\n"
	    " adcq 0(%[buff], %[len]), %[csum]\n"
	    " adcq 8(%[buff], %[len]), %[csum]\n"
	    " adcq 16(%[buff], %[len]), %[csum]\n"
	    " adcq 24(%[buff], %[len]), %[csum]\n"
	    " lea 32(%[len]), %[len]\n"
	    " jmp 1b\n"
	    "2: adcq $0, %[csum]\n"
	    : [csum] "+&r" (csum), [len] "+&c" (len)
	    : [buff] "r" (buff)
	    : "memory");

	return csum;
}

// Shorter alternate 'adc' loop that avoids using the zero flag.
__attribute__((noinline))
unsigned long adc_jcxz_2(const unsigned char *buff, unsigned long len)
{
	unsigned long csum = 0;

	buff += len;
	len = -len;
	asm(" clc\n"
	    "1: jrcxz 2f\n"
	    " adcq 0(%[buff], %[len]), %[csum]\n"
	    " adcq 8(%[buff], %[len]), %[csum]\n"
	    " lea 16(%[len]), %[len]\n"
	    " jmp 1b\n"
	    "2: adcq $0, %[csum]\n"
	    : [csum] "+&r" (csum), [len] "+&c" (len)
	    : [buff] "r" (buff)
	    : "memory");

	return csum;
}

// Proposed kernel loop.
// This splits the adc chain in two.
__attribute__((noinline))
unsigned long adc_4_pair(const unsigned char *buff, unsigned long len)
{
	unsigned long csum = 0;

	long count = len/64;
	unsigned long csum1;
	asm("1: movq 0(%[buff]), %[csum1]\n"
	    " adcq 8(%[buff]), %[csum1]\n"
	    " adcq 16(%[buff]), %[csum1]\n"
	    " adcq 24(%[buff]), %[csum1]\n"
	    " adcq 32(%[buff]), %[csum1]\n"
	    " adcq $0, %[csum1]\n"
	    " lea 64(%[buff]), %[buff]\n"
	    " addq 40-64(%[buff]), %[csum]\n"
	    " adcq 48-64(%[buff]), %[csum]\n"
	    " adcq 56-64(%[buff]), %[csum]\n"
	    " adcq %[csum1], %[csum]\n"
	    " adcq $0, %[csum]\n"
	    " dec %[count]\n"
	    " jnz 1b"
	    : [csum] "+&r" (csum), [csum1] "=&r" (csum1), [len] "+&r" (len), [count] "+&r" (count)
	    : [buff] "r" (buff)
	    : "memory");

	return csum;
}

// Less unrolled version of previous.
__attribute__((noinline))
unsigned long adc_2_pair(const unsigned char *buff, unsigned long len)
{
	unsigned long csum = 0;

	long count = len;
	unsigned long csum1;
	asm("1: movq 0(%[buff]), %[csum1]\n"
	    " adcq 8(%[buff]), %[csum1]\n"
	    " adcq $0, %[csum1]\n"
	    " lea 32(%[buff]), %[buff]\n"
	    " addq 16-32(%[buff]), %[csum]\n"
	    " adcq 24-32(%[buff]), %[csum]\n"
	    " adcq %[csum1], %[csum]\n"
	    " adcq $0, %[csum]\n"
	    " sub $32, %[count]\n"
	    " jnz 1b"
	    : [csum] "+&r" (csum), [csum1] "=&r" (csum1), [len] "+&r" (len), [count] "+&r" (count)
	    : [buff] "r" (buff)
	    : "memory");

	return csum;
}

// adcx/adox loop.
// adcx only uses the carry flag and adox only the overflow flag,
// so the two carry chains are independent and can run in parallel.
// This is the only loop that exceeds 8 bytes/clock for 1k buffers.
__attribute__((noinline))
unsigned long adcx_adox(const unsigned char *buff, unsigned long len)
{
	unsigned long csum = 0;

	unsigned long csum_odd;
	buff += len;
	len = -len;
	asm( " xor %[csum_odd], %[csum_odd]\n" // Also clears carry and overflow
	    "10: jrcxz 20f\n"
	    " adcx (%[buff], %[len]), %[csum]\n"
	    " adox 8(%[buff], %[len]), %[csum_odd]\n"
	    " adcx 16(%[buff], %[len]), %[csum]\n"
	    " adox 24(%[buff], %[len]), %[csum_odd]\n"
	    " adcx 32(%[buff], %[len]), %[csum]\n"
	    " adox 40(%[buff], %[len]), %[csum_odd]\n"
	    " adcx 48(%[buff], %[len]), %[csum]\n"
	    " adox 56(%[buff], %[len]), %[csum_odd]\n"
	    " lea 64(%[len]), %[len]\n"
	    " jmp 10b\n"
	    "20: adox %[len], %[csum_odd]\n" // [len] is zero
	    " adcx %[csum_odd], %[csum]\n"
	    " adcx %[len], %[csum]"
	    : [csum] "+&r" (csum), [len] "+&c" (len), [csum_odd] "=&r" (csum_odd)
	    : [buff] "r" (buff)
	    : "memory");

	return csum;
}

/* Force 'x' through a register so the compiler can't merge or reorder
 * the additions on either side of it. */
#define OPTIMISER_HIDE_VAR(x) asm volatile( "": "+r" (x))

// In principle the cpu can do 2 reads every clock.
// So combining with adds to different registers it should
// be possible to get this C code to run near to 8 bytes/clock.
// But the compiler doesn't want to 'play ball'.
// The OPTIMISER_HIDE_VAR() helps a bit.
__attribute__((noinline))
unsigned long add_c_16(const unsigned char *buff, unsigned long len)
{
	unsigned long csum_odd = 0;
	unsigned long csum = 0;

	buff += len;
	len = -len;
	do {
		csum += *(unsigned int *)(buff + len + 0);
		csum_odd += *(unsigned int *)(buff + len + 4);
		OPTIMISER_HIDE_VAR(csum);
		OPTIMISER_HIDE_VAR(csum_odd);
		csum += *(unsigned int *)(buff + len + 8);
		csum_odd += *(unsigned int *)(buff + len + 12);
		OPTIMISER_HIDE_VAR(csum);
		OPTIMISER_HIDE_VAR(csum_odd);
	} while (len += 16);
	csum += csum_odd;

	return csum;
}

// As above but unrolled further
__attribute__((noinline))
unsigned long add_c_32(const unsigned char *buff, unsigned long len)
{
	unsigned long csum_odd = 0;
	unsigned long csum = 0;

	buff += len;
	len = -len;
	do {
		csum += *(unsigned int *)(buff + len + 0);
		csum_odd += *(unsigned int *)(buff + len + 4);
		OPTIMISER_HIDE_VAR(csum);
		OPTIMISER_HIDE_VAR(csum_odd);
		csum += *(unsigned int *)(buff + len + 8);
		csum_odd += *(unsigned int *)(buff + len + 12);
		OPTIMISER_HIDE_VAR(csum);
		OPTIMISER_HIDE_VAR(csum_odd);
		csum += *(unsigned int *)(buff + len + 16);
		csum_odd += *(unsigned int *)(buff + len + 20);
		OPTIMISER_HIDE_VAR(csum);
		OPTIMISER_HIDE_VAR(csum_odd);
		csum += *(unsigned int *)(buff + len + 24);
		csum_odd += *(unsigned int *)(buff + len + 28);
		OPTIMISER_HIDE_VAR(csum);
		OPTIMISER_HIDE_VAR(csum_odd);
	} while (len += 32);
	csum += csum_odd;

	return csum;
}

// Alternative C loop that sums the high halves ('val >> 32') separately
// to recover the carries.
// Might be better on arm64 than x86.
__attribute__((noinline))
unsigned long add_c_high(const unsigned char *buff, unsigned long len)
{
	unsigned long sum_hi_odd = 0, sum_lo_odd = 0;
	unsigned long sum_hi = 0, sum_lo = 0;
	unsigned long val;
	unsigned int i;

	buff += len;
	len = -len;
	do {
		val = *(unsigned long *)(buff + len + 0);
		sum_lo += val;
		sum_hi += val >> 32;
		val = *(unsigned long *)(buff + len + 8);
		sum_lo_odd += val;
		sum_hi_odd += val >> 32;
	} while (len += 16);
	OPTIMISER_HIDE_VAR(sum_lo);
	OPTIMISER_HIDE_VAR(sum_hi);
	sum_lo += sum_lo_odd;
	sum_hi += sum_hi_odd;

	sum_hi += (sum_lo >> 32) - (sum_hi & 0xffffffffu);
	return sum_hi + (sum_lo & 0xffffffff);
}

unsigned char buffer[8192] __attribute__((aligned(4096))) = {
0x46,0x56,0x20,0x04,0x10,0x02,0x50,0x07,0x72,0x4d,0xc6,0x3d,0x31,0x85,0x2d,0xbd,
0xe2,0xe0,0x9d,0x3e,0x3b,0x7a,0x70,0x3d,0xd2,0xfb,0x8c,0xbf,0x95,0x10,0xa9,0xbe,
0xeb,0xfd,0x29,0x40,0xd5,0x7a,0x61,0x40,0xde,0xcd,0x14,0xbf,0x81,0x1b,0xf6,0x3f,
0xbc,0xff,0x17,0x3f,0x67,0x1c,0x6e,0xbe,0xf4,0xc2,0x05,0x40,0x0b,0x13,0x78,0x3f,
0xfe,0x47,0xa7,0xbd,0x59,0xc2,0x15,0x3f,0x07,0xd0,0xea,0xbf,0x97,0xf1,0x3c,0x3f,
0xcc,0xfa,0x6b,0x40,0x72,0x6a,0x4f,0xbe,0x0b,0xe3,0x75,0x3e,0x3c,0x9b,0x0e,0xbf,
0xa9,0xeb,0xb7,0x3f,0xeb,0x4a,0xec,0x3e,0x33,0x8c,0x0c,0x3f,0x6a,0xf2,0xf3,0x3e,
0x2b,0x45,0x86,0x3f,0x83,0xce,0x8a,0x3f,0xf6,0x01,0x16,0x40,0x9c,0x17,0x47,0x3e,
0x44,0x83,0x61,0x40,0x74,0xc7,0x5c,0x3f,0xec,0xe7,0x95,0x3f,0xee,0x19,0xb5,0xbf,
0xb5,0xf0,0x03,0xbf,0xd1,0x02,0x1c,0x3e,0xa3,0x55,0x90,0xbe,0x1e,0x0b,0xa1,0xbf,
0xa4,0xa8,0xb4,0x3f,0xc6,0x68,0x91,0x3f,0xd1,0xc5,0xab,0x3f,0xb9,0x14,0x62,0x3f,
0x7c,0xe0,0xb9,0xbf,0xc0,0xa4,0xb5,0x3d,0x6f,0xd9,0xa7,0x3f,0x8f,0xc4,0xb0,0x3d,
0x48,0x2c,0x7a,0x3e,0x83,0xb2,0x3c,0x40,0x36,0xd3,0x18,0x40,0xb7,0xa9,0x57,0x40,
0xda,0xd3,0x95,0x3f,0x74,0x95,0xc0,0xbe,0xbb,0xce,0x71,0x3e,0x95,0xec,0x18,0xbf,
0x94,0x17,0xdd,0x3f,0x98,0xa5,0x02,0x3f,0xbb,0xfb,0xbb,0x3e,0xd0,0x5a,0x9c,0x3f,
0xd4,0x70,0x9b,0xbf,0x3b,0x9f,0x20,0xc0,0x84,0x5b,0x0f,0x40,0x5e,0x48,0x2c,0xbf,
};


static __attribute__((noinline)) void
print_csum(const char *msg, unsigned long csum, unsigned int *ticks)
{
	static unsigned int offset;
	unsigned int i;

	// Fold the 64-bit sum down to 16 bits for printing.
	csum = (csum & 0xffffffff) + (csum >> 32);
	csum = (csum & 0xffffffff) + (csum >> 32);
	csum = (csum & 0xffff) + (csum >> 16);
	csum = (csum & 0xffff) + (csum >> 16);
	printf(" %4lx", csum);

	for (i = 0; i < PASSES; i++)
		printf(" %5d", ticks[i] - offset);
	printf(" %s\n", msg);

	// The first call (the 'overhead' function) sets the offset that is
	// subtracted from all later results.
	if (!offset) {
		offset = ticks[0];
		for (i = 1; i < PASSES; i++)
			if (offset > ticks[i])
				offset = ticks[i];
	}
}

static inline __attribute__((always_inline)) void
do_csum(int pmc_id, const char *msg, unsigned long (*fn)(const unsigned char *, unsigned long),
	const unsigned char *buf, unsigned long len)
{
	unsigned int ticks[PASSES];
	unsigned long csum;
	unsigned int tick;
	unsigned int i;

	for (i = 0; i < PASSES; i++) {
		tick = rdpmc(pmc_id);
		csum = fn(buf, len);
		ticks[i] = rdpmc(pmc_id) - tick;
	}

	print_csum(msg, csum, ticks);
}

int main(int argc, char **argv)
{
	unsigned long csum;
	unsigned int len, off;
	unsigned int pmc_id = init_pmc();

	len = 128;
	off = 0;
	for (;;) {
		switch (getopt(argc, argv, "l:o:")) {
		case -1:
			break;
		default:
			exit(1);
		case 'l':
			len = atoi(optarg);
			continue;
		case 'o':
			off = atoi(optarg);
			continue;
		}
		break;
	}

	/* Solving partial blocks is a different problem... */
	len = (len + 63) & ~63;
	if (!len)
		len = 64;

	do_csum(pmc_id, "overhead", overhead, buffer + off, len);
	do_csum(pmc_id, "adc_dec_2", adc_dec_2, buffer + off, len);
	do_csum(pmc_id, "adc_dec_4", adc_dec_4, buffer + off, len);
	do_csum(pmc_id, "adc_dec_8", adc_dec_8, buffer + off, len);
	do_csum(pmc_id, "adc_jcxz_2", adc_jcxz_2, buffer + off, len);
	do_csum(pmc_id, "adc_jcxz_4", adc_jcxz_4, buffer + off, len);
	do_csum(pmc_id, "adc_2_pair", adc_2_pair, buffer + off, len);
	do_csum(pmc_id, "adc_4_pair", adc_4_pair, buffer + off, len);
	do_csum(pmc_id, "adcx_adox", adcx_adox, buffer + off, len);
	do_csum(pmc_id, "add_c_16", add_c_16, buffer + off, len);
	do_csum(pmc_id, "add_c_32", add_c_32, buffer + off, len);
	do_csum(pmc_id, "add_c_high", add_c_high, buffer + off, len);

	return 0;
}