Re: [PATCH] lib/strscpy: remove word-at-a-time optimization.

From: Andrey Ryabinin
Date: Tue Jan 09 2018 - 11:47:05 EST


Attached user space program I used to see the difference.
Usage:
gcc -02 -o strscpy strscpy_test.c
./strscpy {b|w} src_str_len count

src_str_len - length of source string in between 1-4096
count - how many strscpy() to execute.



Also I've noticed something strange. I'm not sure why, but certain
src_len values (e.g. 30) drives branch predictor crazy causing worse than usual results
for byte-at-a-time copy:

$ perf stat ./strscpy b 29 10000000

Performance counter stats for './strscpy b 29 10000000':

165.354974 task-clock:u (msec) # 0.999 CPUs utilized
0 context-switches:u # 0.000 K/sec
0 cpu-migrations:u # 0.000 K/sec
48 page-faults:u # 0.290 K/sec
640,475,981 cycles:u # 3.873 GHz
2,500,090,080 instructions:u # 3.90 insn per cycle
640,017,126 branches:u # 3870.565 M/sec
1,589 branch-misses:u # 0.00% of all branches

0.165568346 seconds time elapsed



Performance counter stats for './strscpy b 30 10000000':

250.835659 task-clock:u (msec) # 0.999 CPUs utilized
0 context-switches:u # 0.000 K/sec
0 cpu-migrations:u # 0.000 K/sec
46 page-faults:u # 0.183 K/sec
974,528,780 cycles:u # 3.885 GHz
2,580,090,165 instructions:u # 2.65 insn per cycle
660,017,211 branches:u # 2631.273 M/sec
14,488,234 branch-misses:u # 2.20% of all branches

0.251147341 seconds time elapsed


Performance counter stats for './strscpy b 31 10000000':

176.598368 task-clock:u (msec) # 0.997 CPUs utilized
0 context-switches:u # 0.000 K/sec
0 cpu-migrations:u # 0.000 K/sec
46 page-faults:u # 0.260 K/sec
681,367,948 cycles:u # 3.858 GHz
2,660,090,092 instructions:u # 3.90 insn per cycle
680,017,138 branches:u # 3850.642 M/sec
1,817 branch-misses:u # 0.00% of all branches

0.177150181 seconds time elapsed

#include <stdlib.h>
#include <string.h>

#define CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS

#define REPEAT_BYTE(x) ((~0ul / 0xff) * (x))

#define E2BIG -1
#define PAGE_SIZE 4096
struct word_at_a_time {
const unsigned long one_bits, high_bits;
};

#define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) }
static inline long count_masked_bytes(unsigned long mask)
{
return mask*0x0001020304050608ul >> 56;
}
static inline unsigned long has_zero(unsigned long a, unsigned long *bits, const struct word_at_a_time *c)
{
unsigned long mask = ((a - c->one_bits) & ~a) & c->high_bits;
*bits = mask;
return mask;
}

static inline unsigned long prep_zero_mask(unsigned long a, unsigned long bits, const struct word_at_a_time *c)
{
return bits;
}

static inline unsigned long create_zero_mask(unsigned long bits)
{
bits = (bits - 1) & ~bits;
return bits >> 7;
}

/* The mask we created is directly usable as a bytemask */
#define zero_bytemask(mask) (mask)

static inline unsigned long find_zero(unsigned long mask)
{
return count_masked_bytes(mask);
}

__attribute__((noinline))
int strscpy_word(char *dest, const char *src, size_t count)
{
const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
size_t max = count;
long res = 0;

if (count == 0)
return -E2BIG;

#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
/*
* If src is unaligned, don't cross a page boundary,
* since we don't know if the next page is mapped.
*/
if ((long)src & (sizeof(long) - 1)) {
size_t limit = PAGE_SIZE - ((long)src & (PAGE_SIZE - 1));
if (limit < max)
max = limit;
}
#else
/* If src or dest is unaligned, don't do word-at-a-time. */
if (((long) dest | (long) src) & (sizeof(long) - 1))
max = 0;
#endif

while (max >= sizeof(unsigned long)) {
unsigned long c, data;

c = *(unsigned long *)(src+res);
if (has_zero(c, &data, &constants)) {
data = prep_zero_mask(c, data, &constants);
data = create_zero_mask(data);
*(unsigned long *)(dest+res) = c & zero_bytemask(data);
return res + find_zero(data);
}
*(unsigned long *)(dest+res) = c;
res += sizeof(unsigned long);
count -= sizeof(unsigned long);
max -= sizeof(unsigned long);
}

while (count) {
char c;

c = src[res];
dest[res] = c;
if (!c)
return res;
res++;
count--;
}

/* Hit buffer length without finding a NUL; force NUL-termination. */
if (res)
dest[res-1] = '\0';

return -E2BIG;
}

__attribute__((noinline))
int strscpy_byte(char *dest, const char *src, int count)
{
int res = 0;

while (count) {
char c;
c = src[res];
dest[res] = c;
if (!c)
return res;
res++;
count--;

}

/* Hit buffer length without finding a NUL; force NUL-termination. */
if (res)
dest[res-1] = '\0';

return -E2BIG;
}

char dest[4096] __attribute__((aligned(4096)));
char src[4096] __attribute__((aligned(4096)));

int main(int argc, char **argv)
{
unsigned long long i;
unsigned long src_len;
unsigned long count;

if (argc < 4)
return -1;

src_len = atoi(argv[2]);
count = atoi(argv[3]);

memset(src, 1, src_len);

if (argv[1][0] == 'w') {
for (i = 0; i < count; i++) {
strscpy_word(dest, src, sizeof(dest));
}
} else if (argv[1][0] == 'b') {
for (i = 0; i < count; i++) {
strscpy_byte(dest, src, sizeof(dest));
}
}
return 0;
}