[PATCH] X86-32: Let gcc decide whether to inline memcpy (was: Re: New x86 warning)

From: Andi Kleen
Date: Wed Apr 22 2009 - 04:45:32 EST


Ingo Molnar <mingo@xxxxxxx> writes:

> * Jeff Garzik <jeff@xxxxxxxxxx> wrote:
>
>> On x86-32, this warning now appears for me in 2.6.30-rc3, and did
>> not appear in 2.6.29.
>>
>> drivers/acpi/acpica/tbfadt.c: In function 'acpi_tb_create_local_fadt':
>> /spare/repo/linux-2.6/arch/x86/include/asm/string_32.h:75: warning:
>> array subscript is above array bounds
>
> Last I checked it was a GCC bounds-check bogosity. All attempts to
> work around it or annotate it sanely (without changing the assembly
> code) failed. (New ideas welcome.)
>
> The closest I came was the hacklet below to the assembly code. [With
> an intentionally corrupted patch header to make it harder to apply
> accidentally.]

Modern gcc (and that is all that is supported now) should be able to
generate this code on its own already. So if you call __builtin_*, it
will just work without that explicit code; that is what the 64-bit
kernel does.
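
For illustration, a minimal standalone sketch (not kernel code, and the
function name is made up): with a small constant size, gcc expands
__builtin_memcpy into direct moves, with no call and no string
instructions at all.

	#include <stddef.h>

	struct hdr { char bytes[8]; };

	/* sizeof(*dst) is a compile-time constant, so at -O2 gcc
	 * typically inlines this as one or two register moves. */
	void copy_hdr(struct hdr *dst, const struct hdr *src)
	{
		__builtin_memcpy(dst, src, sizeof(*dst));
	}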

Here's a patch that does that, with some numbers. It gives a kernel
that is about 3K smaller on my configuration.

IMHO it's long overdue to do this for 32-bit too.

It's a very attractive patch because it removes a lot of code:

 arch/x86/include/asm/string_32.h |  127 ++-------------------------------------
 arch/x86/lib/memcpy_32.c         |   16 ++++

-Andi

---

X86-32: Let gcc decide whether to inline memcpy

Modern gcc has its own heuristics to decide whether string functions
should be inlined or not. That used to not be the case with old gccs,
but Linux doesn't support those anymore. The 64-bit kernel has always
done it this way. Just define memcpy to __builtin_memcpy and gcc should
do the right thing. Also supply an out-of-line memcpy that gcc can fall
back to when it decides not to inline.
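
For illustration only, not part of the patch (function name made up):
when the size is not a compile-time constant, gcc typically lowers
__builtin_memcpy to a plain call, and that call is what the out-of-line
memcpy below is there to serve.

	#include <stddef.h>

	/* n is unknown at compile time, so gcc will usually emit
	 * "call memcpy" here instead of inlining. */
	void copy_buf(void *dst, const void *src, size_t n)
	{
		__builtin_memcpy(dst, src, n);
	}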

First, this fixes the

arch/x86/include/asm/string_32.h:75: warning: array subscript is above array bounds

warning, which has been creeping up recently, simply by removing the
code that triggers it.

Then, trusting gcc actually makes the kernel nearly 3K smaller:

   text    data     bss     dec    hex filename
5503146  529444 1495040 7527630 72dcce vmlinux
5500373  529444 1495040 7524857 72d1f9 vmlinux-string

It also removes some quite ugly code, and will likely speed up
compilation a tiny bit because there is less inline code to process
for every file.

I did some quick boot tests and everything worked as expected. I left
the 3DNow! case alone for now.

Signed-off-by: Andi Kleen <ak@xxxxxxxxxxxxxxx>

---
 arch/x86/include/asm/string_32.h |  127 ++-------------------------------------
 arch/x86/lib/memcpy_32.c         |   16 ++++
2 files changed, 23 insertions(+), 120 deletions(-)

Index: linux-2.6.30-rc2-ak/arch/x86/include/asm/string_32.h
===================================================================
--- linux-2.6.30-rc2-ak.orig/arch/x86/include/asm/string_32.h 2009-04-22 10:22:33.000000000 +0200
+++ linux-2.6.30-rc2-ak/arch/x86/include/asm/string_32.h 2009-04-22 10:23:12.000000000 +0200
@@ -29,121 +29,10 @@
#define __HAVE_ARCH_STRLEN
extern size_t strlen(const char *s);

-static __always_inline void *__memcpy(void *to, const void *from, size_t n)
-{
- int d0, d1, d2;
- asm volatile("rep ; movsl\n\t"
- "movl %4,%%ecx\n\t"
- "andl $3,%%ecx\n\t"
- "jz 1f\n\t"
- "rep ; movsb\n\t"
- "1:"
- : "=&c" (d0), "=&D" (d1), "=&S" (d2)
- : "0" (n / 4), "g" (n), "1" ((long)to), "2" ((long)from)
- : "memory");
- return to;
-}
-
-/*
- * This looks ugly, but the compiler can optimize it totally,
- * as the count is constant.
- */
-static __always_inline void *__constant_memcpy(void *to, const void *from,
- size_t n)
-{
- long esi, edi;
- if (!n)
- return to;
-
- switch (n) {
- case 1:
- *(char *)to = *(char *)from;
- return to;
- case 2:
- *(short *)to = *(short *)from;
- return to;
- case 4:
- *(int *)to = *(int *)from;
- return to;
-
- case 3:
- *(short *)to = *(short *)from;
- *((char *)to + 2) = *((char *)from + 2);
- return to;
- case 5:
- *(int *)to = *(int *)from;
- *((char *)to + 4) = *((char *)from + 4);
- return to;
- case 6:
- *(int *)to = *(int *)from;
- *((short *)to + 2) = *((short *)from + 2);
- return to;
- case 8:
- *(int *)to = *(int *)from;
- *((int *)to + 1) = *((int *)from + 1);
- return to;
- }
-
- esi = (long)from;
- edi = (long)to;
- if (n >= 5 * 4) {
- /* large block: use rep prefix */
- int ecx;
- asm volatile("rep ; movsl"
- : "=&c" (ecx), "=&D" (edi), "=&S" (esi)
- : "0" (n / 4), "1" (edi), "2" (esi)
- : "memory"
- );
- } else {
- /* small block: don't clobber ecx + smaller code */
- if (n >= 4 * 4)
- asm volatile("movsl"
- : "=&D"(edi), "=&S"(esi)
- : "0"(edi), "1"(esi)
- : "memory");
- if (n >= 3 * 4)
- asm volatile("movsl"
- : "=&D"(edi), "=&S"(esi)
- : "0"(edi), "1"(esi)
- : "memory");
- if (n >= 2 * 4)
- asm volatile("movsl"
- : "=&D"(edi), "=&S"(esi)
- : "0"(edi), "1"(esi)
- : "memory");
- if (n >= 1 * 4)
- asm volatile("movsl"
- : "=&D"(edi), "=&S"(esi)
- : "0"(edi), "1"(esi)
- : "memory");
- }
- switch (n % 4) {
- /* tail */
- case 0:
- return to;
- case 1:
- asm volatile("movsb"
- : "=&D"(edi), "=&S"(esi)
- : "0"(edi), "1"(esi)
- : "memory");
- return to;
- case 2:
- asm volatile("movsw"
- : "=&D"(edi), "=&S"(esi)
- : "0"(edi), "1"(esi)
- : "memory");
- return to;
- default:
- asm volatile("movsw\n\tmovsb"
- : "=&D"(edi), "=&S"(esi)
- : "0"(edi), "1"(esi)
- : "memory");
- return to;
- }
-}
-
#define __HAVE_ARCH_MEMCPY

+extern void *__memcpy(void *to, const void *from, size_t n);
+
#ifdef CONFIG_X86_USE_3DNOW

#include <asm/mmx.h>
@@ -155,7 +44,7 @@
static inline void *__constant_memcpy3d(void *to, const void *from, size_t len)
{
if (len < 512)
- return __constant_memcpy(to, from, len);
+ return __memcpy(to, from, len);
return _mmx_memcpy(to, from, len);
}

@@ -168,20 +57,18 @@

#define memcpy(t, f, n) \
(__builtin_constant_p((n)) \
- ? __constant_memcpy3d((t), (f), (n)) \
+ ? __builtin_memcpy((t), (f), (n)) \
: __memcpy3d((t), (f), (n)))

#else

/*
* No 3D Now!
+ *
+ * Let gcc figure it out.
*/

-#define memcpy(t, f, n) \
- (__builtin_constant_p((n)) \
- ? __constant_memcpy((t), (f), (n)) \
- : __memcpy((t), (f), (n)))
-
+#define memcpy(t, f, n) __builtin_memcpy(t, f, n)
#endif

#define __HAVE_ARCH_MEMMOVE
Index: linux-2.6.30-rc2-ak/arch/x86/lib/memcpy_32.c
===================================================================
--- linux-2.6.30-rc2-ak.orig/arch/x86/lib/memcpy_32.c 2009-04-22 10:22:33.000000000 +0200
+++ linux-2.6.30-rc2-ak/arch/x86/lib/memcpy_32.c 2009-04-22 10:23:56.000000000 +0200
@@ -4,6 +4,22 @@
#undef memcpy
#undef memset

+void *__memcpy(void *to, const void *from, size_t n)
+{
+ int d0, d1, d2;
+ asm volatile("rep ; movsl\n\t"
+ "movl %4,%%ecx\n\t"
+ "andl $3,%%ecx\n\t"
+ "jz 1f\n\t"
+ "rep ; movsb\n\t"
+ "1:"
+ : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+ : "0" (n / 4), "g" (n), "1" ((long)to), "2" ((long)from)
+ : "memory");
+ return to;
+}
+EXPORT_SYMBOL(__memcpy);
+
void *memcpy(void *to, const void *from, size_t n)
{
#ifdef CONFIG_X86_USE_3DNOW
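
FWIW, the size figures above are plain "size vmlinux" output. To double
check what gcc decided for a particular file, something along these
lines works (file names are just examples):

	gcc -O2 -S test.c
	grep 'call.*memcpy' test.s

If the grep matches, gcc fell back to the out-of-line version;
otherwise it inlined the copy.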



--
ak@xxxxxxxxxxxxxxx -- Speaking for myself only.