[PATCH] reduce sha512_transform() stack usage, speedup

From: Denis Vlasenko
Date: Fri Oct 01 2004 - 16:02:04 EST


This applies on top of the previous patch:

This patch moves the large temporary array u64 W[80]
from the stack into the ctx struct:

* reduces stack usage by 640 bytes
* saves one 640-byte memset() per sha512_transform() call
(W is still zeroed once, after *all* iterations are done)
* quite unexpectedly, saves 1.6k of code on i386,
because stack offsets now fit into 8 bits
and many stack-addressing insns became 3 bytes smaller:

# size sha512.o.org sha512.o
text data bss dec hex filename
8281 372 0 8653 21cd sha512.o.org
6649 372 0 7021 1b6d sha512.o

# objdump -d sha512.o.org | cut -b9- >sha512.d.org
# objdump -d sha512.o | cut -b9- >sha512.d
# diff -u sha512.d.org sha512.d
[snip]
: 8b 4b 28 mov 0x28(%ebx),%ecx
: 8b 5b 2c mov 0x2c(%ebx),%ebx
-: 89 8d 44 fd ff ff mov %ecx,0xfffffd44(%ebp)
-: 89 9d 48 fd ff ff mov %ebx,0xfffffd48(%ebp)
-: 89 9d f4 fc ff ff mov %ebx,0xfffffcf4(%ebp)
+: 89 4d c4 mov %ecx,0xffffffc4(%ebp)
+: 89 5d c8 mov %ebx,0xffffffc8(%ebp)
+: 89 9d 64 ff ff ff mov %ebx,0xffffff64(%ebp)
: 8b 5d 08 mov 0x8(%ebp),%ebx
-: 89 8d f0 fc ff ff mov %ecx,0xfffffcf0(%ebp)
+: 89 8d 60 ff ff ff mov %ecx,0xffffff60(%ebp)
: 8b 42 30 mov 0x30(%edx),%eax
: 8b 52 34 mov 0x34(%edx),%edx
[snip]

WARNING: compile tested only.
--
vda
--- linux-2.6.9-rc3/crypto/sha512.c.org Fri Oct 1 22:17:14 2004
+++ linux-2.6.9-rc3/crypto/sha512.c Fri Oct 1 23:20:13 2004
@@ -30,6 +30,7 @@
u64 state[8];
u32 count[4];
u8 buf[128];
+ u64 W[80];
};

static inline u64 Ch(u64 x, u64 y, u64 z)
@@ -113,10 +114,9 @@
}

static void
-sha512_transform(u64 *state, const u8 *input)
+sha512_transform(u64 *state, u64 *W, const u8 *input)
{
u64 a, b, c, d, e, f, g, h, t1, t2;
- u64 W[80];

int i;

@@ -157,7 +157,6 @@

/* erase our data */
a = b = c = d = e = f = g = h = t1 = t2 = 0;
- memset(W, 0, 80 * sizeof(u64));
}

static void
@@ -215,10 +214,10 @@
/* Transform as many times as possible. */
if (len >= part_len) {
memcpy(&sctx->buf[index], data, part_len);
- sha512_transform(sctx->state, sctx->buf);
+ sha512_transform(sctx->state, sctx->W, sctx->buf);

for (i = part_len; i + 127 < len; i+=128)
- sha512_transform(sctx->state, &data[i]);
+ sha512_transform(sctx->state, sctx->W, &data[i]);

index = 0;
} else {
@@ -227,6 +226,9 @@

/* Buffer remaining input */
memcpy(&sctx->buf[index], &data[i], len - i);
+
+ /* erase our data */
+ memset(sctx->W, 0, sizeof(sctx->W));
}

static void