[PATCH RT v3] crypto: limit more FPU-enabled sections

From: Sebastian Andrzej Siewior
Date: Fri Dec 01 2017 - 09:03:10 EST


Those crypto drivers use SSE/AVX/â for their crypto work and in order to
do so in kernel they need to enable the "FPU" in kernel mode which
disables preemption.
There are two problems with the way they are used:
- the while loop which processes X bytes may create latency spikes and
should be avoided or limited.
- the cipher-walk-next part may allocate/free memory and may use
kmap_atomic().

The whole kernel_fpu_begin()/end() processing isn't probably that cheap.
It most likely makes sense to process as much of those as possible in one
go. The new *_fpu_sched_rt() schedules only if a RT task is pending.

Probably we should measure the performance those ciphers in pure SW
mode and with this optimisations to see if it makes sense to keep them
for RT.

This kernel_fpu_resched() makes the code more preemptible which might hurt
performance.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@xxxxxxxxxxxxx>
---
arch/x86/crypto/camellia_aesni_avx2_glue.c | 20 ++++++++++++++++++++
arch/x86/crypto/camellia_aesni_avx_glue.c | 19 +++++++++++++++++++
arch/x86/crypto/cast6_avx_glue.c | 24 +++++++++++++++++++-----
arch/x86/crypto/chacha20_glue.c | 9 +++++----
arch/x86/crypto/serpent_avx2_glue.c | 19 +++++++++++++++++++
arch/x86/crypto/serpent_avx_glue.c | 23 +++++++++++++++++++----
arch/x86/crypto/serpent_sse2_glue.c | 23 +++++++++++++++++++----
arch/x86/crypto/twofish_avx_glue.c | 27 +++++++++++++++++++++++++--
arch/x86/include/asm/fpu/api.h | 1 +
arch/x86/kernel/fpu/core.c | 12 ++++++++++++
10 files changed, 158 insertions(+), 19 deletions(-)

--- a/arch/x86/crypto/camellia_aesni_avx2_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c
@@ -206,6 +206,20 @@ struct crypt_priv {
bool fpu_enabled;
};

+#ifdef CONFIG_PREEMPT_RT_FULL
+static void camellia_fpu_end_rt(struct crypt_priv *ctx)
+{
+ bool fpu_enabled = ctx->fpu_enabled;
+
+ if (!fpu_enabled)
+ return;
+ camellia_fpu_end(fpu_enabled);
+ ctx->fpu_enabled = false;
+}
+#else
+static void camellia_fpu_end_rt(struct crypt_priv *ctx) { }
+#endif
+
static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
{
const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
@@ -221,16 +235,19 @@ static void encrypt_callback(void *priv,
}

if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
+ kernel_fpu_resched();
camellia_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
}

while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
+ kernel_fpu_resched();
camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst);
srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
}
+ camellia_fpu_end_rt(ctx);

for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
camellia_enc_blk(ctx->ctx, srcdst, srcdst);
@@ -251,16 +268,19 @@ static void decrypt_callback(void *priv,
}

if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
+ kernel_fpu_resched();
camellia_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
}

while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
+ kernel_fpu_resched();
camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst);
srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
}
+ camellia_fpu_end_rt(ctx);

for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
camellia_dec_blk(ctx->ctx, srcdst, srcdst);
--- a/arch/x86/crypto/camellia_aesni_avx_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -210,6 +210,21 @@ struct crypt_priv {
bool fpu_enabled;
};

+#ifdef CONFIG_PREEMPT_RT_FULL
+static void camellia_fpu_end_rt(struct crypt_priv *ctx)
+{
+ bool fpu_enabled = ctx->fpu_enabled;
+
+ if (!fpu_enabled)
+ return;
+ camellia_fpu_end(fpu_enabled);
+ ctx->fpu_enabled = false;
+}
+
+#else
+static void camellia_fpu_end_rt(struct crypt_priv *ctx) { }
+#endif
+
static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
{
const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
@@ -225,10 +240,12 @@ static void encrypt_callback(void *priv,
}

while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
+ kernel_fpu_resched();
camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst);
srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
}
+ camellia_fpu_end_rt(ctx);

for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
camellia_enc_blk(ctx->ctx, srcdst, srcdst);
@@ -249,10 +266,12 @@ static void decrypt_callback(void *priv,
}

while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
+ kernel_fpu_resched();
camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst);
srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
}
+ camellia_fpu_end_rt(ctx);

for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
camellia_dec_blk(ctx->ctx, srcdst, srcdst);
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -205,19 +205,33 @@ struct crypt_priv {
bool fpu_enabled;
};

+#ifdef CONFIG_PREEMPT_RT_FULL
+static void cast6_fpu_end_rt(struct crypt_priv *ctx)
+{
+ bool fpu_enabled = ctx->fpu_enabled;
+
+ if (!fpu_enabled)
+ return;
+ cast6_fpu_end(fpu_enabled);
+ ctx->fpu_enabled = false;
+}
+
+#else
+static void cast6_fpu_end_rt(struct crypt_priv *ctx) { }
+#endif
+
static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
{
const unsigned int bsize = CAST6_BLOCK_SIZE;
struct crypt_priv *ctx = priv;
int i;

- ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
-
if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
+ ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
cast6_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
+ cast6_fpu_end_rt(ctx);
return;
}
-
for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
__cast6_encrypt(ctx->ctx, srcdst, srcdst);
}
@@ -228,10 +242,10 @@ static void decrypt_callback(void *priv,
struct crypt_priv *ctx = priv;
int i;

- ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
-
if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
+ ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
cast6_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
+ cast6_fpu_end_rt(ctx);
return;
}

--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -81,23 +81,24 @@ static int chacha20_simd(struct skcipher

crypto_chacha20_init(state, ctx, walk.iv);

- kernel_fpu_begin();
-
while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
+ kernel_fpu_begin();
+
chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE));
+ kernel_fpu_end();
err = skcipher_walk_done(&walk,
walk.nbytes % CHACHA20_BLOCK_SIZE);
}

if (walk.nbytes) {
+ kernel_fpu_begin();
chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
walk.nbytes);
+ kernel_fpu_end();
err = skcipher_walk_done(&walk, 0);
}

- kernel_fpu_end();
-
return err;
}

--- a/arch/x86/crypto/serpent_avx2_glue.c
+++ b/arch/x86/crypto/serpent_avx2_glue.c
@@ -184,6 +184,21 @@ struct crypt_priv {
bool fpu_enabled;
};

+#ifdef CONFIG_PREEMPT_RT_FULL
+static void serpent_fpu_end_rt(struct crypt_priv *ctx)
+{
+ bool fpu_enabled = ctx->fpu_enabled;
+
+ if (!fpu_enabled)
+ return;
+ serpent_fpu_end(fpu_enabled);
+ ctx->fpu_enabled = false;
+}
+
+#else
+static void serpent_fpu_end_rt(struct crypt_priv *ctx) { }
+#endif
+
static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
{
const unsigned int bsize = SERPENT_BLOCK_SIZE;
@@ -199,10 +214,12 @@ static void encrypt_callback(void *priv,
}

while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) {
+ kernel_fpu_resched();
serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);
srcdst += bsize * SERPENT_PARALLEL_BLOCKS;
nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
}
+ serpent_fpu_end_rt(ctx);

for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
__serpent_encrypt(ctx->ctx, srcdst, srcdst);
@@ -223,10 +240,12 @@ static void decrypt_callback(void *priv,
}

while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) {
+ kernel_fpu_resched();
serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst);
srcdst += bsize * SERPENT_PARALLEL_BLOCKS;
nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
}
+ serpent_fpu_end_rt(ctx);

for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
__serpent_decrypt(ctx->ctx, srcdst, srcdst);
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -218,16 +218,31 @@ struct crypt_priv {
bool fpu_enabled;
};

+#ifdef CONFIG_PREEMPT_RT_FULL
+static void serpent_fpu_end_rt(struct crypt_priv *ctx)
+{
+ bool fpu_enabled = ctx->fpu_enabled;
+
+ if (!fpu_enabled)
+ return;
+ serpent_fpu_end(fpu_enabled);
+ ctx->fpu_enabled = false;
+}
+
+#else
+static void serpent_fpu_end_rt(struct crypt_priv *ctx) { }
+#endif
+
static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
{
const unsigned int bsize = SERPENT_BLOCK_SIZE;
struct crypt_priv *ctx = priv;
int i;

- ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
-
if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
+ ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);
+ serpent_fpu_end_rt(ctx);
return;
}

@@ -241,10 +256,10 @@ static void decrypt_callback(void *priv,
struct crypt_priv *ctx = priv;
int i;

- ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
-
if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
+ ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst);
+ serpent_fpu_end_rt(ctx);
return;
}

--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -187,16 +187,31 @@ struct crypt_priv {
bool fpu_enabled;
};

+#ifdef CONFIG_PREEMPT_RT_FULL
+static void serpent_fpu_end_rt(struct crypt_priv *ctx)
+{
+ bool fpu_enabled = ctx->fpu_enabled;
+
+ if (!fpu_enabled)
+ return;
+ serpent_fpu_end(fpu_enabled);
+ ctx->fpu_enabled = false;
+}
+
+#else
+static void serpent_fpu_end_rt(struct crypt_priv *ctx) { }
+#endif
+
static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
{
const unsigned int bsize = SERPENT_BLOCK_SIZE;
struct crypt_priv *ctx = priv;
int i;

- ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
-
if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
+ ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst);
+ serpent_fpu_end_rt(ctx);
return;
}

@@ -210,10 +225,10 @@ static void decrypt_callback(void *priv,
struct crypt_priv *ctx = priv;
int i;

- ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
-
if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
+ ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst);
+ serpent_fpu_end_rt(ctx);
return;
}

--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -218,6 +218,21 @@ struct crypt_priv {
bool fpu_enabled;
};

+#ifdef CONFIG_PREEMPT_RT_FULL
+static void twofish_fpu_end_rt(struct crypt_priv *ctx)
+{
+ bool fpu_enabled = ctx->fpu_enabled;
+
+ if (!fpu_enabled)
+ return;
+ twofish_fpu_end(fpu_enabled);
+ ctx->fpu_enabled = false;
+}
+
+#else
+static void twofish_fpu_end_rt(struct crypt_priv *ctx) { }
+#endif
+
static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
{
const unsigned int bsize = TF_BLOCK_SIZE;
@@ -228,12 +243,16 @@ static void encrypt_callback(void *priv,

if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
+ twofish_fpu_end_rt(ctx);
return;
}

- for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3)
+ for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) {
+ kernel_fpu_resched();
twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst);
+ }

+ twofish_fpu_end_rt(ctx);
nbytes %= bsize * 3;

for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
@@ -250,11 +269,15 @@ static void decrypt_callback(void *priv,

if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
+ twofish_fpu_end_rt(ctx);
return;
}

- for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3)
+ for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) {
+ kernel_fpu_resched();
twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst);
+ }
+ twofish_fpu_end_rt(ctx);

nbytes %= bsize * 3;

--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -25,6 +25,7 @@ extern void __kernel_fpu_begin(void);
extern void __kernel_fpu_end(void);
extern void kernel_fpu_begin(void);
extern void kernel_fpu_end(void);
+extern void kernel_fpu_resched(void);
extern bool irq_fpu_usable(void);

/*
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -137,6 +137,18 @@ void kernel_fpu_end(void)
}
EXPORT_SYMBOL_GPL(kernel_fpu_end);

+void kernel_fpu_resched(void)
+{
+ WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));
+
+ if (should_resched(PREEMPT_OFFSET)) {
+ kernel_fpu_end();
+ cond_resched();
+ kernel_fpu_begin();
+ }
+}
+EXPORT_SYMBOL_GPL(kernel_fpu_resched);
+
/*
* Save the FPU state (mark it for reload if necessary):
*