Re: [PATCH bpf-next v2 1/2] bpf,riscv: Implement PROBE_MEM32 pseudo instructions

From: Pu Lehui
Date: Mon Mar 25 2024 - 23:16:02 EST




On 2024/3/26 1:15, Puranjay Mohan wrote:
Björn Töpel <bjorn@xxxxxxxxxx> writes:

Puranjay Mohan <puranjay12@xxxxxxxxx> writes:

Add support for [LDX | STX | ST], PROBE_MEM32, [B | H | W | DW]
instructions. They are similar to PROBE_MEM instructions with the
following differences:
- PROBE_MEM32 supports store.
- PROBE_MEM32 relies on the verifier to clear upper 32-bit of the
src/dst register
- PROBE_MEM32 adds 64-bit kern_vm_start address (which is stored in S7
in the prologue). Due to bpf_arena constructions such S7 + reg +
off16 access is guaranteed to be within arena virtual range, so no
address check at run-time.
- S7 is a free callee-saved register, so it is used to store kern_vm_start
- PROBE_MEM32 allows STX and ST. If they fault the store is a nop. When
LDX faults the destination register is zeroed.

To support these on riscv, we do tmp = S7 + src/dst reg and then use
tmp2 as the new src/dst register. This allows us to reuse most of the
code for normal [LDX | STX | ST].

Cool to see the RV BPF JIT keeping up with x86 features! ;-) Nice work!

It is my self proclaimed duty to make sure that all 64-bit JITs have
feature parity. :D


A couple of minor comments below.

Signed-off-by: Puranjay Mohan <puranjay12@xxxxxxxxx>
---
arch/riscv/net/bpf_jit.h | 1 +
arch/riscv/net/bpf_jit_comp64.c | 193 +++++++++++++++++++++++++++++++-
arch/riscv/net/bpf_jit_core.c | 1 +
3 files changed, 192 insertions(+), 3 deletions(-)

diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h
index f4b6b3b9edda..8a47da08dd9c 100644
--- a/arch/riscv/net/bpf_jit.h
+++ b/arch/riscv/net/bpf_jit.h
@@ -81,6 +81,7 @@ struct rv_jit_context {
int nexentries;
unsigned long flags;
int stack_size;
+ u64 arena_vm_start;
};
/* Convert from ninsns to bytes. */
diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
index 1adf2f39ce59..0c0588e327af 100644
--- a/arch/riscv/net/bpf_jit_comp64.c
+++ b/arch/riscv/net/bpf_jit_comp64.c
@@ -255,6 +255,10 @@ static void __build_epilogue(bool is_tail_call, struct rv_jit_context *ctx)
emit_ld(RV_REG_S6, store_offset, RV_REG_SP, ctx);
store_offset -= 8;
}
+ if (ctx->arena_vm_start) {
+ emit_ld(RV_REG_S7, store_offset, RV_REG_SP, ctx);
+ store_offset -= 8;
+ }

As RV_REG_S7 is only for bpf arena, how about define this register as bellow, like RV_REG_TCC

#define RV_REG_ARENA RV_REG_S7

emit_addi(RV_REG_SP, RV_REG_SP, stack_adjust, ctx);
/* Set return value. */
@@ -548,6 +552,7 @@ static void emit_atomic(u8 rd, u8 rs, s16 off, s32 imm, bool is64,
#define BPF_FIXUP_OFFSET_MASK GENMASK(26, 0)
#define BPF_FIXUP_REG_MASK GENMASK(31, 27)
+#define DONT_CLEAR 17 /* RV_REG_A7 unused in pt_regmap */

Hmm, so this is just a a sentinel node, right? Isn't it more robust to
use, say REG_ZERO which will never be used? Maybe REG_DONT_CLEAR_MARKER
or smth, so it's obvious how it's used?

Yes, I agree, RV_REG_ZERO would be the best thing to use here.



bool ex_handler_bpf(const struct exception_table_entry *ex,
struct pt_regs *regs)
@@ -555,7 +560,8 @@ bool ex_handler_bpf(const struct exception_table_entry *ex,
off_t offset = FIELD_GET(BPF_FIXUP_OFFSET_MASK, ex->fixup);
int regs_offset = FIELD_GET(BPF_FIXUP_REG_MASK, ex->fixup);
- *(unsigned long *)((void *)regs + pt_regmap[regs_offset]) = 0;
+ if (regs_offset != DONT_CLEAR)
+ *(unsigned long *)((void *)regs + pt_regmap[regs_offset]) = 0;
regs->epc = (unsigned long)&ex->fixup - offset;
return true;
@@ -572,7 +578,8 @@ static int add_exception_handler(const struct bpf_insn *insn,
off_t fixup_offset;
if (!ctx->insns || !ctx->ro_insns || !ctx->prog->aux->extable ||
- (BPF_MODE(insn->code) != BPF_PROBE_MEM && BPF_MODE(insn->code) != BPF_PROBE_MEMSX))
+ (BPF_MODE(insn->code) != BPF_PROBE_MEM && BPF_MODE(insn->code) != BPF_PROBE_MEMSX &&
+ BPF_MODE(insn->code) != BPF_PROBE_MEM32))
return 0;
if (WARN_ON_ONCE(ctx->nexentries >= ctx->prog->aux->num_exentries))
@@ -622,6 +629,9 @@ static int add_exception_handler(const struct bpf_insn *insn,
ex->insn = ins_offset;
+ if (BPF_CLASS(insn->code) != BPF_LDX)
+ dst_reg = DONT_CLEAR;
+

Instead of having a side-effect, and passing a dummy dst_reg for the
probe_mem32, just explicitly add DONT_CLEAR when calling
add_exception_handler(). It's more obvious to me at least.

Sure, will do that in the next version.


ex->fixup = FIELD_PREP(BPF_FIXUP_OFFSET_MASK, fixup_offset) |
FIELD_PREP(BPF_FIXUP_REG_MASK, dst_reg);
ex->type = EX_TYPE_BPF;
@@ -1063,7 +1073,7 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
BPF_CLASS(insn->code) == BPF_JMP;
int s, e, rvoff, ret, i = insn - ctx->prog->insnsi;
struct bpf_prog_aux *aux = ctx->prog->aux;
- u8 rd = -1, rs = -1, code = insn->code;
+ u8 rd = -1, rs = -1, code = insn->code, reg_arena_vm_start = RV_REG_S7;
s16 off = insn->off;
s32 imm = insn->imm;
@@ -1539,6 +1549,11 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
case BPF_LDX | BPF_PROBE_MEMSX | BPF_B:
case BPF_LDX | BPF_PROBE_MEMSX | BPF_H:
case BPF_LDX | BPF_PROBE_MEMSX | BPF_W:
+ /* LDX | PROBE_MEM32: dst = *(unsigned size *)(src + S7 + off)*/
+ case BPF_LDX | BPF_PROBE_MEM32 | BPF_B:
+ case BPF_LDX | BPF_PROBE_MEM32 | BPF_H:
+ case BPF_LDX | BPF_PROBE_MEM32 | BPF_W:
+ case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW:
{
int insn_len, insns_start;
bool sign_ext;
@@ -1546,6 +1561,11 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
sign_ext = BPF_MODE(insn->code) == BPF_MEMSX ||
BPF_MODE(insn->code) == BPF_PROBE_MEMSX;
+ if (BPF_MODE(insn->code) == BPF_PROBE_MEM32) {
+ emit_add(RV_REG_T2, rs, reg_arena_vm_start, ctx);
+ rs = RV_REG_T2;
+ }
+
switch (BPF_SIZE(code)) {
case BPF_B:
if (is_12b_int(off)) {
@@ -1682,6 +1702,87 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
emit_sd(RV_REG_T2, 0, RV_REG_T1, ctx);
break;
+ case BPF_ST | BPF_PROBE_MEM32 | BPF_B:
+ case BPF_ST | BPF_PROBE_MEM32 | BPF_H:
+ case BPF_ST | BPF_PROBE_MEM32 | BPF_W:
+ case BPF_ST | BPF_PROBE_MEM32 | BPF_DW:
+ {
+ int insn_len, insns_start;
+
+ emit_add(RV_REG_T3, rd, reg_arena_vm_start, ctx);
+ rd = RV_REG_T3;
+
+ /* Load imm to a register then store it */
+ emit_imm(RV_REG_T1, imm, ctx);
+
+ switch (BPF_SIZE(code)) {
+ case BPF_B:
+ if (is_12b_int(off)) {
+ insns_start = ctx->ninsns;
+ emit(rv_sb(rd, off, RV_REG_T1), ctx);
+ insn_len = ctx->ninsns - insns_start;
+ break;
+ }
+
+ emit_imm(RV_REG_T2, off, ctx);
+ emit_add(RV_REG_T2, RV_REG_T2, rd, ctx);
+ insns_start = ctx->ninsns;
+ emit(rv_sb(RV_REG_T2, 0, RV_REG_T1), ctx);
+ insn_len = ctx->ninsns - insns_start;
+
+ break;
+
+ case BPF_H:
+ if (is_12b_int(off)) {
+ insns_start = ctx->ninsns;
+ emit(rv_sh(rd, off, RV_REG_T1), ctx);
+ insn_len = ctx->ninsns - insns_start;
+ break;
+ }
+
+ emit_imm(RV_REG_T2, off, ctx);
+ emit_add(RV_REG_T2, RV_REG_T2, rd, ctx);
+ insns_start = ctx->ninsns;
+ emit(rv_sh(RV_REG_T2, 0, RV_REG_T1), ctx);
+ insn_len = ctx->ninsns - insns_start;
+ break;
+ case BPF_W:
+ if (is_12b_int(off)) {
+ insns_start = ctx->ninsns;
+ emit_sw(rd, off, RV_REG_T1, ctx);
+ insn_len = ctx->ninsns - insns_start;
+ break;
+ }
+
+ emit_imm(RV_REG_T2, off, ctx);
+ emit_add(RV_REG_T2, RV_REG_T2, rd, ctx);
+ insns_start = ctx->ninsns;
+ emit_sw(RV_REG_T2, 0, RV_REG_T1, ctx);
+ insn_len = ctx->ninsns - insns_start;
+ break;
+ case BPF_DW:
+ if (is_12b_int(off)) {
+ insns_start = ctx->ninsns;
+ emit_sd(rd, off, RV_REG_T1, ctx);
+ insn_len = ctx->ninsns - insns_start;
+ break;
+ }
+
+ emit_imm(RV_REG_T2, off, ctx);
+ emit_add(RV_REG_T2, RV_REG_T2, rd, ctx);
+ insns_start = ctx->ninsns;
+ emit_sd(RV_REG_T2, 0, RV_REG_T1, ctx);
+ insn_len = ctx->ninsns - insns_start;
+ break;
+ }

A lot of similar code, with emit of different sizes. Possible to move
move out to a function, and wrap the emits? The main loop is hard read
already!

I thought about this as well. My plan is to refactor the whole thing in a
seperate patch. I did not do it with this feature as it will cause a lot
of unrelated code churn.

Yeah, I think we could do that factor out for LDX, ST, STX, while I had done it before another riscv bpf arena. BUT, looking forword to your implementation.😄


Thanks,
Puranjay