Re: [RFC PATCH v2 for 4.15 08/14] Provide cpu_opv system call

From: Mathieu Desnoyers
Date: Mon Nov 06 2017 - 21:40:22 EST


----- On Nov 6, 2017, at 9:07 PM, Boqun Feng boqun.feng@xxxxxxxxx wrote:

> On Mon, Nov 06, 2017 at 03:56:38PM -0500, Mathieu Desnoyers wrote:
> [...]
>> +static int cpu_op_pin_pages(unsigned long addr, unsigned long len,
>> +		struct page ***pinned_pages_ptr, size_t *nr_pinned,
>> +		int write)
>> +{
>> +	struct page *pages[2];
>> +	int ret, nr_pages;
>> +
>> +	if (!len)
>> +		return 0;
>> +	nr_pages = cpu_op_range_nr_pages(addr, len);
>> +	BUG_ON(nr_pages > 2);
>> +	if (*nr_pinned + nr_pages > NR_PINNED_PAGES_ON_STACK) {
>
> Is this a bug? It seems you will kzalloc() every time *nr_pinned is
> bigger than NR_PINNED_PAGES_ON_STACK, which will result in a memory
> leak.
>
> I think the logic here is complex enough for us to introduce a
> structure, like:
>
> struct cpu_opv_page_pinner {
> 	int nr_pinned;
> 	bool is_kmalloc;
> 	struct page **pinned_pages;
> };
>
> Thoughts?

Good catch!

How about the attached diff? I'll fold it into the rseq/dev tree.
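
In short, the diff carries the pinned-page array, its length, and an
is_kmalloc flag in a single struct cpu_opv_pinned_pages, so the fallback
to a kzalloc'd array happens at most once, and the array is freed exactly
when that flag is set, instead of leaking a fresh allocation on every
call once we go past NR_PINNED_PAGES_ON_STACK.

To make the allocate-once pattern explicit, here is a minimal user-space
sketch of the same idea (the names reserve()/release() and the constants
are only illustrative, not part of the patch; plain calloc/free stand in
for kzalloc/kfree):

#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

#define ON_STACK	8	/* stands in for NR_PINNED_PAGES_ON_STACK */
#define MAX_TOTAL	64	/* stands in for CPU_OP_VEC_LEN_MAX * CPU_OP_MAX_PAGES */

struct pinned {
	void **pages;		/* stack buffer at first, heap array after growth */
	size_t nr;		/* entries used so far */
	bool is_kmalloc;	/* true once (and only once) we switch to the heap */
};

/* Grow to the heap array at most once, preserving already-stored entries. */
static int reserve(struct pinned *p, size_t nr_new)
{
	if (!p->is_kmalloc && p->nr + nr_new > ON_STACK) {
		void **heap = calloc(MAX_TOTAL, sizeof(*heap));

		if (!heap)
			return -1;
		memcpy(heap, p->pages, p->nr * sizeof(*heap));
		p->pages = heap;
		p->is_kmalloc = true;
	}
	return 0;
}

/* Free the array only if we actually grew it. */
static void release(struct pinned *p)
{
	if (p->is_kmalloc)
		free(p->pages);
}

int main(void)
{
	void *stack_buf[ON_STACK];
	struct pinned p = { .pages = stack_buf, .nr = 0, .is_kmalloc = false };
	size_t i;

	/* Pretend to pin MAX_TOTAL pages; the heap array is allocated once. */
	for (i = 0; i < MAX_TOTAL; i++) {
		if (reserve(&p, 1))
			break;
		p.pages[p.nr++] = (void *)&p;	/* dummy stand-in for a pinned page */
	}
	release(&p);
	return 0;
}

Because is_kmalloc guards the growth path, a later reserve() call past the
threshold reuses the existing heap array instead of allocating (and
leaking) a new one, which is the case Boqun pointed out.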

Thanks,

Mathieu

--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
diff --git a/kernel/cpu_opv.c b/kernel/cpu_opv.c
index 09754bbe6a4f..3d8fd66416a0 100644
--- a/kernel/cpu_opv.c
+++ b/kernel/cpu_opv.c
@@ -46,6 +46,12 @@ union op_fn_data {
 #endif
 };
 
+struct cpu_opv_pinned_pages {
+	struct page **pages;
+	size_t nr;
+	bool is_kmalloc;
+};
+
 typedef int (*op_fn_t)(union op_fn_data *data, uint64_t v, uint32_t len);
 
 static DEFINE_MUTEX(cpu_opv_offline_lock);
@@ -217,8 +223,7 @@ static int cpu_op_check_pages(struct page **pages,
 }
 
 static int cpu_op_pin_pages(unsigned long addr, unsigned long len,
-		struct page ***pinned_pages_ptr, size_t *nr_pinned,
-		int write)
+		struct cpu_opv_pinned_pages *pin_pages, int write)
 {
 	struct page *pages[2];
 	int ret, nr_pages;
@@ -227,15 +232,17 @@ static int cpu_op_pin_pages(unsigned long addr, unsigned long len,
 		return 0;
 	nr_pages = cpu_op_range_nr_pages(addr, len);
 	BUG_ON(nr_pages > 2);
-	if (*nr_pinned + nr_pages > NR_PINNED_PAGES_ON_STACK) {
+	if (!pin_pages->is_kmalloc && pin_pages->nr + nr_pages
+			> NR_PINNED_PAGES_ON_STACK) {
 		struct page **pinned_pages =
 			kzalloc(CPU_OP_VEC_LEN_MAX * CPU_OP_MAX_PAGES
 				* sizeof(struct page *), GFP_KERNEL);
 		if (!pinned_pages)
 			return -ENOMEM;
-		memcpy(pinned_pages, *pinned_pages_ptr,
-			*nr_pinned * sizeof(struct page *));
-		*pinned_pages_ptr = pinned_pages;
+		memcpy(pinned_pages, pin_pages->pages,
+			pin_pages->nr * sizeof(struct page *));
+		pin_pages->pages = pinned_pages;
+		pin_pages->is_kmalloc = true;
 	}
 again:
 	ret = get_user_pages_fast(addr, nr_pages, write, pages);
@@ -257,9 +264,9 @@ static int cpu_op_pin_pages(unsigned long addr, unsigned long len,
 	}
 	if (ret)
 		goto error;
-	(*pinned_pages_ptr)[(*nr_pinned)++] = pages[0];
+	pin_pages->pages[pin_pages->nr++] = pages[0];
 	if (nr_pages > 1)
-		(*pinned_pages_ptr)[(*nr_pinned)++] = pages[1];
+		pin_pages->pages[pin_pages->nr++] = pages[1];
 	return 0;
 
 error:
@@ -270,7 +277,7 @@ static int cpu_op_pin_pages(unsigned long addr, unsigned long len,
 }
 
 static int cpu_opv_pin_pages(struct cpu_op *cpuop, int cpuopcnt,
-		struct page ***pinned_pages_ptr, size_t *nr_pinned)
+		struct cpu_opv_pinned_pages *pin_pages)
 {
 	int ret, i;
 	bool expect_fault = false;
@@ -289,7 +296,7 @@ static int cpu_opv_pin_pages(struct cpu_op *cpuop, int cpuopcnt,
 				goto error;
 			ret = cpu_op_pin_pages(
 					(unsigned long)op->u.compare_op.a,
-					op->len, pinned_pages_ptr, nr_pinned, 0);
+					op->len, pin_pages, 0);
 			if (ret)
 				goto error;
 			ret = -EFAULT;
@@ -299,7 +306,7 @@ static int cpu_opv_pin_pages(struct cpu_op *cpuop, int cpuopcnt,
 				goto error;
 			ret = cpu_op_pin_pages(
 					(unsigned long)op->u.compare_op.b,
-					op->len, pinned_pages_ptr, nr_pinned, 0);
+					op->len, pin_pages, 0);
 			if (ret)
 				goto error;
 			break;
@@ -311,7 +318,7 @@ static int cpu_opv_pin_pages(struct cpu_op *cpuop, int cpuopcnt,
 				goto error;
 			ret = cpu_op_pin_pages(
 					(unsigned long)op->u.memcpy_op.dst,
-					op->len, pinned_pages_ptr, nr_pinned, 1);
+					op->len, pin_pages, 1);
 			if (ret)
 				goto error;
 			ret = -EFAULT;
@@ -321,7 +328,7 @@ static int cpu_opv_pin_pages(struct cpu_op *cpuop, int cpuopcnt,
 				goto error;
 			ret = cpu_op_pin_pages(
 					(unsigned long)op->u.memcpy_op.src,
-					op->len, pinned_pages_ptr, nr_pinned, 0);
+					op->len, pin_pages, 0);
 			if (ret)
 				goto error;
 			break;
@@ -333,7 +340,7 @@ static int cpu_opv_pin_pages(struct cpu_op *cpuop, int cpuopcnt,
 				goto error;
 			ret = cpu_op_pin_pages(
 					(unsigned long)op->u.arithmetic_op.p,
-					op->len, pinned_pages_ptr, nr_pinned, 1);
+					op->len, pin_pages, 1);
 			if (ret)
 				goto error;
 			break;
@@ -347,7 +354,7 @@ static int cpu_opv_pin_pages(struct cpu_op *cpuop, int cpuopcnt,
 				goto error;
 			ret = cpu_op_pin_pages(
 					(unsigned long)op->u.bitwise_op.p,
-					op->len, pinned_pages_ptr, nr_pinned, 1);
+					op->len, pin_pages, 1);
 			if (ret)
 				goto error;
 			break;
@@ -360,7 +367,7 @@ static int cpu_opv_pin_pages(struct cpu_op *cpuop, int cpuopcnt,
 				goto error;
 			ret = cpu_op_pin_pages(
 					(unsigned long)op->u.shift_op.p,
-					op->len, pinned_pages_ptr, nr_pinned, 1);
+					op->len, pin_pages, 1);
 			if (ret)
 				goto error;
 			break;
@@ -373,9 +380,9 @@ static int cpu_opv_pin_pages(struct cpu_op *cpuop, int cpuopcnt,
 	return 0;
 
 error:
-	for (i = 0; i < *nr_pinned; i++)
-		put_page((*pinned_pages_ptr)[i]);
-	*nr_pinned = 0;
+	for (i = 0; i < pin_pages->nr; i++)
+		put_page(pin_pages->pages[i]);
+	pin_pages->nr = 0;
 	/*
 	 * If faulting access is expected, return EAGAIN to user-space.
 	 * It allows user-space to distinguish between a fault caused by
@@ -923,9 +930,12 @@ SYSCALL_DEFINE4(cpu_opv, struct cpu_op __user *, ucpuopv, int, cpuopcnt,
 {
 	struct cpu_op cpuopv[CPU_OP_VEC_LEN_MAX];
 	struct page *pinned_pages_on_stack[NR_PINNED_PAGES_ON_STACK];
-	struct page **pinned_pages = pinned_pages_on_stack;
+	struct cpu_opv_pinned_pages pin_pages = {
+		.pages = pinned_pages_on_stack,
+		.nr = 0,
+		.is_kmalloc = false,
+	};
 	int ret, i;
-	size_t nr_pinned = 0;
 
 	if (unlikely(flags))
 		return -EINVAL;
@@ -938,15 +948,14 @@ SYSCALL_DEFINE4(cpu_opv, struct cpu_op __user *, ucpuopv, int, cpuopcnt,
 	ret = cpu_opv_check(cpuopv, cpuopcnt);
 	if (ret)
 		return ret;
-	ret = cpu_opv_pin_pages(cpuopv, cpuopcnt,
-			&pinned_pages, &nr_pinned);
+	ret = cpu_opv_pin_pages(cpuopv, cpuopcnt, &pin_pages);
 	if (ret)
 		goto end;
 	ret = do_cpu_opv(cpuopv, cpuopcnt, cpu);
-	for (i = 0; i < nr_pinned; i++)
-		put_page(pinned_pages[i]);
+	for (i = 0; i < pin_pages.nr; i++)
+		put_page(pin_pages.pages[i]);
 end:
-	if (pinned_pages != pinned_pages_on_stack)
-		kfree(pinned_pages);
+	if (pin_pages.is_kmalloc)
+		kfree(pin_pages.pages);
 	return ret;
 }