Re: [PATCH v7 08/15] x86/sgx: Implement EPC reclamation flows for cgroup

From: Jarkko Sakkinen
Date: Mon Jan 22 2024 - 15:29:53 EST


On Mon Jan 22, 2024 at 7:20 PM EET, Haitao Huang wrote:
> From: Kristen Carlson Accardi <kristen@xxxxxxxxxxxxxxx>
>
> Implement the reclamation flow for cgroups, encapsulated in the
> top-level function sgx_epc_cgroup_reclaim_pages(). It performs a
> pre-order walk of the subtree rooted at the given cgroup and calls
> sgx_reclaim_pages() at each node, passing in that node's LRU. It keeps
> track of the total number of pages reclaimed and of the pages left to
> attempt, and stops the walk once the desired number of pages has been
> attempted.
>
> In some contexts, e.g. page fault handling, only asynchronous
> reclamation is allowed. Create a workqueue, a corresponding work item,
> and a work function to support asynchronous reclamation. Both the
> synchronous and asynchronous flows invoke the same top-level reclaim
> function and will be triggered later by sgx_epc_cgroup_try_charge()
> when usage of the cgroup is at or near its limit.
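
A sketch of how I read the intended dispatch from
sgx_epc_cgroup_try_charge(): the helpers are the ones added below in
this patch, but the "reclaim_allowed" flag is illustrative only, since
the try_charge() wiring lands later in the series:

	/*
	 * Illustrative sketch, not code from this patch: choose between
	 * direct reclaim and the per-cgroup work item. "reclaim_allowed"
	 * stands for "the caller may sleep", e.g. false in a fault handler.
	 */
	if (sgx_epc_cgroup_page_counter_read(epc_cg) >=
	    sgx_epc_cgroup_max_pages(epc_cg)) {
		if (reclaim_allowed)
			sgx_epc_cgroup_reclaim_pages(epc_cg->cg);
		else
			queue_work(sgx_epc_cg_wq, &epc_cg->reclaim_work);
	}
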
>
> Co-developed-by: Sean Christopherson <sean.j.christopherson@xxxxxxxxx>
> Signed-off-by: Sean Christopherson <sean.j.christopherson@xxxxxxxxx>
> Signed-off-by: Kristen Carlson Accardi <kristen@xxxxxxxxxxxxxxx>
> Co-developed-by: Haitao Huang <haitao.huang@xxxxxxxxxxxxxxx>
> Signed-off-by: Haitao Huang <haitao.huang@xxxxxxxxxxxxxxx>
> ---
> V7:
> - Split this out from the big patch, #10 in V6. (Dave, Kai)
> ---
> arch/x86/kernel/cpu/sgx/epc_cgroup.c | 174 ++++++++++++++++++++++++++-
> arch/x86/kernel/cpu/sgx/epc_cgroup.h | 5 +-
> 2 files changed, 177 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/sgx/epc_cgroup.c b/arch/x86/kernel/cpu/sgx/epc_cgroup.c
> index 938695816a9e..71570c346d95 100644
> --- a/arch/x86/kernel/cpu/sgx/epc_cgroup.c
> +++ b/arch/x86/kernel/cpu/sgx/epc_cgroup.c
> @@ -7,9 +7,173 @@
>
> static struct sgx_epc_cgroup epc_cg_root;
>
> +static struct workqueue_struct *sgx_epc_cg_wq;
> +
> +static inline u64 sgx_epc_cgroup_page_counter_read(struct sgx_epc_cgroup *epc_cg)
> +{
> + return atomic64_read(&epc_cg->cg->res[MISC_CG_RES_SGX_EPC].usage) / PAGE_SIZE;
> +}
> +
> +static inline u64 sgx_epc_cgroup_max_pages(struct sgx_epc_cgroup *epc_cg)
> +{
> + return READ_ONCE(epc_cg->cg->res[MISC_CG_RES_SGX_EPC].max) / PAGE_SIZE;
> +}
> +
> +/*
> + * Get the lower bound of the limits of a cgroup and its ancestors. Used in
> + * sgx_epc_cgroup_reclaim_work_func() to determine if EPC usage of a cgroup
> + * is over its own limit or any ancestor's, in which case reclamation is
> + * needed.
> + */
> +static inline u64 sgx_epc_cgroup_max_pages_to_root(struct sgx_epc_cgroup *epc_cg)
> +{
> + struct misc_cg *i = epc_cg->cg;
> + u64 m = U64_MAX;
> +
> + while (i) {
> + m = min(m, READ_ONCE(i->res[MISC_CG_RES_SGX_EPC].max));
> + i = misc_cg_parent(i);
> + }
> +
> + return m / PAGE_SIZE;
> +}
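
A concrete example of the walk: with the root unlimited (U64_MAX), a
parent capped at 2 MiB and the child itself capped at 4 MiB, this
returns min(4 MiB, 2 MiB, U64_MAX) / PAGE_SIZE = 512 pages (with 4 KiB
pages), i.e. the tightest ancestor limit wins.
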
> +
> /**
> - * sgx_epc_cgroup_try_charge() - try to charge cgroup for a single EPC page
> + * sgx_epc_cgroup_lru_empty() - check if a cgroup tree has no pages on its LRUs
> + * @root: Root of the tree to check
> *
> + * Return: %true if all cgroups under the specified root have empty LRU lists.
> + * Used to avoid livelocks due to a cgroup having a non-zero charge count but
> + * no pages on its LRUs, e.g. due to a dead enclave waiting to be released or
> + * because all pages in the cgroup are unreclaimable.
> + */
> +bool sgx_epc_cgroup_lru_empty(struct misc_cg *root)
> +{
> + struct cgroup_subsys_state *css_root;
> + struct cgroup_subsys_state *pos;
> + struct sgx_epc_cgroup *epc_cg;
> + bool ret = true;
> +
> + /* The caller must ensure a reference to css_root is held. */
> + css_root = &root->css;
> +
> + rcu_read_lock();
> + css_for_each_descendant_pre(pos, css_root) {
> + if (!css_tryget(pos))
> + break;
> +
> + rcu_read_unlock();
> +
> + epc_cg = sgx_epc_cgroup_from_misc_cg(css_misc(pos));
> +
> + spin_lock(&epc_cg->lru.lock);
> + ret = list_empty(&epc_cg->lru.reclaimable);
> + spin_unlock(&epc_cg->lru.lock);
> +
> + rcu_read_lock();
> + css_put(pos);
> + if (!ret)
> + break;
> + }
> +
> + rcu_read_unlock();
> +
> + return ret;
> +}
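
FWIW, the css_tryget()/css_put() pattern here pins the current position
so the RCU read lock can be dropped around the per-cgroup work; it
matters more in sgx_epc_cgroup_reclaim_pages() below, where
sgx_reclaim_pages() may sleep, but keeping both walks on the same
pattern seems reasonable.
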
> +
> +/**
> + * sgx_epc_cgroup_reclaim_pages() - walk a cgroup tree and scan LRUs to reclaim pages
> + * @root: Root of the tree to start walking
> + *
> + * Return: Number of pages reclaimed.
> + */
> +unsigned int sgx_epc_cgroup_reclaim_pages(struct misc_cg *root)
> +{
> + /*
> + * Attempting to reclaim only a few pages will often fail and is
> + * inefficient, while reclaiming a huge number of pages can result in
> + * soft lockups due to holding various locks for an extended duration.
> + */
> + unsigned int nr_to_scan = SGX_NR_TO_SCAN;
> + struct cgroup_subsys_state *css_root;
> + struct cgroup_subsys_state *pos;
> + struct sgx_epc_cgroup *epc_cg;
> + unsigned int cnt;
> +
> + /* The caller must ensure a reference to css_root is held. */
> + css_root = &root->css;
> +
> + cnt = 0;
> + rcu_read_lock();
> + css_for_each_descendant_pre(pos, css_root) {
> + if (!css_tryget(pos))
> + break;
> + rcu_read_unlock();
> +
> + epc_cg = sgx_epc_cgroup_from_misc_cg(css_misc(pos));
> + cnt += sgx_reclaim_pages(&epc_cg->lru, &nr_to_scan);
> +
> + rcu_read_lock();
> + css_put(pos);
> + if (!nr_to_scan)
> + break;
> + }
> +
> + rcu_read_unlock();
> + return cnt;
> +}
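
Worth spelling out: @nr_to_scan counts down across the whole walk
(sgx_reclaim_pages() takes it by reference), so one call scans at most
SGX_NR_TO_SCAN pages in total. E.g. if the root's LRU yields 10 scanned
pages, only 6 are left for the first descendant, and the walk stops as
soon as the budget hits zero; callers that need more headroom loop, as
the work function below does.
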
> +
> +/*
> + * Scheduled by sgx_epc_cgroup_try_charge() to reclaim pages from the
> + * cgroup when the cgroup is at/near its maximum capacity.
> + */
> +static void sgx_epc_cgroup_reclaim_work_func(struct work_struct *work)
> +{
> + struct sgx_epc_cgroup *epc_cg;
> + u64 cur, max;
> +
> + epc_cg = container_of(work, struct sgx_epc_cgroup, reclaim_work);
> +
> + for (;;) {
> + max = sgx_epc_cgroup_max_pages_to_root(epc_cg);
> +
> + /*
> + * Adjust the limit down by one page, the goal is to free up
> + * pages for fault allocations, not to simply obey the limit.
> + * Conditionally decrementing max also means the cur vs. max
> + * check will correctly handle the case where both are zero.
> + */
> + if (max)
> + max--;
> +
> + /*
> + * Unless the limit is extremely low, in which case forcing
> + * reclaim will likely cause thrashing, force the cgroup to
> + * reclaim at least once if it's operating *near* its maximum
> + * limit by adjusting @max down by half the min reclaim size.
> + * This work function is scheduled by sgx_epc_cgroup_try_charge()
> + * when it cannot directly reclaim due to being in an atomic
> + * context, e.g. EPC allocation in a fault handler. Waiting
> + * to reclaim until the cgroup is actually at its limit is less
> + * performant as it means the faulting task is effectively
> + * blocked until a worker makes its way through the global work
> + * queue.
> + */
> + if (max > SGX_NR_TO_SCAN * 2)
> + max -= (SGX_NR_TO_SCAN / 2);
> +
> + cur = sgx_epc_cgroup_page_counter_read(epc_cg);
> +
> + if (cur <= max || sgx_epc_cgroup_lru_empty(epc_cg->cg))
> + break;
> +
> + /* Keep reclaiming until above condition is met. */
> + sgx_epc_cgroup_reclaim_pages(epc_cg->cg);
> + }
> +}
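
Putting numbers on the two adjustments, assuming SGX_NR_TO_SCAN is 16:
a cgroup limited to 512 pages gets max = 512 - 1 - 8 = 503, so the loop
reclaims until usage drops to 503 pages, leaving a little headroom
below the limit for the faulting allocation. A tiny limit of 24 pages
only gets the single-page decrement (24 - 1 = 23 is not above
SGX_NR_TO_SCAN * 2), so it reclaims down to 23 and avoids thrashing.
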
> +
> +/**
> + * sgx_epc_cgroup_try_charge() - try to charge cgroup for a single EPC page
> * @epc_cg: The EPC cgroup to be charged for the page.
> * Return:
> * * %0 - If successfully charged.
> @@ -43,6 +207,7 @@ static void sgx_epc_cgroup_free(struct misc_cg *cg)
> if (!epc_cg)
> return;
>
> + cancel_work_sync(&epc_cg->reclaim_work);
> kfree(epc_cg);
> }
>
> @@ -55,6 +220,8 @@ const struct misc_res_ops sgx_epc_cgroup_ops = {
>
> static void sgx_epc_misc_init(struct misc_cg *cg, struct sgx_epc_cgroup *epc_cg)
> {
> + sgx_lru_init(&epc_cg->lru);
> + INIT_WORK(&epc_cg->reclaim_work, sgx_epc_cgroup_reclaim_work_func);
> cg->res[MISC_CG_RES_SGX_EPC].priv = epc_cg;
> epc_cg->cg = cg;
> }
> @@ -74,6 +241,11 @@ static int sgx_epc_cgroup_alloc(struct misc_cg *cg)
>
> void sgx_epc_cgroup_init(void)
> {
> + sgx_epc_cg_wq = alloc_workqueue("sgx_epc_cg_wq",
> + WQ_UNBOUND | WQ_FREEZABLE,
> + WQ_UNBOUND_MAX_ACTIVE);
> + BUG_ON(!sgx_epc_cg_wq);
> +
> misc_cg_set_ops(MISC_CG_RES_SGX_EPC, &sgx_epc_cgroup_ops);
> sgx_epc_misc_init(misc_cg_root(), &epc_cg_root);
> }
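
For readers wondering about the flags: WQ_UNBOUND fits since the
reclaim work is long-running and has no CPU locality to preserve, and
WQ_FREEZABLE presumably keeps reclaim from running across suspend,
where EPC contents may be lost.
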
> diff --git a/arch/x86/kernel/cpu/sgx/epc_cgroup.h b/arch/x86/kernel/cpu/sgx/epc_cgroup.h
> index 971df34f27d8..9b77b51a2839 100644
> --- a/arch/x86/kernel/cpu/sgx/epc_cgroup.h
> +++ b/arch/x86/kernel/cpu/sgx/epc_cgroup.h
> @@ -33,7 +33,9 @@ static inline void sgx_epc_cgroup_uncharge(struct sgx_epc_cgroup *epc_cg) { }
> static inline void sgx_epc_cgroup_init(void) { }
> #else
> struct sgx_epc_cgroup {
> - struct misc_cg *cg;
> + struct misc_cg *cg;
> + struct sgx_epc_lru_list lru;
> + struct work_struct reclaim_work;

We don't align fields anywhere else, so this is somewhat inconsistent.

> };
>
> static inline struct sgx_epc_cgroup *sgx_epc_cgroup_from_misc_cg(struct misc_cg *cg)
> @@ -66,6 +68,7 @@ static inline void sgx_put_epc_cg(struct sgx_epc_cgroup *epc_cg)
>
> int sgx_epc_cgroup_try_charge(struct sgx_epc_cgroup *epc_cg);
> void sgx_epc_cgroup_uncharge(struct sgx_epc_cgroup *epc_cg);
> +bool sgx_epc_cgroup_lru_empty(struct misc_cg *root);
> void sgx_epc_cgroup_init(void);
>
> #endif


BR, Jarkko