Re: [PATCH v2 07/18] x86/sgx: Use a list to track to-be-reclaimed pages during reclaim

From: Dave Hansen
Date: Fri Dec 02 2022 - 17:33:23 EST


On 12/2/22 10:36, Kristen Carlson Accardi wrote:
> From: Sean Christopherson <sean.j.christopherson@xxxxxxxxx>
>
> Change sgx_reclaim_pages() to use a list rather than an array for
> storing the epc_pages which will be reclaimed. This change is needed
> to transition to the LRU implementation for EPC cgroup support.
>
> This change requires keeping track of whether newly recorded
> EPC pages are pages for VA Arrays, or for Enclave data. In addition,
> helper functions are added to move pages from one list to another and
> enforce a consistent queue like behavior for the LRU lists.

Another changelog nit: Please use imperative voice, not passive voice.
Move from:

In addition, helper functions are added

to:

In addition, add helper functions

> diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
> index 4683da9ef4f1..9ee306ac2a8e 100644
> --- a/arch/x86/kernel/cpu/sgx/encl.c
> +++ b/arch/x86/kernel/cpu/sgx/encl.c
> @@ -252,7 +252,7 @@ static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl,
> epc_page = sgx_encl_eldu(&encl->secs, NULL);
> if (IS_ERR(epc_page))
> return ERR_CAST(epc_page);
> - sgx_record_epc_page(epc_page, 0);
> + sgx_record_epc_page(epc_page, SGX_EPC_PAGE_ENCLAVE);
> }

This is one of those patches where the first hunk seems entirely
disconnected from what the changelog led me to expect.

I don't see sgx_reclaim_pages(), lists, or arrays anywhere in it.

If you need to pass additional data down into a function, then do *that*
in a separate patch.

I'm glad it eventually got fixed up, but I never like to see bare
integers that don't have an obvious meaning:

sgx_record_epc_page(epc_page, 0);

Even a trivial:

#define SGX_EPC_PAGE_RECLAIMER_UNTRACKED 0

so that the call site reads:

sgx_record_epc_page(epc_page, SGX_EPC_PAGE_RECLAIMER_UNTRACKED);

would make a *LOT* more sense next to the other callers that do:

sgx_record_epc_page(epc_page, SGX_EPC_PAGE_RECLAIMER_TRACKED);

> epc_page = sgx_encl_eldu(entry, encl->secs.epc_page);
> @@ -260,7 +260,8 @@ static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl,
> return ERR_CAST(epc_page);
>
> encl->secs_child_cnt++;
> - sgx_record_epc_page(entry->epc_page, SGX_EPC_PAGE_RECLAIMER_TRACKED);
> + sgx_record_epc_page(entry->epc_page,
> + (SGX_EPC_PAGE_ENCLAVE | SGX_EPC_PAGE_RECLAIMER_TRACKED));
>
> return entry;
> }
> @@ -1221,7 +1222,7 @@ struct sgx_epc_page *sgx_alloc_va_page(struct sgx_encl *encl, bool reclaim)
> sgx_encl_free_epc_page(epc_page);
> return ERR_PTR(-EFAULT);
> }
> - sgx_record_epc_page(epc_page, 0);
> + sgx_record_epc_page(epc_page, SGX_EPC_PAGE_VERSION_ARRAY);
>
> return epc_page;
> }
> diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
> index aca80a3f38a1..c3a9bffbc37e 100644
> --- a/arch/x86/kernel/cpu/sgx/ioctl.c
> +++ b/arch/x86/kernel/cpu/sgx/ioctl.c
> @@ -114,7 +114,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs)
> encl->attributes = secs->attributes;
> encl->attributes_mask = SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT | SGX_ATTR_KSS;
>
> - sgx_record_epc_page(encl->secs.epc_page, 0);
> + sgx_record_epc_page(encl->secs.epc_page, SGX_EPC_PAGE_ENCLAVE);
>
> /* Set only after completion, as encl->lock has not been taken. */
> set_bit(SGX_ENCL_CREATED, &encl->flags);
> @@ -325,7 +325,8 @@ static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src,
> goto err_out;
> }
>
> - sgx_record_epc_page(encl_page->epc_page, SGX_EPC_PAGE_RECLAIMER_TRACKED);
> + sgx_record_epc_page(encl_page->epc_page,
> + (SGX_EPC_PAGE_ENCLAVE | SGX_EPC_PAGE_RECLAIMER_TRACKED));
> mutex_unlock(&encl->lock);
> mmap_read_unlock(current->mm);
> return ret;
> diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
> index bad72498b0a7..83aaf5cea7b9 100644
> --- a/arch/x86/kernel/cpu/sgx/main.c
> +++ b/arch/x86/kernel/cpu/sgx/main.c
> @@ -288,37 +288,43 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
> */
> static void __sgx_reclaim_pages(void)
> {
> - struct sgx_epc_page *chunk[SGX_NR_TO_SCAN];
> struct sgx_backing backing[SGX_NR_TO_SCAN];
> + struct sgx_epc_page *epc_page, *tmp;
> struct sgx_encl_page *encl_page;
> - struct sgx_epc_page *epc_page;
> pgoff_t page_index;
> - int cnt = 0;
> + LIST_HEAD(iso);
> int ret;
> int i;
>
> spin_lock(&sgx_global_lru.lock);
> for (i = 0; i < SGX_NR_TO_SCAN; i++) {
> - epc_page = sgx_epc_pop_reclaimable(&sgx_global_lru);
> + epc_page = sgx_epc_peek_reclaimable(&sgx_global_lru);
> if (!epc_page)
> break;
>
> encl_page = epc_page->encl_owner;
>
> + if (WARN_ON_ONCE(!(epc_page->flags & SGX_EPC_PAGE_ENCLAVE)))
> + continue;
> +
> if (kref_get_unless_zero(&encl_page->encl->refcount) != 0) {
> epc_page->flags |= SGX_EPC_PAGE_RECLAIM_IN_PROGRESS;
> - chunk[cnt++] = epc_page;
> + list_move_tail(&epc_page->list, &iso);
> } else {
> - /* The owner is freeing the page. No need to add the
> - * page back to the list of reclaimable pages.
> + /* The owner is freeing the page, remove it from the
> + * LRU list
> */
> epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
> + list_del_init(&epc_page->list);
> }
> }
> spin_unlock(&sgx_global_lru.lock);
>
> - for (i = 0; i < cnt; i++) {
> - epc_page = chunk[i];
> + if (list_empty(&iso))
> + return;
> +
> + i = 0;
> + list_for_each_entry_safe(epc_page, tmp, &iso, list) {
> encl_page = epc_page->encl_owner;
>
> if (!sgx_reclaimer_age(epc_page))
> @@ -333,6 +339,7 @@ static void __sgx_reclaim_pages(void)
> goto skip;
> }
>
> + i++;
> encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED;
> mutex_unlock(&encl_page->encl->lock);
> continue;
> @@ -340,31 +347,25 @@ static void __sgx_reclaim_pages(void)
> skip:
> spin_lock(&sgx_global_lru.lock);
> epc_page->flags &= ~SGX_EPC_PAGE_RECLAIM_IN_PROGRESS;
> - sgx_epc_push_reclaimable(&sgx_global_lru, epc_page);
> + sgx_epc_move_reclaimable(&sgx_global_lru, epc_page);
> spin_unlock(&sgx_global_lru.lock);
>
> kref_put(&encl_page->encl->refcount, sgx_encl_release);
> -
> - chunk[i] = NULL;
> }
>
> - for (i = 0; i < cnt; i++) {
> - epc_page = chunk[i];
> - if (epc_page)
> - sgx_reclaimer_block(epc_page);
> - }
> -
> - for (i = 0; i < cnt; i++) {
> - epc_page = chunk[i];
> - if (!epc_page)
> - continue;
> -
> + list_for_each_entry(epc_page, &iso, list)
> + sgx_reclaimer_block(epc_page);
> +
> + i = 0;
> + list_for_each_entry_safe(epc_page, tmp, &iso, list) {
> encl_page = epc_page->encl_owner;
> - sgx_reclaimer_write(epc_page, &backing[i]);
> + sgx_reclaimer_write(epc_page, &backing[i++]);
>
> kref_put(&encl_page->encl->refcount, sgx_encl_release);
> epc_page->flags &= ~(SGX_EPC_PAGE_RECLAIMER_TRACKED |
> - SGX_EPC_PAGE_RECLAIM_IN_PROGRESS);
> + SGX_EPC_PAGE_RECLAIM_IN_PROGRESS |
> + SGX_EPC_PAGE_ENCLAVE |
> + SGX_EPC_PAGE_VERSION_ARRAY);
>
> sgx_free_epc_page(epc_page);
> }
> @@ -505,6 +506,7 @@ struct sgx_epc_page *__sgx_alloc_epc_page(void)
> /**
> * sgx_record_epc_page() - Add a page to the LRU tracking
> * @page: EPC page
> + * @flags: Reclaim flags for the page.
> *
> * Mark a page with the specified flags and add it to the appropriate
> * (un)reclaimable list.
> @@ -535,18 +537,19 @@ void sgx_record_epc_page(struct sgx_epc_page *page, unsigned long flags)
> int sgx_drop_epc_page(struct sgx_epc_page *page)
> {
> spin_lock(&sgx_global_lru.lock);
> - if (page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED) {
> - /* The page is being reclaimed. */
> - if (page->flags & SGX_EPC_PAGE_RECLAIM_IN_PROGRESS) {
> - spin_unlock(&sgx_global_lru.lock);
> - return -EBUSY;
> - }
> -
> - page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
> + if ((page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED) &&
> + (page->flags & SGX_EPC_PAGE_RECLAIM_IN_PROGRESS)) {
> + spin_unlock(&sgx_global_lru.lock);
> + return -EBUSY;
> }
> list_del(&page->list);
> spin_unlock(&sgx_global_lru.lock);
>
> + page->flags &= ~(SGX_EPC_PAGE_RECLAIMER_TRACKED |
> + SGX_EPC_PAGE_RECLAIM_IN_PROGRESS |
> + SGX_EPC_PAGE_ENCLAVE |
> + SGX_EPC_PAGE_VERSION_ARRAY);
> +
> return 0;
> }
>
> diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
> index 37d66bc6ca27..ec8d567cd975 100644
> --- a/arch/x86/kernel/cpu/sgx/sgx.h
> +++ b/arch/x86/kernel/cpu/sgx/sgx.h
> @@ -32,6 +32,8 @@
> #define SGX_EPC_PAGE_KVM_GUEST BIT(2)
> /* page flag to indicate reclaim is in progress */
> #define SGX_EPC_PAGE_RECLAIM_IN_PROGRESS BIT(3)
> +#define SGX_EPC_PAGE_ENCLAVE BIT(4)
> +#define SGX_EPC_PAGE_VERSION_ARRAY BIT(5)

Could you please spend some time to clearly document what each bit means?
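
For example (these are my guesses at the semantics from reading the
series, so treat the comment text as placeholders):

/* page is in use by an enclave (SECS, TCS or regular enclave pages) */
#define SGX_EPC_PAGE_ENCLAVE BIT(4)
/* page is in use as a Version Array (VA) page */
#define SGX_EPC_PAGE_VERSION_ARRAY BIT(5)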

> +static inline void __sgx_epc_page_list_move(struct list_head *list, struct sgx_epc_page *page)
> +{
> + list_move_tail(&page->list, list);
> +}

I'm not sure I get the point of a helper like this. Why not just have
the callers call list_move_tail() directly?
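
In other words, have the call site name the destination list itself.
Untested, but something like this at the sgx_epc_move_reclaimable()
call site:

/* instead of: sgx_epc_move_reclaimable(&sgx_global_lru, epc_page); */
list_move_tail(&epc_page->list, &sgx_global_lru.reclaimable);

That's arguably *more* readable, since it says exactly which list the
page ends up on.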

> /*
> * Must be called with queue lock acquired
> */
> @@ -157,6 +167,38 @@ static inline void sgx_epc_push_unreclaimable(struct sgx_epc_lru_lists *lrus,
> __sgx_epc_page_list_push(&(lrus)->unreclaimable, page);
> }
>
> +/*
> + * Must be called with queue lock acquired
> + */
> +static inline struct sgx_epc_page * __sgx_epc_page_list_peek(struct list_head *list)
> +{
> + struct sgx_epc_page *epc_page;
> +
> + if (list_empty(list))
> + return NULL;
> +
> + epc_page = list_first_entry(list, struct sgx_epc_page, list);
> + return epc_page;
> +}

list_first_entry_or_null() perhaps?
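
That would collapse the whole helper into a one-liner. Untested, but
roughly:

static inline struct sgx_epc_page *__sgx_epc_page_list_peek(struct list_head *list)
{
	return list_first_entry_or_null(list, struct sgx_epc_page, list);
}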

> +static inline struct sgx_epc_page *
> +sgx_epc_peek_reclaimable(struct sgx_epc_lru_lists *lrus)
> +{
> + return __sgx_epc_page_list_peek(&(lrus)->reclaimable);
> +}
> +
> +static inline void sgx_epc_move_reclaimable(struct sgx_epc_lru_lists *lru,
> + struct sgx_epc_page *page)
> +{
> + __sgx_epc_page_list_move(&(lru)->reclaimable, page);
> +}
> +
> +static inline struct sgx_epc_page *
> +sgx_epc_peek_unreclaimable(struct sgx_epc_lru_lists *lrus)
> +{
> + return __sgx_epc_page_list_peek(&(lrus)->unreclaimable);
> +}

In general, I'm not becoming more fond of these helpers as the series
goes along. My worry is that they're an abstraction where we don't
*really* need one; I don't see them growing much functionality in
later patches.

I'll reserve judgement until the end, though.