Re: [PATCH] mm: fix unsafe page -> lruvec lookups with cgroup charge migration

From: Shakeel Butt
Date: Wed Nov 20 2019 - 15:31:21 EST


On Wed, Nov 20, 2019 at 8:58 AM Johannes Weiner <hannes@xxxxxxxxxxx> wrote:
>
> While reviewing the "per lruvec lru_lock for memcg" series, Hugh and I
> noticed two places in the existing code where the page -> memcg ->
> lruvec lookup can result in a use-after-free bug. This affects cgroup1
> setups that have charge migration enabled.
>
> To pin page->mem_cgroup, callers need to either have the page locked,
> an exclusive refcount (0), or hold the lru_lock and "own" PageLRU
> (either ensure it's set, or be the one to hold the page in isolation)
> to make cgroup migration fail the isolation step.

I think we should add the above para in the comments for better visibility.

> Failure to follow
> this can result in the page moving out of the memcg and freeing it,
> along with its lruvecs, while the observer is dereferencing them.
>
> 1. isolate_lru_page() calls mem_cgroup_page_lruvec() with the lru_lock
> held but before testing PageLRU. It doesn't dereference the returned
> lruvec before testing PageLRU, giving the impression that it might
> just be safe ordering after all - but mem_cgroup_page_lruvec() itself
> touches the lruvec to lazily initialize the pgdat back pointer. This
> one is easy to fix, just move the lookup into the PageLRU branch.
>
> 2. pagevec_lru_move_fn() conveniently looks up the lruvec for all the
> callbacks it might get invoked on. Unfortunately, it's the callbacks
> that first check PageLRU under the lru_lock, which makes this order
> equally unsafe as isolate_lru_page(). Remove the lruvec argument from
> the move callbacks and let them do it inside their PageLRU branches.
>
> Reported-by: Hugh Dickins <hughd@xxxxxxxxxx>
> Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>

Reviewed-by: Shakeel Butt <shakeelb@xxxxxxxxxx>

> ---
> mm/swap.c | 48 +++++++++++++++++++++++++++++-------------------
> mm/vmscan.c | 8 ++++----
> 2 files changed, 33 insertions(+), 23 deletions(-)
>
> diff --git a/mm/swap.c b/mm/swap.c
> index 5341ae93861f..6b015e9532fb 100644
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -188,12 +188,11 @@ int get_kernel_page(unsigned long start, int write, struct page **pages)
> EXPORT_SYMBOL_GPL(get_kernel_page);
>
> static void pagevec_lru_move_fn(struct pagevec *pvec,
> - void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
> + void (*move_fn)(struct page *page, void *arg),
> void *arg)
> {
> int i;
> struct pglist_data *pgdat = NULL;
> - struct lruvec *lruvec;
> unsigned long flags = 0;
>
> for (i = 0; i < pagevec_count(pvec); i++) {
> @@ -207,8 +206,7 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
> spin_lock_irqsave(&pgdat->lru_lock, flags);
> }
>
> - lruvec = mem_cgroup_page_lruvec(page, pgdat);
> - (*move_fn)(page, lruvec, arg);
> + (*move_fn)(page, arg);
> }
> if (pgdat)
> spin_unlock_irqrestore(&pgdat->lru_lock, flags);
> @@ -216,12 +214,14 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
> pagevec_reinit(pvec);
> }
>
> -static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
> - void *arg)
> +static void pagevec_move_tail_fn(struct page *page, void *arg)
> {
> int *pgmoved = arg;
>
> if (PageLRU(page) && !PageUnevictable(page)) {
> + struct lruvec *lruvec;
> +
> + lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
> del_page_from_lru_list(page, lruvec, page_lru(page));
> ClearPageActive(page);
> add_page_to_lru_list_tail(page, lruvec, page_lru(page));
> @@ -272,12 +272,14 @@ static void update_page_reclaim_stat(struct lruvec *lruvec,
> reclaim_stat->recent_rotated[file]++;
> }
>
> -static void __activate_page(struct page *page, struct lruvec *lruvec,
> - void *arg)
> +static void __activate_page(struct page *page, void *arg)
> {
> if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
> int file = page_is_file_cache(page);
> int lru = page_lru_base_type(page);
> + struct lruvec *lruvec;
> +
> + lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
>
> del_page_from_lru_list(page, lruvec, lru);
> SetPageActive(page);
> @@ -328,7 +330,7 @@ void activate_page(struct page *page)
>
> page = compound_head(page);
> spin_lock_irq(&pgdat->lru_lock);
> - __activate_page(page, mem_cgroup_page_lruvec(page, pgdat), NULL);
> + __activate_page(page, NULL);
> spin_unlock_irq(&pgdat->lru_lock);
> }
> #endif
> @@ -498,9 +500,9 @@ void lru_cache_add_active_or_unevictable(struct page *page,
> * be write it out by flusher threads as this is much more effective
> * than the single-page writeout from reclaim.
> */
> -static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
> - void *arg)
> +static void lru_deactivate_file_fn(struct page *page, void *arg)
> {
> + struct lruvec *lruvec;
> int lru, file;
> bool active;
>
> @@ -518,6 +520,8 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
> file = page_is_file_cache(page);
> lru = page_lru_base_type(page);
>
> + lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
> +
> del_page_from_lru_list(page, lruvec, lru + active);
> ClearPageActive(page);
> ClearPageReferenced(page);
> @@ -544,12 +548,14 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
> update_page_reclaim_stat(lruvec, file, 0);
> }
>
> -static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
> - void *arg)
> +static void lru_deactivate_fn(struct page *page, void *arg)
> {
> if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
> int file = page_is_file_cache(page);
> int lru = page_lru_base_type(page);
> + struct lruvec *lruvec;
> +
> + lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
>
> del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
> ClearPageActive(page);
> @@ -561,12 +567,14 @@ static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
> }
> }
>
> -static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
> - void *arg)
> +static void lru_lazyfree_fn(struct page *page, void *arg)
> {
> if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
> !PageSwapCache(page) && !PageUnevictable(page)) {
> bool active = PageActive(page);
> + struct lruvec *lruvec;
> +
> + lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
>
> del_page_from_lru_list(page, lruvec,
> LRU_INACTIVE_ANON + active);
> @@ -921,15 +929,17 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
> }
> #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>
> -static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
> - void *arg)
> +static void __pagevec_lru_add_fn(struct page *page, void *arg)
> {
> - enum lru_list lru;
> int was_unevictable = TestClearPageUnevictable(page);
> + struct lruvec *lruvec;
> + enum lru_list lru;
>
> VM_BUG_ON_PAGE(PageLRU(page), page);
> -
> SetPageLRU(page);
> +
> + lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
> +
> /*
> * Page becomes evictable in two ways:
> * 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()].
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index df859b1d583c..3c8b81990146 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1767,15 +1767,15 @@ int isolate_lru_page(struct page *page)
>
> if (PageLRU(page)) {
> pg_data_t *pgdat = page_pgdat(page);
> - struct lruvec *lruvec;
>
> spin_lock_irq(&pgdat->lru_lock);
> - lruvec = mem_cgroup_page_lruvec(page, pgdat);
> if (PageLRU(page)) {
> - int lru = page_lru(page);
> + struct lruvec *lruvec;
> +
> + lruvec = mem_cgroup_page_lruvec(page, pgdat);
> get_page(page);
> ClearPageLRU(page);
> - del_page_from_lru_list(page, lruvec, lru);
> + del_page_from_lru_list(page, lruvec, page_lru(page));
> ret = 0;
> }
> spin_unlock_irq(&pgdat->lru_lock);
> --
> 2.24.0
>