Re: [PATCH v4] fs/buffer.c: update per-CPU bh_lru cache via RCU

From: Marcelo Tosatti
Date: Wed May 03 2023 - 09:53:55 EST



Friendly ping?

On Thu, Mar 30, 2023 at 04:27:32PM -0300, Marcelo Tosatti wrote:
>
> For certain types of applications (for example PLC software or
> RAN processing), it is necessary, upon occurrence of an event,
> to complete a certain task within a maximum amount of time (the
> deadline).
>
> One way to express this requirement is with a pair of numbers,
> deadline time and execution time, where:
>
> * deadline time: length of time between the event and the deadline.
> * execution time: length of time it takes for processing of the
>                   event to complete on a particular hardware
>                   platform (uninterrupted).
>
> The particular values depend on the use-case. For the case
> where the realtime application executes in a virtualized
> guest, an IPI which must be serviced on the host CPU running
> the vCPU will cause the following sequence of events:
>
> 1) VM-exit
> 2) execution of IPI (and function call)
> 3) VM-entry
>
> This causes in excess of 50us of added latency, as observed by
> cyclictest (which violates the latency requirement of a vRAN
> application with 1ms TTI, for example).
>
> invalidate_bh_lrus() sends an IPI to each CPU that has a non-empty
> per-CPU cache:
>
> on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
>
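> To illustrate the mechanism being replaced (a generic sketch, not the
> buffer.c code; cpu_has_entries, drain_cpu and nr_entries are made-up
> names): the condition callback runs on the invalidating CPU once per
> online CPU, and the function callback runs in IPI (interrupt) context
> on every CPU for which the condition returned true. That interrupt is
> what forces the VM-exit above when the target CPU is running a vCPU.
>
> #include <linux/percpu.h>
> #include <linux/smp.h>
>
> static DEFINE_PER_CPU(unsigned int, nr_entries);
>
> /* Runs on the CPU issuing the invalidation, once per online CPU. */
> static bool cpu_has_entries(int cpu, void *info)
> {
>         return per_cpu(nr_entries, cpu) != 0;
> }
>
> /* Runs in IPI context on each CPU selected by the condition. */
> static void drain_cpu(void *info)
> {
>         this_cpu_write(nr_entries, 0);
> }
>
> static void drain_all(void)
> {
>         /* Interrupts every CPU whose condition callback returned true. */
>         on_each_cpu_cond(cpu_has_entries, drain_cpu, NULL, 1);
> }
>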
> To avoid the IPI, free the per-CPU caches remotely via RCU.
> Two bh_lru structures are allocated for each CPU: one is in use
> (assigned to the per-CPU bh_lrup pointer), and the other is
> being freed (or idle).
>
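> The idea, in a simplified sketch (illustrative only, with made-up
> names; the actual identifiers in the patch below are bh_lrus, bh_lrup
> and bh_lru_idx): readers only pay for an RCU read-side section on the
> fast path, while the invalidation path swaps the per-CPU pointer to
> the idle copy, waits for readers of the old copy to finish, and then
> empties the old copy without interrupting any CPU.
>
> #include <linux/cpu.h>
> #include <linux/percpu.h>
> #include <linux/rcupdate.h>
> #include <linux/smp.h>
> #include <linux/string.h>
>
> struct pcpu_cache { void *slots[16]; };
>
> /* Two copies per CPU; cache_ptr selects the live one. Boot code is
>  * assumed to point each CPU's cache_ptr at caches[0] initially. */
> static DEFINE_PER_CPU(struct pcpu_cache, caches[2]);
> static DEFINE_PER_CPU(struct pcpu_cache __rcu *, cache_ptr);
> static unsigned int live_idx;
>
> /* Fast path: a plain RCU read section, no lock and no IPI. */
> static void *cache_lookup(int slot)
> {
>         int cpu = get_cpu();
>         struct pcpu_cache *c;
>         void *val;
>
>         rcu_read_lock();
>         c = rcu_dereference(per_cpu(cache_ptr, cpu));
>         val = c->slots[slot];
>         rcu_read_unlock();
>         put_cpu();
>         return val;
> }
>
> /* Invalidation: switch every CPU to the idle copy, wait for readers
>  * of the old copy to finish, then empty the old copy remotely (the
>  * patch drops buffer_head references with brelse() at this point).
>  * Serialization of concurrent invalidators (a mutex in the patch)
>  * is omitted; the patch also uses synchronize_rcu_expedited(). */
> static void cache_invalidate_all(void)
> {
>         int cpu, old = live_idx;
>
>         cpus_read_lock();
>         live_idx = !old;
>         for_each_online_cpu(cpu)
>                 rcu_assign_pointer(per_cpu(cache_ptr, cpu),
>                                    per_cpu_ptr(&caches[live_idx], cpu));
>         synchronize_rcu();
>         for_each_online_cpu(cpu)
>                 memset(per_cpu_ptr(&caches[old], cpu), 0,
>                        sizeof(struct pcpu_cache));
>         cpus_read_unlock();
> }
>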
> An alternative solution would be to protect the fast path
> (__find_get_block) with a per-CPU spinlock, and grab that lock
> from invalidate_bh_lrus() when evaluating whether a given
> CPU's buffer_head cache should be invalidated.
> This solution, however, would slow down the fast path.
>
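> For comparison, a rough sketch of that rejected alternative
> (illustrative only, made-up names): every lookup on the fast path
> would pay for a spin_lock()/spin_unlock() pair even when no
> invalidation is in flight, whereas the RCU read-side primitives used
> above are essentially free for readers.
>
> #include <linux/cpumask.h>
> #include <linux/percpu.h>
> #include <linux/spinlock.h>
> #include <linux/string.h>
>
> struct pcpu_cache { void *slots[16]; };
>
> static DEFINE_PER_CPU(struct pcpu_cache, cache);
> static DEFINE_PER_CPU(spinlock_t, cache_lock);  /* spin_lock_init() at boot */
>
> /* Fast path: a lock/unlock pair on every single lookup. */
> static void *locked_lookup(int slot)
> {
>         spinlock_t *lock = get_cpu_ptr(&cache_lock);
>         void *val;
>
>         spin_lock(lock);
>         val = this_cpu_ptr(&cache)->slots[slot];
>         spin_unlock(lock);
>         put_cpu_ptr(&cache_lock);
>         return val;
> }
>
> /* Invalidation: take each CPU's lock remotely; no IPI is needed, but
>  * the cost has moved into the per-lookup fast path above. */
> static void locked_invalidate_all(void)
> {
>         int cpu;
>
>         for_each_online_cpu(cpu) {
>                 spin_lock(per_cpu_ptr(&cache_lock, cpu));
>                 memset(per_cpu_ptr(&cache, cpu), 0, sizeof(struct pcpu_cache));
>                 spin_unlock(per_cpu_ptr(&cache_lock, cpu));
>         }
> }
>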
> Numbers (16 vCPU guest) for the following test:
>
> for i in `seq 0 50`; do
>         mount -o loop alpine-standard-3.17.1-x86_64.iso /mnt/loop
>         umount /mnt/loop
> done
>
> Where the time measured is the interval between the
> invalidate_bh_lrus() call starting and returning.
>
> Unpatched: average is 2us
> ┌ ┐
> [ 0.0, 2.0) ┤████████████████████████▊ 53
> [ 2.0, 4.0) ┤████████████████████████████████████ 77
> [ 4.0, 6.0) ┤████████▍ 18
> [ 6.0, 8.0) ┤▌ 1
> [ 8.0, 10.0) ┤ 0
> [10.0, 12.0) ┤ 0
> [12.0, 14.0) ┤▌ 1
> [14.0, 16.0) ┤ 0
> [16.0, 18.0) ┤▌ 1
> └ ┘
> Frequency
>
> Patched: average is 16us
>
> ┌ ┐
> [ 0.0, 10.0) ┤██████████████████▍ 35
> [10.0, 20.0) ┤████████████████████████████████████ 69
> [20.0, 30.0) ┤██████████████████▍ 35
> [30.0, 40.0) ┤████▎ 8
> [40.0, 50.0) ┤█▌ 3
> [50.0, 60.0) ┤█▏ 2
> └ ┘
> Frequency
>
> The fact that invalidate_bh_lrus() is now serialized should not be
> an issue, since invalidate_bdev does:
>
> /* Invalidate clean unused buffers and pagecache. */
> void invalidate_bdev(struct block_device *bdev)
> {
>         struct address_space *mapping = bdev->bd_inode->i_mapping;
>
>         if (mapping->nrpages) {
>                 invalidate_bh_lrus();
>                 lru_add_drain_all();    /* make sure all lru add caches are flushed */
>                 invalidate_mapping_pages(mapping, 0, -1);
>         }
> }
>
> Where lru_add_drain_all() is serialized by a single mutex lock
> (and there have been no reported use cases where this
> serialization is an issue).
>
> Regarding scalability, consider the results above: it takes 16us
> to execute invalidate_bh_lrus() on 16 CPUs (of which 8us are taken
> by synchronize_rcu_expedited()), which works out to roughly 500ns
> per CPU. For a system with 1024 CPUs, we can therefore expect
> about 8us + 1024 * 500ns ~= 520us, which seems acceptable.
>
> Signed-off-by: Marcelo Tosatti <mtosatti@xxxxxxxxxx>
>
> ---
>
> v4: improved changelog, no code change (Dave Chinner)
> v3: fix CPU hotplug
> v2: fix sparse warnings (kernel test robot)
>
> diff --git a/fs/buffer.c b/fs/buffer.c
> index 9e1e2add541e..e9b4d579eff0 100644
> --- a/fs/buffer.c
> +++ b/fs/buffer.c
> @@ -1246,7 +1246,21 @@ struct bh_lru {
> struct buffer_head *bhs[BH_LRU_SIZE];
> };
>
> -static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
> +
> +/*
> + * Allocate two bh_lrus structures for each CPU. bh_lrup points to
> + * the one that is currently in use, and the update path does
> + * (assuming the CPU's bh_lrup currently points to bh_lrus[0]):
> + *
> + * cpu->bh_lrup = bh_lrus[1]
> + * synchronize_rcu()
> + * free bh's in bh_lrus[0]
> + */
> +static unsigned int bh_lru_idx;
> +static DEFINE_PER_CPU(struct bh_lru, bh_lrus[2]) = {{{ NULL }}, {{NULL}}};
> +static DEFINE_PER_CPU(struct bh_lru __rcu *, bh_lrup);
> +
> +static DEFINE_MUTEX(bh_lru_invalidate_mutex);
>
> #ifdef CONFIG_SMP
> #define bh_lru_lock() local_irq_disable()
> @@ -1288,16 +1302,19 @@ static void bh_lru_install(struct buffer_head *bh)
> return;
> }
>
> - b = this_cpu_ptr(&bh_lrus);
> + rcu_read_lock();
> + b = rcu_dereference(per_cpu(bh_lrup, smp_processor_id()));
> for (i = 0; i < BH_LRU_SIZE; i++) {
> swap(evictee, b->bhs[i]);
> if (evictee == bh) {
> + rcu_read_unlock();
> bh_lru_unlock();
> return;
> }
> }
>
> get_bh(bh);
> + rcu_read_unlock();
> bh_lru_unlock();
> brelse(evictee);
> }
> @@ -1309,28 +1326,32 @@ static struct buffer_head *
> lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
> {
> struct buffer_head *ret = NULL;
> + struct bh_lru *lru;
> unsigned int i;
>
> check_irqs_on();
> bh_lru_lock();
> + rcu_read_lock();
> +
> + lru = rcu_dereference(per_cpu(bh_lrup, smp_processor_id()));
> for (i = 0; i < BH_LRU_SIZE; i++) {
> - struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
> + struct buffer_head *bh = lru->bhs[i];
>
> if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
> bh->b_size == size) {
> if (i) {
> while (i) {
> - __this_cpu_write(bh_lrus.bhs[i],
> - __this_cpu_read(bh_lrus.bhs[i - 1]));
> + lru->bhs[i] = lru->bhs[i - 1];
> i--;
> }
> - __this_cpu_write(bh_lrus.bhs[0], bh);
> + lru->bhs[0] = bh;
> }
> get_bh(bh);
> ret = bh;
> break;
> }
> }
> + rcu_read_unlock();
> bh_lru_unlock();
> return ret;
> }
> @@ -1424,35 +1445,54 @@ static void __invalidate_bh_lrus(struct bh_lru *b)
> b->bhs[i] = NULL;
> }
> }
> -/*
> - * invalidate_bh_lrus() is called rarely - but not only at unmount.
> - * This doesn't race because it runs in each cpu either in irq
> - * or with preempt disabled.
> - */
> -static void invalidate_bh_lru(void *arg)
> -{
> - struct bh_lru *b = &get_cpu_var(bh_lrus);
> -
> - __invalidate_bh_lrus(b);
> - put_cpu_var(bh_lrus);
> -}
>
> bool has_bh_in_lru(int cpu, void *dummy)
> {
> - struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
> + struct bh_lru *b;
> int i;
> -
> +
> + rcu_read_lock();
> + b = rcu_dereference(per_cpu(bh_lrup, cpu));
> for (i = 0; i < BH_LRU_SIZE; i++) {
> - if (b->bhs[i])
> + if (b->bhs[i]) {
> + rcu_read_unlock();
> return true;
> + }
> }
>
> + rcu_read_unlock();
> return false;
> }
>
> +/*
> + * invalidate_bh_lrus() is called rarely - but not only at unmount.
> + */
> void invalidate_bh_lrus(void)
> {
> - on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
> + int cpu, oidx;
> +
> + mutex_lock(&bh_lru_invalidate_mutex);
> + cpus_read_lock();
> + oidx = bh_lru_idx;
> + bh_lru_idx++;
> + if (bh_lru_idx >= 2)
> + bh_lru_idx = 0;
> +
> + /* Assign the per-CPU bh_lru pointer */
> + for_each_online_cpu(cpu)
> + rcu_assign_pointer(per_cpu(bh_lrup, cpu),
> + per_cpu_ptr(&bh_lrus[bh_lru_idx], cpu));
> + synchronize_rcu_expedited();
> +
> + for_each_online_cpu(cpu) {
> + struct bh_lru *b = per_cpu_ptr(&bh_lrus[oidx], cpu);
> +
> + bh_lru_lock();
> + __invalidate_bh_lrus(b);
> + bh_lru_unlock();
> + }
> + cpus_read_unlock();
> + mutex_unlock(&bh_lru_invalidate_mutex);
> }
> EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
>
> @@ -1465,8 +1505,10 @@ void invalidate_bh_lrus_cpu(void)
> struct bh_lru *b;
>
> bh_lru_lock();
> - b = this_cpu_ptr(&bh_lrus);
> + rcu_read_lock();
> + b = rcu_dereference(per_cpu(bh_lrup, smp_processor_id()));
> __invalidate_bh_lrus(b);
> + rcu_read_unlock();
> bh_lru_unlock();
> }
>
> @@ -2968,15 +3010,25 @@ void free_buffer_head(struct buffer_head *bh)
> }
> EXPORT_SYMBOL(free_buffer_head);
>
> +static int buffer_cpu_online(unsigned int cpu)
> +{
> + rcu_assign_pointer(per_cpu(bh_lrup, cpu),
> + per_cpu_ptr(&bh_lrus[bh_lru_idx], cpu));
> + return 0;
> +}
> +
> static int buffer_exit_cpu_dead(unsigned int cpu)
> {
> int i;
> - struct bh_lru *b = &per_cpu(bh_lrus, cpu);
> + struct bh_lru *b;
>
> + rcu_read_lock();
> + b = rcu_dereference(per_cpu(bh_lrup, cpu));
> for (i = 0; i < BH_LRU_SIZE; i++) {
> brelse(b->bhs[i]);
> b->bhs[i] = NULL;
> }
> + rcu_read_unlock();
> this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
> per_cpu(bh_accounting, cpu).nr = 0;
> return 0;
> @@ -3069,7 +3121,7 @@ EXPORT_SYMBOL(__bh_read_batch);
> void __init buffer_init(void)
> {
> unsigned long nrpages;
> - int ret;
> + int ret, cpu;
>
> bh_cachep = kmem_cache_create("buffer_head",
> sizeof(struct buffer_head), 0,
> @@ -3077,6 +3129,11 @@ void __init buffer_init(void)
> SLAB_MEM_SPREAD),
> NULL);
>
> + cpus_read_lock();
> + for_each_online_cpu(cpu)
> + rcu_assign_pointer(per_cpu(bh_lrup, cpu), per_cpu_ptr(&bh_lrus[0], cpu));
> + cpus_read_unlock();
> +
> /*
> * Limit the bh occupancy to 10% of ZONE_NORMAL
> */
> @@ -3085,4 +3142,7 @@ void __init buffer_init(void)
> ret = cpuhp_setup_state_nocalls(CPUHP_FS_BUFF_DEAD, "fs/buffer:dead",
> NULL, buffer_exit_cpu_dead);
> WARN_ON(ret < 0);
> + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "fs/buffer:online",
> + NULL, buffer_cpu_online);
> + WARN_ON(ret < 0);
> }
>
>