RE: [PATCH v8 01/33] x86/traps: let common_interrupt() handle IRQ_MOVE_CLEANUP_VECTOR

From: Li, Xin3
Date: Tue Jun 06 2023 - 19:18:59 EST


> The untested below should do the trick. Wants to be split in several patches, but
> you get the idea.

I will continue the work from what you posted. Thanks a lot!

Xin




> ---
> Subject: x86/vector: Get rid of IRQ_MOVE_CLEANUP_VECTOR
> From: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
>
> No point to waste a vector for cleaning up the leftovers of a moved interrupt.
> Aside of that this must be the lowest priority of all vectors which makes FRED
> systems utilizing vectors 0x10-0x1f more complicated than necessary.
>
> Schedule a timer instead.
>
> Not-Yet-Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> ---
> arch/x86/include/asm/hw_irq.h | 4 -
> arch/x86/include/asm/idtentry.h | 1
> arch/x86/include/asm/irq_vectors.h | 7 ---
> arch/x86/kernel/apic/vector.c | 83 ++++++++++++++++++++++++++----------
> arch/x86/kernel/idt.c | 1
> arch/x86/platform/uv/uv_irq.c | 2
> drivers/iommu/amd/iommu.c | 2
> drivers/iommu/hyperv-iommu.c | 4 -
> drivers/iommu/intel/irq_remapping.c | 2
> 9 files changed, 68 insertions(+), 38 deletions(-)
>
> --- a/arch/x86/include/asm/hw_irq.h
> +++ b/arch/x86/include/asm/hw_irq.h
> @@ -97,10 +97,10 @@ extern struct irq_cfg *irqd_cfg(struct i
>  extern void lock_vector_lock(void);
>  extern void unlock_vector_lock(void);
>  #ifdef	CONFIG_SMP
> -extern void send_cleanup_vector(struct irq_cfg *);
> +extern void vector_schedule_cleanup(struct irq_cfg *);
>  extern void irq_complete_move(struct irq_cfg *cfg);
>  #else
> -static inline void send_cleanup_vector(struct irq_cfg *c) { }
> +static inline void vector_schedule_cleanup(struct irq_cfg *c) { }
>  static inline void irq_complete_move(struct irq_cfg *c) { }
>  #endif
>
> --- a/arch/x86/include/asm/idtentry.h
> +++ b/arch/x86/include/asm/idtentry.h
> @@ -648,7 +648,6 @@ DECLARE_IDTENTRY_SYSVEC(X86_PLATFORM_IPI
>
>  #ifdef CONFIG_SMP
>  DECLARE_IDTENTRY(RESCHEDULE_VECTOR,			sysvec_reschedule_ipi);
> -DECLARE_IDTENTRY_SYSVEC(IRQ_MOVE_CLEANUP_VECTOR,	sysvec_irq_move_cleanup);
>  DECLARE_IDTENTRY_SYSVEC(REBOOT_VECTOR,			sysvec_reboot);
>  DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_SINGLE_VECTOR,	sysvec_call_function_single);
>  DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_VECTOR,		sysvec_call_function);
> --- a/arch/x86/include/asm/irq_vectors.h
> +++ b/arch/x86/include/asm/irq_vectors.h
> @@ -35,13 +35,6 @@
> */
> #define FIRST_EXTERNAL_VECTOR 0x20
>
> -/*
> - * Reserve the lowest usable vector (and hence lowest priority) 0x20 for
> - * triggering cleanup after irq migration. 0x21-0x2f will still be used
> - * for device interrupts.
> - */
> -#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR
> -
> #define IA32_SYSCALL_VECTOR 0x80
>
> /*
> --- a/arch/x86/kernel/apic/vector.c
> +++ b/arch/x86/kernel/apic/vector.c
> @@ -44,7 +44,18 @@ static cpumask_var_t vector_searchmask;
>  static struct irq_chip lapic_controller;
>  static struct irq_matrix *vector_matrix;
>  #ifdef CONFIG_SMP
> -static DEFINE_PER_CPU(struct hlist_head, cleanup_list);
> +
> +static void vector_cleanup_callback(struct timer_list *tmr);
> +
> +struct vector_cleanup {
> +	struct hlist_head	head;
> +	struct timer_list	timer;
> +};
> +
> +static DEFINE_PER_CPU(struct vector_cleanup, vector_cleanup) = {
> +	.head	= HLIST_HEAD_INIT,
> +	.timer	= __TIMER_INITIALIZER(vector_cleanup_callback, TIMER_PINNED),
> +};
>  #endif
>
> void lock_vector_lock(void)
> @@ -843,8 +854,12 @@ void lapic_online(void)
>
> void lapic_offline(void)
> {
> + struct vector_cleanup *cl = this_cpu_ptr(&vector_cleanup);
> +
> lock_vector_lock();
> irq_matrix_offline(vector_matrix);
> + WARN_ON_ONCE(try_to_del_timer_sync(&cl->timer) < 0);
> + WARN_ON_ONCE(!hlist_empty(&cl->head));
> unlock_vector_lock();
> }
>
> @@ -934,62 +949,86 @@ static void free_moved_vector(struct api
> apicd->move_in_progress = 0;
> }
>
> -DEFINE_IDTENTRY_SYSVEC(sysvec_irq_move_cleanup)
> +static void vector_cleanup_callback(struct timer_list *tmr)
> {
> - struct hlist_head *clhead = this_cpu_ptr(&cleanup_list);
> + struct vector_cleanup *cl = container_of(tmr, typeof(*cl), timer);
> struct apic_chip_data *apicd;
> struct hlist_node *tmp;
> + bool rearm = false;
>
> - ack_APIC_irq();
> /* Prevent vectors vanishing under us */
> - raw_spin_lock(&vector_lock);
> + raw_spin_lock_irq(&vector_lock);
>
> - hlist_for_each_entry_safe(apicd, tmp, clhead, clist) {
> + hlist_for_each_entry_safe(apicd, tmp, &cl->head, clist) {
> unsigned int irr, vector = apicd->prev_vector;
>
>  		/*
>  		 * Paranoia: Check if the vector that needs to be cleaned
> -		 * up is registered at the APICs IRR. If so, then this is
> -		 * not the best time to clean it up. Clean it up in the
> -		 * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR
> -		 * to this CPU. IRQ_MOVE_CLEANUP_VECTOR is the lowest
> -		 * priority external vector, so on return from this
> -		 * interrupt the device interrupt will happen first.
> +		 * up is registered at the APICs IRR. That's clearly a
> +		 * hardware issue if the vector arrived on the old target
> +		 * _after_ interrupts were disabled above. Keep @apicd
> +		 * on the list and schedule the timer again to give the CPU
> +		 * a chance to handle the pending interrupt.
>  		 */
>  		irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
>  		if (irr & (1U << (vector % 32))) {
> -			apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
> +			pr_warn_once("Moved interrupt pending in old target APIC %u\n", apicd->irq);
> +			rearm = true;
>  			continue;
>  		}
>  		free_moved_vector(apicd);
>  	}
>
> - raw_spin_unlock(&vector_lock);
> + /*
> + * Must happen under vector_lock to make the timer_pending() check
> + * in __vector_schedule_cleanup() race free against the rearm here.
> + */
> + if (rearm)
> + mod_timer(tmr, jiffies + 1);
> +
> + raw_spin_unlock_irq(&vector_lock);
> }
>
> -static void __send_cleanup_vector(struct apic_chip_data *apicd)
> +static void __vector_schedule_cleanup(struct apic_chip_data *apicd)
> {
> - unsigned int cpu;
> + unsigned int cpu = apicd->prev_cpu;
>
> raw_spin_lock(&vector_lock);
> apicd->move_in_progress = 0;
> - cpu = apicd->prev_cpu;
> if (cpu_online(cpu)) {
> - hlist_add_head(&apicd->clist, per_cpu_ptr(&cleanup_list, cpu));
> - apic->send_IPI(cpu, IRQ_MOVE_CLEANUP_VECTOR);
> + struct vector_cleanup *cl = per_cpu_ptr(&vector_cleanup, cpu);
> +
> + /*
> + * The lockless timer_pending() check is safe here. If it
> + * returns true, then the callback will observe this new
> + * apic data in the hlist as everything is serialized by
> + * vector lock.
> + *
> + * If it returns false then the timer is either not armed
> + * or the other CPU executes the callback, which again
> + * would be blocked on vector lock. Rearming it in the
> + * latter case makes it fire for nothing.
> + *
> + * This is also safe against the callback rearming the timer
> + * because that's serialized via vector lock too.
> + */
> + if (!timer_pending(&cl->timer)) {
> + cl->timer.expires = jiffies + 1;
> + add_timer_on(&cl->timer, cpu);
> + }
> } else {
> apicd->prev_vector = 0;
> }
> raw_spin_unlock(&vector_lock);
> }
>
> -void send_cleanup_vector(struct irq_cfg *cfg)
> +void vector_schedule_cleanup(struct irq_cfg *cfg)
> {
> struct apic_chip_data *apicd;
>
> apicd = container_of(cfg, struct apic_chip_data, hw_irq_cfg);
> if (apicd->move_in_progress)
> - __send_cleanup_vector(apicd);
> + __vector_schedule_cleanup(apicd);
> }
>
>  void irq_complete_move(struct irq_cfg *cfg)
> @@ -1007,7 +1046,7 @@ void irq_complete_move(struct irq_cfg *c
> * on the same CPU.
> */
> if (apicd->cpu == smp_processor_id())
> - __send_cleanup_vector(apicd);
> + __vector_schedule_cleanup(apicd);
> }
>
> /*
> --- a/arch/x86/kernel/idt.c
> +++ b/arch/x86/kernel/idt.c
> @@ -131,7 +131,6 @@ static const __initconst struct idt_data
>  	INTG(RESCHEDULE_VECTOR,			asm_sysvec_reschedule_ipi),
>  	INTG(CALL_FUNCTION_VECTOR,		asm_sysvec_call_function),
>  	INTG(CALL_FUNCTION_SINGLE_VECTOR,	asm_sysvec_call_function_single),
> -	INTG(IRQ_MOVE_CLEANUP_VECTOR,		asm_sysvec_irq_move_cleanup),
>  	INTG(REBOOT_VECTOR,			asm_sysvec_reboot),
> #endif
>
> --- a/arch/x86/platform/uv/uv_irq.c
> +++ b/arch/x86/platform/uv/uv_irq.c
> @@ -58,7 +58,7 @@ uv_set_irq_affinity(struct irq_data *dat
> ret = parent->chip->irq_set_affinity(parent, mask, force);
> if (ret >= 0) {
> uv_program_mmr(cfg, data->chip_data);
> - send_cleanup_vector(cfg);
> + vector_schedule_cleanup(cfg);
> }
>
> return ret;
> --- a/drivers/iommu/amd/iommu.c
> +++ b/drivers/iommu/amd/iommu.c
> @@ -3639,7 +3639,7 @@ static int amd_ir_set_affinity(struct ir
> * at the new destination. So, time to cleanup the previous
> * vector allocation.
> */
> - send_cleanup_vector(cfg);
> + vector_schedule_cleanup(cfg);
>
> return IRQ_SET_MASK_OK_DONE;
> }
> --- a/drivers/iommu/hyperv-iommu.c
> +++ b/drivers/iommu/hyperv-iommu.c
> @@ -51,7 +51,7 @@ static int hyperv_ir_set_affinity(struct
> if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
> return ret;
>
> - send_cleanup_vector(cfg);
> + vector_schedule_cleanup(cfg);
>
> return 0;
> }
> @@ -257,7 +257,7 @@ static int hyperv_root_ir_set_affinity(s
> if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
> return ret;
>
> - send_cleanup_vector(cfg);
> + vector_schedule_cleanup(cfg);
>
> return 0;
> }
> --- a/drivers/iommu/intel/irq_remapping.c
> +++ b/drivers/iommu/intel/irq_remapping.c
> @@ -1180,7 +1180,7 @@ intel_ir_set_affinity(struct irq_data *d
> * at the new destination. So, time to cleanup the previous
> * vector allocation.
> */
> - send_cleanup_vector(cfg);
> + vector_schedule_cleanup(cfg);
>
> return IRQ_SET_MASK_OK_DONE;
> }