Re: [PATCH 1/2] PCI: Correct error reporting with PCIe failed link retraining

From: Pengfei Xu
Date: Thu Mar 14 2024 - 03:24:36 EST


Hi Maciej W. Rozycki,

Greeting!

We tested intel internal Linux next kernel by syzkaller fuzz testing and met
"CPU soft lockup in __run_timers" problem in guest.

Bisect and find that the following patch is the first bad commit.
Check the commit content is same as following patch.

All detailed info: https://github.com/xupengfe/syzkaller_logs/tree/main/240313_174939___run_timers
Syzkaller reproduced code: https://github.com/xupengfe/syzkaller_logs/blob/main/240313_174939___run_timers/repro.c
Syzkaller reproduced steps: https://github.com/xupengfe/syzkaller_logs/blob/main/240313_174939___run_timers/repro.prog
Bisect info: https://github.com/xupengfe/syzkaller_logs/blob/main/240313_174939___run_timers/bisect_info.log
Issue dmesg: https://github.com/xupengfe/syzkaller_logs/blob/main/240313_174939___run_timers/0a53950322bc80aeebf56f5a9d38c847186a082a_dmesg.log
bzImage: https://github.com/xupengfe/syzkaller_logs/raw/main/240313_174939___run_timers/bzImage_v6.8_problem.tar.gz

"
[ 34.599994] hrtimer: interrupt took 63789 ns
[ 102.443957] clocksource: Long readout interval, skipping watchdog check: cs_nsec: 3343956725 wd_nsec: 3343956627
[ 393.043326] watchdog: BUG: soft lockup - CPU#1 stuck for 27s! [repro:2653]
[ 393.043704] Modules linked in:
[ 393.043856] irq event stamp: 17486443
[ 393.044019] hardirqs last enabled at (17486442): [<ffffffff855b0ebe>] irqentry_exit+0x3e/0xa0
[ 393.044462] hardirqs last disabled at (17486443): [<ffffffff855aed94>] sysvec_apic_timer_interrupt+0x14/0xc0
[ 393.044925] softirqs last enabled at (14336234): [<ffffffff8126c828>] __irq_exit_rcu+0xa8/0x110
[ 393.045332] softirqs last disabled at (14336237): [<ffffffff8126c828>] __irq_exit_rcu+0xa8/0x110
[ 393.045740] CPU: 1 PID: 2653 Comm: repro Not tainted 6.8.0-0a53950322bc+ #1
[ 393.046062] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
[ 393.046618] RIP: 0010:preempt_count_sub+0x6c/0x160
[ 393.046856] Code: ed 07 85 c9 75 1b 65 8b 05 d9 7b d2 7e 89 c2 81 e2 ff ff ff 7f 39 da 7c 1b 81 fb fe 00 00 00 76 6b f7 db 65 01 1d bc 7b d2 7e <48> 8b 5d f8 c9 c3 cc cc cc cc e8 d5 36 9d 01 85 c0 74 ed 48 c7 c0
[ 393.047696] RSP: 0018:ffff88806cd09da0 EFLAGS: 00000213
[ 393.047950] RAX: 0000000000000102 RBX: 00000000ffffffff RCX: 0000000000000000
[ 393.048281] RDX: 0000000000000102 RSI: 0000000000000102 RDI: 0000000000000001
[ 393.048611] RBP: ffff88806cd09da8 R08: 0000000000000001 R09: fffffbfff11caa46
[ 393.048943] R10: ffffffff88e55237 R11: 0000000000000001 R12: ffff88806cd32a00
[ 393.049277] R13: ffffffff81516e90 R14: dffffc0000000000 R15: ffff88806cd32a00
[ 393.049611] FS: 0000000000000000(0000) GS:ffff88806cd00000(0000) knlGS:0000000000000000
[ 393.049987] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 393.050261] CR2: 00007febf5bfa838 CR3: 000000000bda4006 CR4: 0000000000770ef0
[ 393.050596] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 393.050928] DR3: 0000000000000000 DR6: 00000000ffff07f0 DR7: 0000000000000400
[ 393.051262] PKRU: 55555554
[ 393.051396] Call Trace:
[ 393.051521] <IRQ>
[ 393.051630] ? show_regs+0xa9/0xc0
[ 393.051805] ? watchdog_timer_fn+0x531/0x6b0
[ 393.052019] ? __pfx_watchdog_timer_fn+0x10/0x10
[ 393.052255] ? __hrtimer_run_queues+0x1a4/0xc00
[ 393.052505] ? __pfx___hrtimer_run_queues+0x10/0x10
[ 393.052770] ? hrtimer_interrupt+0x324/0x7a0
[ 393.052994] ? __sysvec_apic_timer_interrupt+0x105/0x3c0
[ 393.053254] ? sysvec_apic_timer_interrupt+0x4b/0xc0
[ 393.053495] ? asm_sysvec_apic_timer_interrupt+0x1f/0x30
[ 393.053757] ? __pfx_clocksource_watchdog+0x10/0x10
[ 393.054002] ? preempt_count_sub+0x6c/0x160
[ 393.054213] _raw_spin_unlock_irq+0x3c/0x60
[ 393.054423] __run_timers.part.0+0x6c9/0xa40
[ 393.054647] ? __pfx___run_timers.part.0+0x10/0x10
[ 393.054883] ? sysvec_irq_work+0x8b/0x100
[ 393.055087] ? asm_sysvec_irq_work+0x1f/0x30
[ 393.055314] run_timer_softirq+0xbc/0x1c0
[ 393.055517] __do_softirq+0x1cb/0x84a
[ 393.055709] __irq_exit_rcu+0xa8/0x110
[ 393.055899] irq_exit_rcu+0x12/0x30
[ 393.056078] sysvec_apic_timer_interrupt+0xa5/0xc0
[ 393.056314] </IRQ>
[ 393.056422] <TASK>
[ 393.056532] asm_sysvec_apic_timer_interrupt+0x1f/0x30
[ 393.056781] RIP: 0010:__schedule+0xbe/0x30c0
[ 393.056993] Code: c0 e8 e6 a8 fd ff 48 98 48 8d 3c c5 a0 dc 75 86 48 89 fa 48 c1 ea 03 42 80 3c 2a 00 0f 85 81 28 00 00 4c 03 24 c5 a0 dc 75 86 <49> 8d 84 24 58 0a 00 00 48 89 c2 48 89 85 e0 fe ff ff 48 b8 00 00
[ 393.057847] RSP: 0018:ffff888024f3f6e8 EFLAGS: 00000282
[ 393.058101] RAX: 0000000000000001 RBX: 0000000000046380 RCX: 0000000000000001
[ 393.058437] RDX: 1ffffffff0cebb95 RSI: 0000000000000001 RDI: ffffffff8675dca8
[ 393.058771] RBP: ffff888024f3f820 R08: 0000000000000001 R09: 0000000000000002
[ 393.059107] R10: 0000000000406000 R11: 0000000000000001 R12: ffff88806cd46380
[ 393.059442] R13: dffffc0000000000 R14: ffffffff855da674 R15: 0000000000406000
[ 393.059779] ? __cond_resched+0x24/0x30
[ 393.059982] ? __schedule+0x9a/0x30c0
[ 393.060173] ? lockdep_hardirqs_on+0x8a/0x110
[ 393.060408] ? trace_hardirqs_on+0x26/0x120
[ 393.060637] ? __pfx___schedule+0x10/0x10
[ 393.060836] ? __this_cpu_preempt_check+0x21/0x30
[ 393.061068] ? lock_release+0x417/0x7e0
[ 393.061267] ? __this_cpu_preempt_check+0x21/0x30
[ 393.061497] ? lock_is_held_type+0xf0/0x150
[ 393.061705] ? __cond_resched+0x24/0x30
[ 393.061895] preempt_schedule_common+0x4a/0xd0
[ 393.062118] __cond_resched+0x24/0x30
[ 393.062302] unmap_page_range+0xab4/0x3690
[ 393.062530] ? __pfx_unmap_page_range+0x10/0x10
[ 393.062752] ? __this_cpu_preempt_check+0x21/0x30
[ 393.062989] ? uprobe_munmap+0xb0/0x590
[ 393.063186] unmap_single_vma+0x1ac/0x2d0
[ 393.063390] unmap_vmas+0x210/0x470
[ 393.063572] ? __pfx_unmap_vmas+0x10/0x10
[ 393.063769] ? __pfx_lock_release+0x10/0x10
[ 393.063977] ? lock_release+0x417/0x7e0
[ 393.064167] ? __pfx_folio_batch_move_lru+0x10/0x10
[ 393.064410] ? __pfx_lock_release+0x10/0x10
[ 393.064624] ? mlock_drain_local+0x281/0x4b0
[ 393.064844] exit_mmap+0x19b/0xac0
[ 393.065018] ? mark_lock.part.0+0xf3/0x17a0
[ 393.065233] ? __pfx_exit_mmap+0x10/0x10
[ 393.065427] ? __kasan_check_write+0x18/0x20
[ 393.065644] ? __pfx___mutex_unlock_slowpath+0x10/0x10
[ 393.065909] ? mutex_unlock+0x16/0x20
[ 393.066096] __mmput+0xde/0x3e0
[ 393.066256] mmput+0x74/0x90
[ 393.066407] do_exit+0xa59/0x28c0
[ 393.066580] ? lock_release+0x417/0x7e0
[ 393.066778] ? __pfx_lock_release+0x10/0x10
[ 393.066989] ? __pfx_do_exit+0x10/0x10
[ 393.067179] ? __this_cpu_preempt_check+0x21/0x30
[ 393.067410] ? _raw_spin_unlock_irq+0x2c/0x60
[ 393.067630] ? lockdep_hardirqs_on+0x8a/0x110
[ 393.067845] ? _raw_spin_unlock_irq+0x2c/0x60
[ 393.068064] ? trace_hardirqs_on+0x26/0x120
[ 393.068287] do_group_exit+0xe5/0x2c0
[ 393.068489] __x64_sys_exit_group+0x4d/0x60
[ 393.068707] do_syscall_64+0x73/0x150
[ 393.068894] entry_SYSCALL_64_after_hwframe+0x6e/0x76
[ 393.069140] RIP: 0033:0x7febf5b18a4d
[ 393.069319] Code: Unable to access opcode bytes at 0x7febf5b18a23.
"

---

If you don't need the following environment to reproduce the problem or if you
already have one reproduced environment, please ignore the following information.

How to reproduce:
git clone https://gitlab.com/xupengfe/repro_vm_env.git
cd repro_vm_env
tar -xvf repro_vm_env.tar.gz
cd repro_vm_env; ./start3.sh // it needs qemu-system-x86_64 and I used v7.1.0
// start3.sh will load bzImage_2241ab53cbb5cdb08a6b2d4688feb13971058f65 v6.2-rc5 kernel
// You could change the bzImage_xxx as you want
// Maybe you need to remove line "-drive if=pflash,format=raw,readonly=on,file=./OVMF_CODE.fd \" for different qemu version
You could use below command to log in, there is no password for root.
ssh -p 10023 root@localhost

After login vm(virtual machine) successfully, you could transfer reproduced
binary to the vm by below way, and reproduce the problem in vm:
gcc -pthread -o repro repro.c
scp -P 10023 repro root@localhost:/root/

Get the bzImage for target kernel:
Please use target kconfig and copy it to kernel_src/.config
make olddefconfig
make -jx bzImage //x should equal or less than cpu num your pc has

Fill the bzImage file into above start3.sh to load the target kernel in vm.


Tips:
If you already have qemu-system-x86_64, please ignore below info.
If you want to install qemu v7.1.0 version:
git clone https://github.com/qemu/qemu.git
cd qemu
git checkout -f v7.1.0
mkdir build
cd build
yum install -y ninja-build.x86_64
yum -y install libslirp-devel.x86_64
./configure --target-list=x86_64-softmmu --enable-kvm --enable-vnc --enable-gtk --enable-sdl --enable-usb-redir --enable-slirp
make
make install
---

Best Regards,
Thanks!

On 2024-02-10 at 01:43:50 +0000, Maciej W. Rozycki wrote:
> Only return successful completion status from `pcie_failed_link_retrain'
> if retraining has actually been done, preventing excessive delays from
> being triggered at call sites in a hope that communication will finally
> be established with the downstream device where in fact nothing has been
> done about the link in question that would justify such a hope.
>
> Fixes: a89c82249c37 ("PCI: Work around PCIe link training failures")
> Reported-by: Ilpo Järvinen <ilpo.jarvinen@xxxxxxxxxxxxxxx>
> Link: https://lore.kernel.org/r/aa2d1c4e-9961-d54a-00c7-ddf8e858a9b0@xxxxxxxxxxxxxxx/
> Signed-off-by: Maciej W. Rozycki <macro@xxxxxxxxxxx>
> Cc: stable@xxxxxxxxxxxxxxx # v6.5+
> ---
> drivers/pci/quirks.c | 18 +++++++++++-------
> 1 file changed, 11 insertions(+), 7 deletions(-)
>
> linux-pcie-failed-link-retrain-status-fix.diff
> Index: linux-macro/drivers/pci/quirks.c
> ===================================================================
> --- linux-macro.orig/drivers/pci/quirks.c
> +++ linux-macro/drivers/pci/quirks.c
> @@ -74,7 +74,8 @@
> * firmware may have already arranged and lift it with ports that already
> * report their data link being up.
> *
> - * Return TRUE if the link has been successfully retrained, otherwise FALSE.
> + * Return TRUE if the link has been successfully retrained, otherwise FALSE,
> + * also when retraining was not needed in the first place.
> */
> bool pcie_failed_link_retrain(struct pci_dev *dev)
> {
> @@ -83,10 +84,11 @@ bool pcie_failed_link_retrain(struct pci
> {}
> };
> u16 lnksta, lnkctl2;
> + bool ret = false;
>
> if (!pci_is_pcie(dev) || !pcie_downstream_port(dev) ||
> !pcie_cap_has_lnkctl2(dev) || !dev->link_active_reporting)
> - return false;
> + return ret;
>
> pcie_capability_read_word(dev, PCI_EXP_LNKCTL2, &lnkctl2);
> pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &lnksta);
> @@ -98,9 +100,10 @@ bool pcie_failed_link_retrain(struct pci
> lnkctl2 |= PCI_EXP_LNKCTL2_TLS_2_5GT;
> pcie_capability_write_word(dev, PCI_EXP_LNKCTL2, lnkctl2);
>
> - if (pcie_retrain_link(dev, false)) {
> + ret = pcie_retrain_link(dev, false) == 0;
> + if (!ret) {
> pci_info(dev, "retraining failed\n");
> - return false;
> + return ret;
> }
>
> pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &lnksta);
> @@ -117,13 +120,14 @@ bool pcie_failed_link_retrain(struct pci
> lnkctl2 |= lnkcap & PCI_EXP_LNKCAP_SLS;
> pcie_capability_write_word(dev, PCI_EXP_LNKCTL2, lnkctl2);
>
> - if (pcie_retrain_link(dev, false)) {
> + ret = pcie_retrain_link(dev, false) == 0;
> + if (!ret) {
> pci_info(dev, "retraining failed\n");
> - return false;
> + return ret;
> }
> }
>
> - return true;
> + return ret;
> }
>
> static ktime_t fixup_debug_start(struct pci_dev *dev,