Re: 2b5d1c29f6c4 ("drm/nouveau/disp: PIOR DP uses GPIO for HPD, not PMGR AUX interrupts")

From: Karol Herbst
Date: Mon Aug 14 2023 - 09:28:07 EST


On Tue, Aug 8, 2023 at 3:47 PM Borislav Petkov <bp@xxxxxxxxx> wrote:
>
> On Tue, Aug 08, 2023 at 12:39:32PM +0200, Karol Herbst wrote:
> > ahh, that would have been good to know :)
>
> Yeah, I didn't see it before - it would only freeze. Only after I added
> the printk you requested.
>
> > Mind figuring out what's exactly NULL inside nvif_object_mthd? Or
> > rather what line `nvif_object_mthd+0x136` belongs to, then it should
> > be easy to figure out what's wrong here.
>
> That looks like this:
>
> ffffffff816ddfee: e8 8d 04 4e 00 callq ffffffff81bbe480 <__memcpy>
> ffffffff816ddff3: 41 8d 56 20 lea 0x20(%r14),%edx
> ffffffff816ddff7: 49 8b 44 24 08 mov 0x8(%r12),%rax
> ffffffff816ddffc: 83 fa 17 cmp $0x17,%edx
> ffffffff816ddfff: 76 7d jbe ffffffff816de07e <nvif_object_mthd+0x1ae>
> ffffffff816de001: 49 39 c4 cmp %rax,%r12
> ffffffff816de004: 74 45 je ffffffff816de04b <nvif_object_mthd+0x17b>
>
> <--- RIP points here.
>
> The 0x20 also fits the deref address: 0000000000000020.
>
> Which means %rax is 0. Yap.
>
> ffffffff816de006: 48 8b 78 20 mov 0x20(%rax),%rdi
> ffffffff816de00a: 4c 89 64 24 10 mov %r12,0x10(%rsp)
> ffffffff816de00f: 48 8b 40 38 mov 0x38(%rax),%rax
> ffffffff816de013: c6 44 24 06 ff movb $0xff,0x6(%rsp)
> ffffffff816de018: 31 c9 xor %ecx,%ecx
> ffffffff816de01a: 48 89 e6 mov %rsp,%rsi
> ffffffff816de01d: 48 8b 40 28 mov 0x28(%rax),%rax
> ffffffff816de021: e8 3a 0c 4f 00 callq ffffffff81bcec60 <__x86_indirect_thunk_array>
>
>
> Now, the preprocessed asm version of nvif/object.c says around here:
>
>
> call memcpy #
> # drivers/gpu/drm/nouveau/nvif/object.c:160: ret = nvif_object_ioctl(object, args, sizeof(*args) + size, NULL);
> leal 32(%r14), %edx #, _108
> # drivers/gpu/drm/nouveau/nvif/object.c:33: struct nvif_client *client = object->client;
> movq 8(%r12), %rax # object_19(D)->client, client
> # drivers/gpu/drm/nouveau/nvif/object.c:38: if (size >= sizeof(*args) && args->v0.version == 0) {
> cmpl $23, %edx #, _108
> jbe .L69 #,
> # drivers/gpu/drm/nouveau/nvif/object.c:39: if (object != &client->object)
> cmpq %rax, %r12 # client, object
> je .L70 #,
> # drivers/gpu/drm/nouveau/nvif/object.c:47: return client->driver->ioctl(client->object.priv, data, size, hack);
> movq 32(%rax), %rdi # client_109->object.priv, client_109->object.priv
>
>
> So I'd say that client is NULL. IINM.
>
>
> movq %r12, 16(%rsp) # object, MEM[(union *)&stack].v0.object
> # drivers/gpu/drm/nouveau/nvif/object.c:47: return client->driver->ioctl(client->object.priv, data, size, hack);
> movq 56(%rax), %rax # client_109->driver, client_109->driver
> # drivers/gpu/drm/nouveau/nvif/object.c:43: args->v0.owner = NVIF_IOCTL_V0_OWNER_ANY;
> movb $-1, 6(%rsp) #, MEM[(union *)&stack].v0.owner
> .L64:
> # drivers/gpu/drm/nouveau/nvif/object.c:47: return client->driver->ioctl(client->object.priv, data, size, hack);
> xorl %ecx, %ecx #
> movq %rsp, %rsi #,
> movq 40(%rax), %rax #, _77->ioctl
> call __x86_indirect_thunk_rax
> # drivers/gpu/drm/nouveau/nvif/object.c:161: memcpy(data, args->mthd.data, size);
>
> > > [ 4.144676] #PF: supervisor read access in kernel mode
> > > [ 4.144676] #PF: error_code(0x0000) - not-present page
> > > [ 4.144676] PGD 0 P4D 0
> > > [ 4.144676] Oops: 0000 [#1] PREEMPT SMP PTI
> > > [ 4.144676] CPU: 2 PID: 1 Comm: swapper/0 Not tainted 6.5.0-rc5-dirty #1
> > > [ 4.144676] Hardware name: Dell Inc. Precision T3600/0PTTT9, BIOS A13 05/11/2014
> > > [ 4.144676] RIP: 0010:nvif_object_mthd+0x136/0x1e0
> > > [ 4.144676] Code: f2 4c 89 ee 48 8d 7c 24 20 66 89 04 24 c6 44 24 18 00 e8 8d 04 4e 00 41 8d 56 20 49 8b 44 24 08 83 fa 17 76 7d 49 39 c4 74 45 <48> 8b 78 20 4c 89 64 24 10 48 8b 40 38 c6 44 24 06 ff 31 c9 48 89
>
> Opcode bytes around RIP look correct too:
>
> ./scripts/decodecode < /tmp/oops
> [ 4.144676] Code: f2 4c 89 ee 48 8d 7c 24 20 66 89 04 24 c6 44 24 18 00 e8 8d 04 4e 00 41 8d 56 20 49 8b 44 24 08 83 fa 17 76 7d 49 39 c4 74 45 <48> 8b 78 20 4c 89 64 24 10 48 8b 40 38 c6 44 24 06 ff 31 c9 48 89
> All code
> ========
> 0: f2 4c 89 ee repnz mov %r13,%rsi
> 4: 48 8d 7c 24 20 lea 0x20(%rsp),%rdi
> 9: 66 89 04 24 mov %ax,(%rsp)
> d: c6 44 24 18 00 movb $0x0,0x18(%rsp)
> 12: e8 8d 04 4e 00 callq 0x4e04a4
> 17: 41 8d 56 20 lea 0x20(%r14),%edx
> 1b: 49 8b 44 24 08 mov 0x8(%r12),%rax
> 20: 83 fa 17 cmp $0x17,%edx
> 23: 76 7d jbe 0xa2
> 25: 49 39 c4 cmp %rax,%r12
> 28: 74 45 je 0x6f
> 2a:* 48 8b 78 20 mov 0x20(%rax),%rdi <-- trapping instruction
> 2e: 4c 89 64 24 10 mov %r12,0x10(%rsp)
> 33: 48 8b 40 38 mov 0x38(%rax),%rax
> 37: c6 44 24 06 ff movb $0xff,0x6(%rsp)
> 3c: 31 c9 xor %ecx,%ecx
> 3e: 48 rex.W
> 3f: 89 .byte 0x89
>

Would you mind compiling your kernel with KASAN enabled to see whether
you hit the same error as the one reported in this thread?

>
> HTH.
>
> --
> Regards/Gruss,
> Boris.
>
> https://people.kernel.org/tglx/notes-about-netiquette
>