Re: [PATCH v7 1/2] x86/split_lock: Rework the initialization flow of split lock detection

From: Xiaoyao Li
Date: Mon Mar 30 2020 - 09:26:33 EST


On 3/29/2020 12:32 AM, Sean Christopherson wrote:
On Wed, Mar 25, 2020 at 11:09:23AM +0800, Xiaoyao Li wrote:
static void __init split_lock_setup(void)
{
+ enum split_lock_detect_state state = sld_warn;
char arg[20];
int i, ret;
- setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT);
- sld_state = sld_warn;
+ if (!split_lock_verify_msr(false)) {
+ pr_info("MSR access failed: Disabled\n");

A few nits on the error handling.

The error message for this is a bit wonky: lots of colons, and it's not
super clear what "Disabled" refers to.

[ 0.000000] x86/split lock detection: MSR access failed: Disabled

Maybe this, so that it reads "split lock detection disabled because the MSR
access failed".

pr_info("Disabled, MSR access failed\n");

And rather than duplicate the error message, maybe use a goto, e.g.

if (!split_lock_verify_msr(false))
goto msr_failed;

...

if (!split_lock_verify_msr(true))
goto msr_failed;


Will do it in the next version.

thanks

+ return;
+ }
ret = cmdline_find_option(boot_command_line, "split_lock_detect",
arg, sizeof(arg));
if (ret >= 0) {
for (i = 0; i < ARRAY_SIZE(sld_options); i++) {
if (match_option(arg, ret, sld_options[i].option)) {
- sld_state = sld_options[i].state;
+ state = sld_options[i].state;
break;
}
}
}
- switch (sld_state) {
+ switch (state) {
case sld_off:
pr_info("disabled\n");
- break;
-
+ return;
case sld_warn:
pr_info("warning about user-space split_locks\n");
break;
-
case sld_fatal:
pr_info("sending SIGBUS on user-space split_locks\n");
break;
}
+
+ if (!split_lock_verify_msr(true)) {
+ pr_info("MSR access failed: Disabled\n");
+ return;
+ }
+
+ sld_state = state;
+ setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT);
}
/*
- * Locking is not required at the moment because only bit 29 of this
- * MSR is implemented and locking would not prevent that the operation
- * of one thread is immediately undone by the sibling thread.
- * Use the "safe" versions of rdmsr/wrmsr here because although code
- * checks CPUID and MSR bits to make sure the TEST_CTRL MSR should
- * exist, there may be glitches in virtualization that leave a guest
- * with an incorrect view of real h/w capabilities.
+ * MSR_TEST_CTRL is per core, but we treat it like a per CPU MSR. Locking
+ * is not implemented as one thread could undo the setting of the other
+ * thread immediately after dropping the lock anyway.
*/
-static bool __sld_msr_set(bool on)
+static void sld_update_msr(bool on)
{
u64 test_ctrl_val;
- if (rdmsrl_safe(MSR_TEST_CTRL, &test_ctrl_val))
- return false;
+ rdmsrl(MSR_TEST_CTRL, test_ctrl_val);
if (on)
test_ctrl_val |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
else
test_ctrl_val &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
- return !wrmsrl_safe(MSR_TEST_CTRL, test_ctrl_val);
+ wrmsrl(MSR_TEST_CTRL, test_ctrl_val);
}
static void split_lock_init(void)
{
- if (sld_state == sld_off)
- return;
-
- if (__sld_msr_set(true))
- return;
-
- /*
- * If this is anything other than the boot-cpu, you've done
- * funny things and you get to keep whatever pieces.
- */
- pr_warn("MSR fail -- disabled\n");
- sld_state = sld_off;
+ split_lock_verify_msr(sld_state != sld_off);

I think it'd be worth a WARN_ON() if this fails with sld_state != off. If
the WRMSR fails, then presumably SLD is off when it's expected to be on.
The implied WARN on the unsafe WRMSR in sld_update_msr() won't fire unless
a task generates an #AC on a non-buggy core and then gets migrated to the
buggy core. Even if the WARNs are redundant, if something is wrong it'd be
a lot easier for a user to triage/debug if there is a WARN in boot as
opposed to a runtime WARN that requires a misbehaving application and
scheduler behavior.


IIUC, you're recommending something like the below?

WARN_ON(!split_lock_verify_msr(sld_state != sld_off) &&
sld_state != sld_off);

}
bool handle_user_split_lock(struct pt_regs *regs, long error_code)
@@ -1071,7 +1083,7 @@ bool handle_user_split_lock(struct pt_regs *regs, long error_code)
* progress and set TIF_SLD so the detection is re-enabled via
* switch_to_sld() when the task is scheduled out.
*/
- __sld_msr_set(false);
+ sld_update_msr(false);
set_tsk_thread_flag(current, TIF_SLD);
return true;
}
@@ -1085,7 +1097,7 @@ bool handle_user_split_lock(struct pt_regs *regs, long error_code)
*/
void switch_to_sld(unsigned long tifn)
{
- __sld_msr_set(!(tifn & _TIF_SLD));
+ sld_update_msr(!(tifn & _TIF_SLD));
}
#define SPLIT_LOCK_CPU(model) {X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY}
--
2.20.1