diff options
Diffstat (limited to 'arch/x86/kernel/cpu')
-rw-r--r-- | arch/x86/kernel/cpu/centaur.c | 27 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/common.c | 63 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/cpuid-deps.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mce/amd.c | 44 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mce/core.c | 188 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mce/dev-mcelog.c | 4 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mce/internal.h | 13 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mce/severity.c | 102 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mshyperv.c | 8 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/resctrl/core.c | 60 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 92 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/resctrl/internal.h | 49 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/resctrl/monitor.c | 16 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/resctrl/rdtgroup.c | 85 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/scattered.c | 2 |
15 files changed, 479 insertions, 276 deletions
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index c5cf336e5077..345f7d905db6 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -65,6 +65,9 @@ static void init_c3(struct cpuinfo_x86 *c) c->x86_cache_alignment = c->x86_clflush_size * 2; set_cpu_cap(c, X86_FEATURE_REP_GOOD); } + + if (c->x86 >= 7) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); } enum { @@ -90,18 +93,15 @@ enum { static void early_init_centaur(struct cpuinfo_x86 *c) { - switch (c->x86) { #ifdef CONFIG_X86_32 - case 5: - /* Emulate MTRRs using Centaur's MCR. */ + /* Emulate MTRRs using Centaur's MCR. */ + if (c->x86 == 5) set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR); - break; #endif - case 6: - if (c->x86_model >= 0xf) - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); - break; - } + if ((c->x86 == 6 && c->x86_model >= 0xf) || + (c->x86 >= 7)) + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_SYSENTER32); #endif @@ -145,9 +145,8 @@ static void init_centaur(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); } - switch (c->x86) { #ifdef CONFIG_X86_32 - case 5: + if (c->x86 == 5) { switch (c->x86_model) { case 4: name = "C6"; @@ -207,12 +206,10 @@ static void init_centaur(struct cpuinfo_x86 *c) c->x86_cache_size = (cc>>24)+(dd>>24); } sprintf(c->x86_model_id, "WinChip %s", name); - break; + } #endif - case 6: + if (c->x86 == 6 || c->x86 >= 7) init_c3(c); - break; - } #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 178499f90366..c51158914ea2 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -23,6 +23,7 @@ #include <linux/syscore_ops.h> #include <linux/pgtable.h> +#include <asm/cmdline.h> #include <asm/stackprotector.h> #include <asm/perf_event.h> #include <asm/mmu_context.h> @@ -1221,6 +1222,59 @@ static void detect_nopl(void) } /* + * We parse cpu parameters early because fpu__init_system() is executed + * before parse_early_param(). + */ +static void __init cpu_parse_early_param(void) +{ + char arg[128]; + char *argptr = arg; + int arglen, res, bit; + +#ifdef CONFIG_X86_32 + if (cmdline_find_option_bool(boot_command_line, "no387")) +#ifdef CONFIG_MATH_EMULATION + setup_clear_cpu_cap(X86_FEATURE_FPU); +#else + pr_err("Option 'no387' required CONFIG_MATH_EMULATION enabled.\n"); +#endif + + if (cmdline_find_option_bool(boot_command_line, "nofxsr")) + setup_clear_cpu_cap(X86_FEATURE_FXSR); +#endif + + if (cmdline_find_option_bool(boot_command_line, "noxsave")) + setup_clear_cpu_cap(X86_FEATURE_XSAVE); + + if (cmdline_find_option_bool(boot_command_line, "noxsaveopt")) + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + + if (cmdline_find_option_bool(boot_command_line, "noxsaves")) + setup_clear_cpu_cap(X86_FEATURE_XSAVES); + + arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg)); + if (arglen <= 0) + return; + + pr_info("Clearing CPUID bits:"); + do { + res = get_option(&argptr, &bit); + if (res == 0 || res == 3) + break; + + /* If the argument was too long, the last bit may be cut off */ + if (res == 1 && arglen >= sizeof(arg)) + break; + + if (bit >= 0 && bit < NCAPINTS * 32) { + pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit)); + setup_clear_cpu_cap(bit); + } + } while (res == 2); + pr_cont("\n"); +} + +/* * Do minimum CPU detection early. * Fields really needed: vendor, cpuid_level, family, model, mask, * cache alignment. @@ -1255,6 +1309,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) get_cpu_cap(c); get_cpu_address_sizes(c); setup_force_cpu_cap(X86_FEATURE_CPUID); + cpu_parse_early_param(); if (this_cpu->c_early_init) this_cpu->c_early_init(c); @@ -1413,15 +1468,7 @@ static void generic_identify(struct cpuinfo_x86 *c) * ESPFIX issue, we can change this. */ #ifdef CONFIG_X86_32 -# ifdef CONFIG_PARAVIRT_XXL - do { - extern void native_iret(void); - if (pv_ops.cpu.iret == native_iret) - set_cpu_bug(c, X86_BUG_ESPFIX); - } while (0); -# else set_cpu_bug(c, X86_BUG_ESPFIX); -# endif #endif } diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index 3cbe24ca80ab..d502241995a3 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -69,6 +69,8 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC }, { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC }, { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL }, + { X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES }, + { X86_FEATURE_PER_THREAD_MBA, X86_FEATURE_MBA }, {} }; diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 99be063fcb1b..0c6b02dd744c 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -132,49 +132,49 @@ static enum smca_bank_types smca_get_bank_type(unsigned int bank) } static struct smca_hwid smca_hwid_mcatypes[] = { - /* { bank_type, hwid_mcatype, xec_bitmap } */ + /* { bank_type, hwid_mcatype } */ /* Reserved type */ - { SMCA_RESERVED, HWID_MCATYPE(0x00, 0x0), 0x0 }, + { SMCA_RESERVED, HWID_MCATYPE(0x00, 0x0) }, /* ZN Core (HWID=0xB0) MCA types */ - { SMCA_LS, HWID_MCATYPE(0xB0, 0x0), 0x1FFFFF }, - { SMCA_LS_V2, HWID_MCATYPE(0xB0, 0x10), 0xFFFFFF }, - { SMCA_IF, HWID_MCATYPE(0xB0, 0x1), 0x3FFF }, - { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2), 0xF }, - { SMCA_DE, HWID_MCATYPE(0xB0, 0x3), 0x1FF }, + { SMCA_LS, HWID_MCATYPE(0xB0, 0x0) }, + { SMCA_LS_V2, HWID_MCATYPE(0xB0, 0x10) }, + { SMCA_IF, HWID_MCATYPE(0xB0, 0x1) }, + { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2) }, + { SMCA_DE, HWID_MCATYPE(0xB0, 0x3) }, /* HWID 0xB0 MCATYPE 0x4 is Reserved */ - { SMCA_EX, HWID_MCATYPE(0xB0, 0x5), 0xFFF }, - { SMCA_FP, HWID_MCATYPE(0xB0, 0x6), 0x7F }, - { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7), 0xFF }, + { SMCA_EX, HWID_MCATYPE(0xB0, 0x5) }, + { SMCA_FP, HWID_MCATYPE(0xB0, 0x6) }, + { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7) }, /* Data Fabric MCA types */ - { SMCA_CS, HWID_MCATYPE(0x2E, 0x0), 0x1FF }, - { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1), 0x1F }, - { SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2), 0x3FFF }, + { SMCA_CS, HWID_MCATYPE(0x2E, 0x0) }, + { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1) }, + { SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2) }, /* Unified Memory Controller MCA type */ - { SMCA_UMC, HWID_MCATYPE(0x96, 0x0), 0xFF }, + { SMCA_UMC, HWID_MCATYPE(0x96, 0x0) }, /* Parameter Block MCA type */ - { SMCA_PB, HWID_MCATYPE(0x05, 0x0), 0x1 }, + { SMCA_PB, HWID_MCATYPE(0x05, 0x0) }, /* Platform Security Processor MCA type */ - { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0), 0x1 }, - { SMCA_PSP_V2, HWID_MCATYPE(0xFF, 0x1), 0x3FFFF }, + { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0) }, + { SMCA_PSP_V2, HWID_MCATYPE(0xFF, 0x1) }, /* System Management Unit MCA type */ - { SMCA_SMU, HWID_MCATYPE(0x01, 0x0), 0x1 }, - { SMCA_SMU_V2, HWID_MCATYPE(0x01, 0x1), 0x7FF }, + { SMCA_SMU, HWID_MCATYPE(0x01, 0x0) }, + { SMCA_SMU_V2, HWID_MCATYPE(0x01, 0x1) }, /* Microprocessor 5 Unit MCA type */ - { SMCA_MP5, HWID_MCATYPE(0x01, 0x2), 0x3FF }, + { SMCA_MP5, HWID_MCATYPE(0x01, 0x2) }, /* Northbridge IO Unit MCA type */ - { SMCA_NBIO, HWID_MCATYPE(0x18, 0x0), 0x1F }, + { SMCA_NBIO, HWID_MCATYPE(0x18, 0x0) }, /* PCI Express Unit MCA type */ - { SMCA_PCIE, HWID_MCATYPE(0x46, 0x0), 0x1F }, + { SMCA_PCIE, HWID_MCATYPE(0x46, 0x0) }, }; struct smca_bank smca_banks[MAX_NR_BANKS]; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index f43a78bde670..1c08cb9eb9f6 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -40,7 +40,6 @@ #include <linux/debugfs.h> #include <linux/irq_work.h> #include <linux/export.h> -#include <linux/jump_label.h> #include <linux/set_memory.h> #include <linux/sync_core.h> #include <linux/task_work.h> @@ -373,42 +372,105 @@ static int msr_to_offset(u32 msr) return -1; } +__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr) +{ + pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n", + (unsigned int)regs->cx, regs->ip, (void *)regs->ip); + + show_stack_regs(regs); + + panic("MCA architectural violation!\n"); + + while (true) + cpu_relax(); + + return true; +} + /* MSR access wrappers used for error injection */ -static u64 mce_rdmsrl(u32 msr) +static noinstr u64 mce_rdmsrl(u32 msr) { - u64 v; + DECLARE_ARGS(val, low, high); if (__this_cpu_read(injectm.finished)) { - int offset = msr_to_offset(msr); + int offset; + u64 ret; + + instrumentation_begin(); + offset = msr_to_offset(msr); if (offset < 0) - return 0; - return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset); - } + ret = 0; + else + ret = *(u64 *)((char *)this_cpu_ptr(&injectm) + offset); - if (rdmsrl_safe(msr, &v)) { - WARN_ONCE(1, "mce: Unable to read MSR 0x%x!\n", msr); - /* - * Return zero in case the access faulted. This should - * not happen normally but can happen if the CPU does - * something weird, or if the code is buggy. - */ - v = 0; + instrumentation_end(); + + return ret; } - return v; + /* + * RDMSR on MCA MSRs should not fault. If they do, this is very much an + * architectural violation and needs to be reported to hw vendor. Panic + * the box to not allow any further progress. + */ + asm volatile("1: rdmsr\n" + "2:\n" + _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_fault) + : EAX_EDX_RET(val, low, high) : "c" (msr)); + + + return EAX_EDX_VAL(val, low, high); +} + +__visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr) +{ + pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", + (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax, + regs->ip, (void *)regs->ip); + + show_stack_regs(regs); + + panic("MCA architectural violation!\n"); + + while (true) + cpu_relax(); + + return true; } -static void mce_wrmsrl(u32 msr, u64 v) +static noinstr void mce_wrmsrl(u32 msr, u64 v) { + u32 low, high; + if (__this_cpu_read(injectm.finished)) { - int offset = msr_to_offset(msr); + int offset; + + instrumentation_begin(); + offset = msr_to_offset(msr); if (offset >= 0) *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v; + + instrumentation_end(); + return; } - wrmsrl(msr, v); + + low = (u32)v; + high = (u32)(v >> 32); + + /* See comment in mce_rdmsrl() */ + asm volatile("1: wrmsr\n" + "2:\n" + _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_fault) + : : "c" (msr), "a"(low), "d" (high) : "memory"); } /* @@ -745,7 +807,7 @@ log_it: goto clear_it; mce_read_aux(&m, i); - m.severity = mce_severity(&m, mca_cfg.tolerant, NULL, false); + m.severity = mce_severity(&m, NULL, mca_cfg.tolerant, NULL, false); /* * Don't get the IP here because it's unlikely to * have anything to do with the actual error location. @@ -794,7 +856,7 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, quirk_no_way_out(i, m, regs); m->bank = i; - if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) { + if (mce_severity(m, regs, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) { mce_read_aux(m, i); *msg = tmp; return 1; @@ -872,7 +934,6 @@ static void mce_reign(void) struct mce *m = NULL; int global_worst = 0; char *msg = NULL; - char *nmsg = NULL; /* * This CPU is the Monarch and the other CPUs have run @@ -880,12 +941,10 @@ static void mce_reign(void) * Grade the severity of the errors of all the CPUs. */ for_each_possible_cpu(cpu) { - int severity = mce_severity(&per_cpu(mces_seen, cpu), - mca_cfg.tolerant, - &nmsg, true); - if (severity > global_worst) { - msg = nmsg; - global_worst = severity; + struct mce *mtmp = &per_cpu(mces_seen, cpu); + + if (mtmp->severity > global_worst) { + global_worst = mtmp->severity; m = &per_cpu(mces_seen, cpu); } } @@ -895,8 +954,11 @@ static void mce_reign(void) * This dumps all the mces in the log buffer and stops the * other CPUs. */ - if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) + if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) { + /* call mce_severity() to get "msg" for panic */ + mce_severity(m, NULL, mca_cfg.tolerant, &msg, true); mce_panic("Fatal machine check", m, msg); + } /* * For UC somewhere we let the CPU who detects it handle it. @@ -1105,7 +1167,7 @@ static noinstr bool mce_check_crashing_cpu(void) return false; } -static void __mc_scan_banks(struct mce *m, struct mce *final, +static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, unsigned long *toclear, unsigned long *valid_banks, int no_way_out, int *worst) { @@ -1140,7 +1202,7 @@ static void __mc_scan_banks(struct mce *m, struct mce *final, /* Set taint even when machine check was not enabled. */ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - severity = mce_severity(m, cfg->tolerant, NULL, true); + severity = mce_severity(m, regs, cfg->tolerant, NULL, true); /* * When machine check was for corrected/deferred handler don't @@ -1188,13 +1250,34 @@ static void kill_me_maybe(struct callback_head *cb) if (!p->mce_ripv) flags |= MF_MUST_KILL; - if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags)) { + if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags) && + !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) { set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); + sync_core(); return; } - pr_err("Memory error not recovered"); - kill_me_now(cb); + if (p->mce_vaddr != (void __user *)-1l) { + force_sig_mceerr(BUS_MCEERR_AR, p->mce_vaddr, PAGE_SHIFT); + } else { + pr_err("Memory error not recovered"); + kill_me_now(cb); + } +} + +static void queue_task_work(struct mce *m, int kill_it) +{ + current->mce_addr = m->addr; + current->mce_kflags = m->kflags; + current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV); + current->mce_whole_page = whole_page(m); + + if (kill_it) + current->mce_kill_me.func = kill_me_now; + else + current->mce_kill_me.func = kill_me_maybe; + + task_work_add(current, ¤t->mce_kill_me, true); } /* @@ -1291,7 +1374,7 @@ noinstr void do_machine_check(struct pt_regs *regs) order = mce_start(&no_way_out); } - __mc_scan_banks(&m, final, toclear, valid_banks, no_way_out, &worst); + __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst); if (!no_way_out) mce_clear_state(toclear); @@ -1313,7 +1396,7 @@ noinstr void do_machine_check(struct pt_regs *regs) * make sure we have the right "msg". */ if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) { - mce_severity(&m, cfg->tolerant, &msg, true); + mce_severity(&m, regs, cfg->tolerant, &msg, true); mce_panic("Local fatal machine check!", &m, msg); } } @@ -1330,25 +1413,16 @@ noinstr void do_machine_check(struct pt_regs *regs) if (worst > 0) irq_work_queue(&mce_irq_work); - mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); - - sync_core(); - if (worst != MCE_AR_SEVERITY && !kill_it) - return; + goto out; /* Fault was in user mode and we need to take some action */ if ((m.cs & 3) == 3) { /* If this triggers there is no way to recover. Die hard. */ BUG_ON(!on_thread_stack() || !user_mode(regs)); - current->mce_addr = m.addr; - current->mce_ripv = !!(m.mcgstatus & MCG_STATUS_RIPV); - current->mce_whole_page = whole_page(&m); - current->mce_kill_me.func = kill_me_maybe; - if (kill_it) - current->mce_kill_me.func = kill_me_now; - task_work_add(current, ¤t->mce_kill_me, true); + queue_task_work(&m, kill_it); + } else { /* * Handle an MCE which has happened in kernel space but from @@ -1363,7 +1437,12 @@ noinstr void do_machine_check(struct pt_regs *regs) if (!fixup_exception(regs, X86_TRAP_MC, 0, 0)) mce_panic("Failed kernel mode recovery", &m, msg); } + + if (m.kflags & MCE_IN_KERNEL_COPYIN) + queue_task_work(&m, kill_it); } +out: + mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); } EXPORT_SYMBOL_GPL(do_machine_check); @@ -1904,6 +1983,8 @@ void (*machine_check_vector)(struct pt_regs *) = unexpected_machine_check; static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) { + bool irq_state; + WARN_ON_ONCE(user_mode(regs)); /* @@ -1914,7 +1995,7 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) mce_check_crashing_cpu()) return; - nmi_enter(); + irq_state = idtentry_enter_nmi(regs); /* * The call targets are marked noinstr, but objtool can't figure * that out because it's an indirect call. Annotate it. @@ -1925,7 +2006,7 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) if (regs->flags & X86_EFLAGS_IF) trace_hardirqs_on_prepare(); instrumentation_end(); - nmi_exit(); + idtentry_exit_nmi(regs, irq_state); } static __always_inline void exc_machine_check_user(struct pt_regs *regs) @@ -2062,7 +2143,7 @@ void mce_disable_bank(int bank) and older. * mce=nobootlog Don't log MCEs from before booting. * mce=bios_cmci_threshold Don't program the CMCI threshold - * mce=recovery force enable memcpy_mcsafe() + * mce=recovery force enable copy_mc_fragile() */ static int __init mcheck_enable(char *str) { @@ -2670,13 +2751,10 @@ static void __init mcheck_debugfs_init(void) static void __init mcheck_debugfs_init(void) { } #endif -DEFINE_STATIC_KEY_FALSE(mcsafe_key); -EXPORT_SYMBOL_GPL(mcsafe_key); - static int __init mcheck_late_init(void) { if (mca_cfg.recovery) - static_branch_inc(&mcsafe_key); + enable_copy_mc_fragile(); mcheck_debugfs_init(); diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c index 03e51053592a..100fbeebdc72 100644 --- a/arch/x86/kernel/cpu/mce/dev-mcelog.c +++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c @@ -67,7 +67,9 @@ static int dev_mce_log(struct notifier_block *nb, unsigned long val, unlock: mutex_unlock(&mce_chrdev_read_mutex); - mce->kflags |= MCE_HANDLED_MCELOG; + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + mce->kflags |= MCE_HANDLED_MCELOG; + return NOTIFY_OK; } diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 6473070b5da4..88dcc79cfb07 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -38,7 +38,8 @@ int mce_gen_pool_add(struct mce *mce); int mce_gen_pool_init(void); struct llist_node *mce_gen_pool_prepare_records(void); -extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp); +extern int (*mce_severity)(struct mce *a, struct pt_regs *regs, + int tolerant, char **msg, bool is_excp); struct dentry *mce_get_debugfs_dir(void); extern mce_banks_t mce_banks_ce_disabled; @@ -185,4 +186,14 @@ extern bool amd_filter_mce(struct mce *m); static inline bool amd_filter_mce(struct mce *m) { return false; }; #endif +__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr); + +__visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr); + #endif /* __X86_MCE_INTERNAL_H__ */ diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index e1da619add19..83df991314c5 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -9,9 +9,14 @@ #include <linux/seq_file.h> #include <linux/init.h> #include <linux/debugfs.h> -#include <asm/mce.h> #include <linux/uaccess.h> +#include <asm/mce.h> +#include <asm/intel-family.h> +#include <asm/traps.h> +#include <asm/insn.h> +#include <asm/insn-eval.h> + #include "internal.h" /* @@ -40,9 +45,14 @@ static struct severity { unsigned char context; unsigned char excp; unsigned char covered; + unsigned char cpu_model; + unsigned char cpu_minstepping; + unsigned char bank_lo, bank_hi; char *msg; } severities[] = { #define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c } +#define BANK_RANGE(l, h) .bank_lo = l, .bank_hi = h +#define MODEL_STEPPING(m, s) .cpu_model = m, .cpu_minstepping = s #define KERNEL .context = IN_KERNEL #define USER .context = IN_USER #define KERNEL_RECOV .context = IN_KERNEL_RECOV @@ -90,14 +100,9 @@ static struct severity { EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0) ), MCESEV( - DEFERRED, "Deferred error", - NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED) - ), - MCESEV( KEEP, "Corrected error", NOSER, BITCLR(MCI_STATUS_UC) ), - /* * known AO MCACODs reported via MCE or CMC: * @@ -113,6 +118,18 @@ static struct severity { AO, "Action optional: last level cache writeback error", SER, MASK(MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB) ), + /* + * Quirk for Skylake/Cascade Lake. Patrol scrubber may be configured + * to report uncorrected errors using CMCI with a special signature. + * UC=0, MSCOD=0x0010, MCACOD=binary(000X 0000 1100 XXXX) reported + * in one of the memory controller banks. + * Set severity to "AO" for same action as normal patrol scrub error. + */ + MCESEV( + AO, "Uncorrected Patrol Scrub Error", + SER, MASK(MCI_STATUS_UC|MCI_ADDR|0xffffeff0, MCI_ADDR|0x001000c0), + MODEL_STEPPING(INTEL_FAM6_SKYLAKE_X, 4), BANK_RANGE(13, 18) + ), /* ignore OVER for UCNA */ MCESEV( @@ -198,6 +215,47 @@ static struct severity { #define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \ (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) +static bool is_copy_from_user(struct pt_regs *regs) +{ + u8 insn_buf[MAX_INSN_SIZE]; + struct insn insn; + unsigned long addr; + + if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip, MAX_INSN_SIZE)) + return false; + + kernel_insn_init(&insn, insn_buf, MAX_INSN_SIZE); + insn_get_opcode(&insn); + if (!insn.opcode.got) + return false; + + switch (insn.opcode.value) { + /* MOV mem,reg */ + case 0x8A: case 0x8B: + /* MOVZ mem,reg */ + case 0xB60F: case 0xB70F: + insn_get_modrm(&insn); + insn_get_sib(&insn); + if (!insn.modrm.got || !insn.sib.got) + return false; + addr = (unsigned long)insn_get_addr_ref(&insn, regs); + break; + /* REP MOVS */ + case 0xA4: case 0xA5: + addr = regs->si; + break; + default: + return false; + } + + if (fault_in_kernel_space(addr)) + return false; + + current->mce_vaddr = (void __user *)addr; + + return true; +} + /* * If mcgstatus indicated that ip/cs on the stack were * no good, then "m->cs" will be zero and we will have @@ -209,15 +267,25 @@ static struct severity { * distinguish an exception taken in user from from one * taken in the kernel. */ -static int error_context(struct mce *m) +static int error_context(struct mce *m, struct pt_regs *regs) { + enum handler_type t; + if ((m->cs & 3) == 3) return IN_USER; + if (!mc_recoverable(m->mcgstatus)) + return IN_KERNEL; - if (mc_recoverable(m->mcgstatus) && ex_has_fault_handler(m->ip)) { + t = ex_get_fault_handler_type(m->ip); + if (t == EX_HANDLER_FAULT) { m->kflags |= MCE_IN_KERNEL_RECOV; return IN_KERNEL_RECOV; } + if (t == EX_HANDLER_UACCESS && regs && is_copy_from_user(regs)) { + m->kflags |= MCE_IN_KERNEL_RECOV; + m->kflags |= MCE_IN_KERNEL_COPYIN; + return IN_KERNEL_RECOV; + } return IN_KERNEL; } @@ -253,9 +321,10 @@ static int mce_severity_amd_smca(struct mce *m, enum context err_ctx) * See AMD Error Scope Hierarchy table in a newer BKDG. For example * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features" */ -static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_excp) +static int mce_severity_amd(struct mce *m, struct pt_regs *regs, int tolerant, + char **msg, bool is_excp) { - enum context ctx = error_context(m); + enum context ctx = error_context(m, regs); /* Processor Context Corrupt, no need to fumble too much, die! */ if (m->status & MCI_STATUS_PCC) @@ -305,10 +374,11 @@ static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_exc return MCE_KEEP_SEVERITY; } -static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_excp) +static int mce_severity_intel(struct mce *m, struct pt_regs *regs, + int tolerant, char **msg, bool is_excp) { enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP); - enum context ctx = error_context(m); + enum context ctx = error_context(m, regs); struct severity *s; for (s = severities;; s++) { @@ -324,6 +394,12 @@ static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_e continue; if (s->excp && excp != s->excp) continue; + if (s->cpu_model && boot_cpu_data.x86_model != s->cpu_model) + continue; + if (s->cpu_minstepping && boot_cpu_data.x86_stepping < s->cpu_minstepping) + continue; + if (s->bank_lo && (m->bank < s->bank_lo || m->bank > s->bank_hi)) + continue; if (msg) *msg = s->msg; s->covered = 1; @@ -336,7 +412,7 @@ static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_e } /* Default to mce_severity_intel */ -int (*mce_severity)(struct mce *m, int tolerant, char **msg, bool is_excp) = +int (*mce_severity)(struct mce *m, struct pt_regs *regs, int tolerant, char **msg, bool is_excp) = mce_severity_intel; void __init mcheck_vendor_init_severity(void) diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 31125448b174..9834a43cd0fa 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -248,7 +248,7 @@ static void __init ms_hyperv_init_platform(void) hv_host_info_edx >> 24, hv_host_info_edx & 0xFFFFFF); } - if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS && + if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS && ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) { x86_platform.calibrate_tsc = hv_get_tsc_khz; x86_platform.calibrate_cpu = hv_get_tsc_khz; @@ -270,7 +270,7 @@ static void __init ms_hyperv_init_platform(void) crash_kexec_post_notifiers = true; #ifdef CONFIG_X86_LOCAL_APIC - if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS && + if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS && ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) { /* * Get the APIC frequency. @@ -296,7 +296,7 @@ static void __init ms_hyperv_init_platform(void) machine_ops.shutdown = hv_machine_shutdown; machine_ops.crash_shutdown = hv_machine_crash_shutdown; #endif - if (ms_hyperv.features & HV_X64_ACCESS_TSC_INVARIANT) { + if (ms_hyperv.features & HV_ACCESS_TSC_INVARIANT) { wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, 0x1); setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); } else { @@ -330,7 +330,7 @@ static void __init ms_hyperv_init_platform(void) alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_hyperv_callback); /* Setup the IDT for reenlightenment notifications */ - if (ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT) { + if (ms_hyperv.features & HV_ACCESS_REENLIGHTENMENT) { alloc_intr_gate(HYPERV_REENLIGHTENMENT_VECTOR, asm_sysvec_hyperv_reenlightenment); } diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 6a9df71c1b9e..e5f4ee8f4c3b 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -168,6 +168,7 @@ struct rdt_resource rdt_resources_all[] = { .name = "MB", .domains = domain_init(RDT_RESOURCE_MBA), .cache_level = 3, + .parse_ctrlval = parse_bw, .format_str = "%d=%*u", .fflags = RFTYPE_RES_MB, }, @@ -254,22 +255,30 @@ static bool __get_mem_config_intel(struct rdt_resource *r) { union cpuid_0x10_3_eax eax; union cpuid_0x10_x_edx edx; - u32 ebx, ecx; + u32 ebx, ecx, max_delay; cpuid_count(0x00000010, 3, &eax.full, &ebx, &ecx, &edx.full); r->num_closid = edx.split.cos_max + 1; - r->membw.max_delay = eax.split.max_delay + 1; + max_delay = eax.split.max_delay + 1; r->default_ctrl = MAX_MBA_BW; + r->membw.arch_needs_linear = true; if (ecx & MBA_IS_LINEAR) { r->membw.delay_linear = true; - r->membw.min_bw = MAX_MBA_BW - r->membw.max_delay; - r->membw.bw_gran = MAX_MBA_BW - r->membw.max_delay; + r->membw.min_bw = MAX_MBA_BW - max_delay; + r->membw.bw_gran = MAX_MBA_BW - max_delay; } else { if (!rdt_get_mb_table(r)) return false; + r->membw.arch_needs_linear = false; } r->data_width = 3; + if (boot_cpu_has(X86_FEATURE_PER_THREAD_MBA)) + r->membw.throttle_mode = THREAD_THROTTLE_PER_THREAD; + else + r->membw.throttle_mode = THREAD_THROTTLE_MAX; + thread_throttle_mode_init(); + r->alloc_capable = true; r->alloc_enabled = true; @@ -288,7 +297,13 @@ static bool __rdt_get_mem_config_amd(struct rdt_resource *r) /* AMD does not use delay */ r->membw.delay_linear = false; + r->membw.arch_needs_linear = false; + /* + * AMD does not use memory delay throttle model to control + * the allocation like Intel does. + */ + r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED; r->membw.min_bw = 0; r->membw.bw_gran = 1; /* Max value is 2048, Data width should be 4 in decimal */ @@ -346,19 +361,6 @@ static void rdt_get_cdp_l2_config(void) rdt_get_cdp_config(RDT_RESOURCE_L2, RDT_RESOURCE_L2CODE); } -static int get_cache_id(int cpu, int level) -{ - struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu); - int i; - - for (i = 0; i < ci->num_leaves; i++) { - if (ci->info_list[i].level == level) - return ci->info_list[i].id; - } - - return -1; -} - static void mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) { @@ -556,13 +558,13 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) */ static void domain_add_cpu(int cpu, struct rdt_resource *r) { - int id = get_cache_id(cpu, r->cache_level); + int id = get_cpu_cacheinfo_id(cpu, r->cache_level); struct list_head *add_pos = NULL; struct rdt_domain *d; d = rdt_find_domain(r, id, &add_pos); if (IS_ERR(d)) { - pr_warn("Could't find cache id for cpu %d\n", cpu); + pr_warn("Couldn't find cache id for CPU %d\n", cpu); return; } @@ -602,12 +604,12 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) static void domain_remove_cpu(int cpu, struct rdt_resource *r) { - int id = get_cache_id(cpu, r->cache_level); + int id = get_cpu_cacheinfo_id(cpu, r->cache_level); struct rdt_domain *d; d = rdt_find_domain(r, id, NULL); if (IS_ERR_OR_NULL(d)) { - pr_warn("Could't find cache id for cpu %d\n", cpu); + pr_warn("Couldn't find cache id for CPU %d\n", cpu); return; } @@ -918,12 +920,12 @@ static __init void rdt_init_res_defs_intel(void) r->rid == RDT_RESOURCE_L3CODE || r->rid == RDT_RESOURCE_L2 || r->rid == RDT_RESOURCE_L2DATA || - r->rid == RDT_RESOURCE_L2CODE) - r->cbm_validate = cbm_validate_intel; - else if (r->rid == RDT_RESOURCE_MBA) { + r->rid == RDT_RESOURCE_L2CODE) { + r->cache.arch_has_sparse_bitmaps = false; + r->cache.arch_has_empty_bitmaps = false; + } else if (r->rid == RDT_RESOURCE_MBA) { r->msr_base = MSR_IA32_MBA_THRTL_BASE; r->msr_update = mba_wrmsr_intel; - r->parse_ctrlval = parse_bw_intel; } } } @@ -938,12 +940,12 @@ static __init void rdt_init_res_defs_amd(void) r->rid == RDT_RESOURCE_L3CODE || r->rid == RDT_RESOURCE_L2 || r->rid == RDT_RESOURCE_L2DATA || - r->rid == RDT_RESOURCE_L2CODE) - r->cbm_validate = cbm_validate_amd; - else if (r->rid == RDT_RESOURCE_MBA) { + r->rid == RDT_RESOURCE_L2CODE) { + r->cache.arch_has_sparse_bitmaps = true; + r->cache.arch_has_empty_bitmaps = true; + } else if (r->rid == RDT_RESOURCE_MBA) { r->msr_base = MSR_IA32_MBA_BW_BASE; r->msr_update = mba_wrmsr_amd; - r->parse_ctrlval = parse_bw_amd; } } } diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 934c8fb8a64a..c877642e8a14 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -23,53 +23,6 @@ /* * Check whether MBA bandwidth percentage value is correct. The value is - * checked against the minimum and maximum bandwidth values specified by - * the hardware. The allocated bandwidth percentage is rounded to the next - * control step available on the hardware. - */ -static bool bw_validate_amd(char *buf, unsigned long *data, - struct rdt_resource *r) -{ - unsigned long bw; - int ret; - - ret = kstrtoul(buf, 10, &bw); - if (ret) { - rdt_last_cmd_printf("Non-decimal digit in MB value %s\n", buf); - return false; - } - - if (bw < r->membw.min_bw || bw > r->default_ctrl) { - rdt_last_cmd_printf("MB value %ld out of range [%d,%d]\n", bw, - r->membw.min_bw, r->default_ctrl); - return false; - } - - *data = roundup(bw, (unsigned long)r->membw.bw_gran); - return true; -} - -int parse_bw_amd(struct rdt_parse_data *data, struct rdt_resource *r, - struct rdt_domain *d) -{ - unsigned long bw_val; - - if (d->have_new_ctrl) { - rdt_last_cmd_printf("Duplicate domain %d\n", d->id); - return -EINVAL; - } - - if (!bw_validate_amd(data->buf, &bw_val, r)) - return -EINVAL; - - d->new_ctrl = bw_val; - d->have_new_ctrl = true; - - return 0; -} - -/* - * Check whether MBA bandwidth percentage value is correct. The value is * checked against the minimum and max bandwidth values specified by the * hardware. The allocated bandwidth percentage is rounded to the next * control step available on the hardware. @@ -82,7 +35,7 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) /* * Only linear delay values is supported for current Intel SKUs. */ - if (!r->membw.delay_linear) { + if (!r->membw.delay_linear && r->membw.arch_needs_linear) { rdt_last_cmd_puts("No support for non-linear MB domains\n"); return false; } @@ -104,8 +57,8 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) return true; } -int parse_bw_intel(struct rdt_parse_data *data, struct rdt_resource *r, - struct rdt_domain *d) +int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r, + struct rdt_domain *d) { unsigned long bw_val; @@ -123,12 +76,14 @@ int parse_bw_intel(struct rdt_parse_data *data, struct rdt_resource *r, } /* - * Check whether a cache bit mask is valid. The SDM says: + * Check whether a cache bit mask is valid. + * For Intel the SDM says: * Please note that all (and only) contiguous '1' combinations * are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.). * Additionally Haswell requires at least two bits set. + * AMD allows non-contiguous bitmasks. */ -bool cbm_validate_intel(char *buf, u32 *data, struct rdt_resource *r) +static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) { unsigned long first_bit, zero_bit, val; unsigned int cbm_len = r->cache.cbm_len; @@ -140,7 +95,8 @@ bool cbm_validate_intel(char *buf, u32 *data, struct rdt_resource *r) return false; } - if (val == 0 || val > r->default_ctrl) { + if ((!r->cache.arch_has_empty_bitmaps && val == 0) || + val > r->default_ctrl) { rdt_last_cmd_puts("Mask out of range\n"); return false; } @@ -148,7 +104,9 @@ bool cbm_validate_intel(char *buf, u32 *data, struct rdt_resource *r) first_bit = find_first_bit(&val, cbm_len); zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); - if (find_next_bit(&val, cbm_len, zero_bit) < cbm_len) { + /* Are non-contiguous bitmaps allowed? */ + if (!r->cache.arch_has_sparse_bitmaps && + (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) { rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val); return false; } @@ -164,30 +122,6 @@ bool cbm_validate_intel(char *buf, u32 *data, struct rdt_resource *r) } /* - * Check whether a cache bit mask is valid. AMD allows non-contiguous - * bitmasks - */ -bool cbm_validate_amd(char *buf, u32 *data, struct rdt_resource *r) -{ - unsigned long val; - int ret; - - ret = kstrtoul(buf, 16, &val); - if (ret) { - rdt_last_cmd_printf("Non-hex character in the mask %s\n", buf); - return false; - } - - if (val > r->default_ctrl) { - rdt_last_cmd_puts("Mask out of range\n"); - return false; - } - - *data = val; - return true; -} - -/* * Read one cache bit mask (hex). Check that it is valid for the current * resource type. */ @@ -212,7 +146,7 @@ int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, return -EINVAL; } - if (!r->cbm_validate(data->buf, &cbm_val, r)) + if (!cbm_validate(data->buf, &cbm_val, r)) return -EINVAL; if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE || diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 5ffa32256b3b..80fa997fae60 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -283,7 +283,6 @@ struct rftype { * struct mbm_state - status for each MBM counter in each domain * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes) * @prev_msr Value of IA32_QM_CTR for this RMID last time we read it - * @chunks_bw Total local data moved. Used for bandwidth calculation * @prev_bw_msr:Value of previous IA32_QM_CTR for bandwidth counting * @prev_bw The most recent bandwidth in MBps * @delta_bw Difference between the current and previous bandwidth @@ -292,7 +291,6 @@ struct rftype { struct mbm_state { u64 chunks; u64 prev_msr; - u64 chunks_bw; u64 prev_bw_msr; u32 prev_bw; u32 delta_bw; @@ -360,6 +358,8 @@ struct msr_param { * in a cache bit mask * @shareable_bits: Bitmask of shareable resource with other * executing entities + * @arch_has_sparse_bitmaps: True if a bitmap like f00f is valid. + * @arch_has_empty_bitmaps: True if the '0' bitmap is valid. */ struct rdt_cache { unsigned int cbm_len; @@ -367,25 +367,43 @@ struct rdt_cache { unsigned int cbm_idx_mult; unsigned int cbm_idx_offset; unsigned int shareable_bits; + bool arch_has_sparse_bitmaps; + bool arch_has_empty_bitmaps; +}; + +/** + * enum membw_throttle_mode - System's memory bandwidth throttling mode + * @THREAD_THROTTLE_UNDEFINED: Not relevant to the system + * @THREAD_THROTTLE_MAX: Memory bandwidth is throttled at the core + * always using smallest bandwidth percentage + * assigned to threads, aka "max throttling" + * @THREAD_THROTTLE_PER_THREAD: Memory bandwidth is throttled at the thread + */ +enum membw_throttle_mode { + THREAD_THROTTLE_UNDEFINED = 0, + THREAD_THROTTLE_MAX, + THREAD_THROTTLE_PER_THREAD, }; /** * struct rdt_membw - Memory bandwidth allocation related data - * @max_delay: Max throttle delay. Delay is the hardware - * representation for memory bandwidth. * @min_bw: Minimum memory bandwidth percentage user can request * @bw_gran: Granularity at which the memory bandwidth is allocated * @delay_linear: True if memory B/W delay is in linear scale + * @arch_needs_linear: True if we can't configure non-linear resources + * @throttle_mode: Bandwidth throttling mode when threads request + * different memory bandwidths * @mba_sc: True if MBA software controller(mba_sc) is enabled * @mb_map: Mapping of memory B/W percentage to memory B/W delay */ struct rdt_membw { - u32 max_delay; - u32 min_bw; - u32 bw_gran; - u32 delay_linear; - bool mba_sc; - u32 *mb_map; + u32 min_bw; + u32 bw_gran; + u32 delay_linear; + bool arch_needs_linear; + enum membw_throttle_mode throttle_mode; + bool mba_sc; + u32 *mb_map; }; static inline bool is_llc_occupancy_enabled(void) @@ -437,7 +455,6 @@ struct rdt_parse_data { * @cache: Cache allocation related data * @format_str: Per resource format string to show domain value * @parse_ctrlval: Per resource function pointer to parse control values - * @cbm_validate Cache bitmask validate function * @evt_list: List of monitoring events * @num_rmid: Number of RMIDs available * @mon_scale: cqm counter * mon_scale = occupancy in bytes @@ -464,7 +481,6 @@ struct rdt_resource { int (*parse_ctrlval)(struct rdt_parse_data *data, struct rdt_resource *r, struct rdt_domain *d); - bool (*cbm_validate)(char *buf, u32 *data, struct rdt_resource *r); struct list_head evt_list; int num_rmid; unsigned int mon_scale; @@ -474,10 +490,8 @@ struct rdt_resource { int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, struct rdt_domain *d); -int parse_bw_intel(struct rdt_parse_data *data, struct rdt_resource *r, - struct rdt_domain *d); -int parse_bw_amd(struct rdt_parse_data *data, struct rdt_resource *r, - struct rdt_domain *d); +int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r, + struct rdt_domain *d); extern struct mutex rdtgroup_mutex; @@ -609,8 +623,7 @@ void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms); void cqm_handle_limbo(struct work_struct *work); bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); void __check_limbo(struct rdt_domain *d, bool force_free); -bool cbm_validate_intel(char *buf, u32 *data, struct rdt_resource *r); -bool cbm_validate_amd(char *buf, u32 *data, struct rdt_resource *r); void rdt_domain_reconfigure_cdp(struct rdt_resource *r); +void __init thread_throttle_mode_init(void); #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 837d7d012b7b..54dffe574e67 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -279,8 +279,7 @@ static void mbm_bw_count(u32 rmid, struct rmid_read *rr) return; chunks = mbm_overflow_count(m->prev_bw_msr, tval, rr->r->mbm_width); - m->chunks_bw += chunks; - m->chunks = m->chunks_bw; + m->chunks += chunks; cur_bw = (chunks * r->mon_scale) >> 20; if (m->delta_comp) @@ -478,19 +477,13 @@ void cqm_handle_limbo(struct work_struct *work) mutex_lock(&rdtgroup_mutex); r = &rdt_resources_all[RDT_RESOURCE_L3]; - d = get_domain_from_cpu(cpu, r); - - if (!d) { - pr_warn_once("Failure to get domain for limbo worker\n"); - goto out_unlock; - } + d = container_of(work, struct rdt_domain, cqm_limbo.work); __check_limbo(d, false); if (has_busy_rmid(r, d)) schedule_delayed_work_on(cpu, &d->cqm_limbo, delay); -out_unlock: mutex_unlock(&rdtgroup_mutex); } @@ -520,10 +513,7 @@ void mbm_handle_overflow(struct work_struct *work) goto out_unlock; r = &rdt_resources_all[RDT_RESOURCE_L3]; - - d = get_domain_from_cpu(cpu, r); - if (!d) - goto out_unlock; + d = container_of(work, struct rdt_domain, mbm_over.work); list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { mbm_update(r, d, prgrp->mon.rmid); diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 3f844f14fc0a..b494187632b2 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -592,6 +592,18 @@ static int __rdtgroup_move_task(struct task_struct *tsk, return ret; } +static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) +{ + return (rdt_alloc_capable && + (r->type == RDTCTRL_GROUP) && (t->closid == r->closid)); +} + +static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) +{ + return (rdt_mon_capable && + (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid)); +} + /** * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group * @r: Resource group @@ -607,8 +619,7 @@ int rdtgroup_tasks_assigned(struct rdtgroup *r) rcu_read_lock(); for_each_process_thread(p, t) { - if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) || - (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid)) { + if (is_closid_match(t, r) || is_rmid_match(t, r)) { ret = 1; break; } @@ -706,8 +717,7 @@ static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) rcu_read_lock(); for_each_process_thread(p, t) { - if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) || - (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid)) + if (is_closid_match(t, r) || is_rmid_match(t, r)) seq_printf(s, "%d\n", t->pid); } rcu_read_unlock(); @@ -1017,6 +1027,19 @@ static int max_threshold_occ_show(struct kernfs_open_file *of, return 0; } +static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + if (r->membw.throttle_mode == THREAD_THROTTLE_PER_THREAD) + seq_puts(seq, "per-thread\n"); + else + seq_puts(seq, "max\n"); + + return 0; +} + static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { @@ -1513,6 +1536,17 @@ static struct rftype res_common_files[] = { .seq_show = rdt_delay_linear_show, .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, }, + /* + * Platform specific which (if any) capabilities are provided by + * thread_throttle_mode. Defer "fflags" initialization to platform + * discovery. + */ + { + .name = "thread_throttle_mode", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_thread_throttle_mode_show, + }, { .name = "max_threshold_occupancy", .mode = 0644, @@ -1583,7 +1617,7 @@ static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) lockdep_assert_held(&rdtgroup_mutex); for (rft = rfts; rft < rfts + len; rft++) { - if ((fflags & rft->fflags) == rft->fflags) { + if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) { ret = rdtgroup_add_file(kn, rft); if (ret) goto error; @@ -1600,6 +1634,33 @@ error: return ret; } +static struct rftype *rdtgroup_get_rftype_by_name(const char *name) +{ + struct rftype *rfts, *rft; + int len; + + rfts = res_common_files; + len = ARRAY_SIZE(res_common_files); + + for (rft = rfts; rft < rfts + len; rft++) { + if (!strcmp(rft->name, name)) + return rft; + } + + return NULL; +} + +void __init thread_throttle_mode_init(void) +{ + struct rftype *rft; + + rft = rdtgroup_get_rftype_by_name("thread_throttle_mode"); + if (!rft) + return; + + rft->fflags = RF_CTRL_INFO | RFTYPE_RES_MB; +} + /** * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file * @r: The resource group with which the file is associated. @@ -2245,18 +2306,6 @@ static int reset_all_ctrls(struct rdt_resource *r) return 0; } -static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) -{ - return (rdt_alloc_capable && - (r->type == RDTCTRL_GROUP) && (t->closid == r->closid)); -} - -static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) -{ - return (rdt_mon_capable && - (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid)); -} - /* * Move tasks from one to the other group. If @from is NULL, then all tasks * in the systems are moved unconditionally (used for teardown). @@ -3196,7 +3245,7 @@ int __init rdtgroup_init(void) * It may also be ok since that would enable debugging of RDT before * resctrl is mounted. * The reason why the debugfs directory is created here and not in - * rdt_mount() is because rdt_mount() takes rdtgroup_mutex and + * rdt_get_tree() is because rdt_get_tree() takes rdtgroup_mutex and * during the debugfs directory creation also &sb->s_type->i_mutex_key * (the lockdep class of inode->i_rwsem). Other filesystem * interactions (eg. SyS_getdents) have the lock ordering: diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 62b137c3c97a..2eb0a8c44b35 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -35,12 +35,14 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 }, { X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 }, { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 }, + { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 }, { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, { X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 }, { X86_FEATURE_SEV, CPUID_EAX, 1, 0x8000001f, 0 }, + { X86_FEATURE_SME_COHERENT, CPUID_EAX, 10, 0x8000001f, 0 }, { 0, 0, 0, 0, 0 } }; |