Diffstat (limited to 'arch/x86/kernel'): 56 files changed, 827 insertions(+), 488 deletions(-)
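Most of the churn in this range comes from two related cleanups: APIC IDs are widened to u32 everywhere (with BAD_APICID as the single invalid-value sentinel), and per-CPU topology data is consolidated from scattered cpuinfo_x86 fields (phys_proc_id, cpu_core_id, cpu_die_id, cu_id) and per-CPU variables (cpu_llc_id, cpu_l2c_id) into one c->topo container. As a reading aid, here is a compilable sketch of that container; the field names match the debugfs dump added by this series, but the types and ordering are approximations, not the kernel's actual declaration.

#include <stdio.h>
#include <stdint.h>

typedef uint32_t u32;   /* stand-in for the kernel's fixed-width type */

struct cpu_topology_sketch {
    u32 apicid;          /* real APIC ID read from the local APIC */
    u32 initial_apicid;  /* APIC ID reported by CPUID leaf 1 */
    u32 pkg_id;          /* physical package (socket) ID */
    u32 die_id;          /* die within the package */
    u32 cu_id;           /* AMD fam 15h compute unit, 0xff if unused */
    u32 core_id;         /* core within the package */
    u32 logical_pkg_id;  /* kernel-assigned logical package number */
    u32 logical_die_id;  /* kernel-assigned logical die number */
    u32 llc_id;          /* last-level-cache domain, BAD_APICID if unknown */
    u32 l2c_id;          /* L2 cache domain, BAD_APICID if unknown */
};

int main(void)
{
    printf("sketch size: %zu bytes\n", sizeof(struct cpu_topology_sketch));
    return 0;
}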
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 2a0ea38955df..d0918a75cb00 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -148,6 +148,9 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) pr_debug("Local APIC address 0x%08x\n", madt->address); } + if (madt->flags & ACPI_MADT_PCAT_COMPAT) + legacy_pic_pcat_compat(); + /* ACPI 6.3 and newer support the online capable bit. */ if (acpi_gbl_FADT.header.revision > 6 || (acpi_gbl_FADT.header.revision == 6 && @@ -359,7 +362,7 @@ acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long e } #ifdef CONFIG_X86_64 -static int acpi_wakeup_cpu(int apicid, unsigned long start_ip) +static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip) { /* * Remap mailbox memory only for the first call to acpi_wakeup_cpu(). @@ -856,7 +859,7 @@ int acpi_unmap_cpu(int cpu) set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE); #endif - per_cpu(x86_cpu_to_apicid, cpu) = -1; + per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID; set_cpu_present(cpu, false); num_processors--; diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 356de955e78d..053f6dcc6b2c 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -27,6 +27,7 @@ #define PCI_DEVICE_ID_AMD_1AH_M00H_ROOT 0x153a #define PCI_DEVICE_ID_AMD_1AH_M20H_ROOT 0x1507 #define PCI_DEVICE_ID_AMD_MI200_ROOT 0x14bb +#define PCI_DEVICE_ID_AMD_MI300_ROOT 0x14f8 #define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464 #define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec @@ -43,6 +44,7 @@ #define PCI_DEVICE_ID_AMD_19H_M78H_DF_F4 0x12fc #define PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4 0x12c4 #define PCI_DEVICE_ID_AMD_MI200_DF_F4 0x14d4 +#define PCI_DEVICE_ID_AMD_MI300_DF_F4 0x152c /* Protect the PCI config register pairs used for SMN. 
*/ static DEFINE_MUTEX(smn_mutex); @@ -62,6 +64,7 @@ static const struct pci_device_id amd_root_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_ROOT) }, {} }; @@ -93,6 +96,7 @@ static const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_DF_F3) }, {} }; @@ -112,9 +116,13 @@ static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_DF_F4) }, {} }; @@ -386,7 +394,7 @@ int amd_get_subcaches(int cpu) pci_read_config_dword(link, 0x1d4, &mask); - return (mask >> (4 * cpu_data(cpu).cpu_core_id)) & 0xf; + return (mask >> (4 * cpu_data(cpu).topo.core_id)) & 0xf; } int amd_set_subcaches(int cpu, unsigned long mask) @@ -412,7 +420,7 @@ int amd_set_subcaches(int cpu, unsigned long mask) pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000); } - cuid = cpu_data(cpu).cpu_core_id; + cuid = cpu_data(cpu).topo.core_id; mask <<= 4 * cuid; mask |= (0xf ^ (1 << cuid)) << 26; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 760adac3d1a8..41093cf20acd 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -36,6 +36,8 @@ #include <linux/smp.h> #include <linux/mm.h> +#include <xen/xen.h> + #include <asm/trace/irq_vectors.h> #include <asm/irq_remapping.h> #include <asm/pc-conf-reg.h> @@ -70,7 +72,7 @@ unsigned int num_processors; unsigned disabled_cpus; /* Processor that is doing the boot up */ -unsigned int boot_cpu_physical_apicid __ro_after_init = -1U; +u32 boot_cpu_physical_apicid __ro_after_init = BAD_APICID; EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid); u8 boot_cpu_apic_version __ro_after_init; @@ -85,7 +87,7 @@ physid_mask_t phys_cpu_present_map; * disable_cpu_apicid=<int>, mostly used for the kdump 2nd kernel to * avoid undefined behaviour caused by sending INIT from AP to BSP. */ -static unsigned int disabled_cpu_apicid __ro_after_init = BAD_APICID; +static u32 disabled_cpu_apicid __ro_after_init = BAD_APICID; /* * This variable controls which CPUs receive external NMIs. 
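The apic.c hunks in this region widen every APIC ID holder (boot_cpu_physical_apicid, disabled_cpu_apicid, x86_cpu_to_apicid, cpuid_to_apicid) to u32 and replace the mixed -1 / -1U initializers with BAD_APICID. A minimal standalone illustration of why the old narrow and signed variants were fragile once x2APIC IDs, which can use the full 32-bit space, are involved; the sentinel value shown is an assumption matching the u32 case:

#include <stdint.h>
#include <stdio.h>

#define BAD_APICID 0xFFFFFFFFu  /* u32 sentinel; the old u16 code used 0xFFFF */

int main(void)
{
    uint32_t x2apic_id = 0x00010002;            /* x2APIC IDs need more than 8 or 16 bits */
    uint16_t narrow    = (uint16_t)x2apic_id;   /* the old u16 storage silently truncates */

    printf("id=%#x narrow=%#x sentinel=%#x\n",
           x2apic_id, (unsigned)narrow, BAD_APICID);
    return 0;
}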
By default, @@ -109,7 +111,7 @@ static inline bool apic_accessible(void) /* * Map cpu index to physical APIC ID */ -DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID); +DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_apicid, BAD_APICID); DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, U32_MAX); EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid); @@ -1763,7 +1765,7 @@ static void __x2apic_enable(void) static int __init setup_nox2apic(char *str) { if (x2apic_enabled()) { - int apicid = native_apic_msr_read(APIC_ID); + u32 apicid = native_apic_msr_read(APIC_ID); if (apicid >= 255) { pr_warn("Apicid: %08x, cannot enforce nox2apic\n", @@ -2316,13 +2318,11 @@ static int nr_logical_cpuids = 1; /* * Used to store mapping between logical CPU IDs and APIC IDs. */ -int cpuid_to_apicid[] = { - [0 ... NR_CPUS - 1] = -1, -}; +u32 cpuid_to_apicid[] = { [0 ... NR_CPUS - 1] = BAD_APICID, }; bool arch_match_cpu_phys_id(int cpu, u64 phys_id) { - return phys_id == cpuid_to_apicid[cpu]; + return phys_id == (u64)cpuid_to_apicid[cpu]; } #ifdef CONFIG_SMP @@ -2344,6 +2344,15 @@ static int __init smp_init_primary_thread_mask(void) { unsigned int cpu; + /* + * XEN/PV provides either none or useless topology information. + * Pretend that all vCPUs are primary threads. + */ + if (xen_pv_domain()) { + cpumask_copy(&__cpu_primary_thread_mask, cpu_possible_mask); + return 0; + } + for (cpu = 0; cpu < nr_logical_cpuids; cpu++) cpu_mark_primary_thread(cpu, cpuid_to_apicid[cpu]); return 0; @@ -2382,7 +2391,7 @@ static int allocate_logical_cpuid(int apicid) return nr_logical_cpuids++; } -static void cpu_update_apic(int cpu, int apicid) +static void cpu_update_apic(int cpu, u32 apicid) { #if defined(CONFIG_SMP) || defined(CONFIG_X86_64) early_per_cpu(x86_cpu_to_apicid, cpu) = apicid; @@ -2535,7 +2544,7 @@ static struct { */ int active; /* r/w apic fields */ - unsigned int apic_id; + u32 apic_id; unsigned int apic_taskpri; unsigned int apic_ldr; unsigned int apic_dfr; diff --git a/arch/x86/kernel/apic/apic_common.c b/arch/x86/kernel/apic/apic_common.c index 7bc5d9bf59cd..8a00141073ea 100644 --- a/arch/x86/kernel/apic/apic_common.c +++ b/arch/x86/kernel/apic/apic_common.c @@ -18,7 +18,7 @@ u32 apic_flat_calc_apicid(unsigned int cpu) return 1U << cpu; } -bool default_check_apicid_used(physid_mask_t *map, int apicid) +bool default_check_apicid_used(physid_mask_t *map, u32 apicid) { return physid_isset(apicid, *map); } @@ -28,7 +28,7 @@ void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) *retmap = *phys_map; } -int default_cpu_present_to_apicid(int mps_cpu) +u32 default_cpu_present_to_apicid(int mps_cpu) { if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu)) return (int)per_cpu(x86_cpu_to_apicid, mps_cpu); diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 032a84e2c3cc..37daa3fd6819 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -56,17 +56,17 @@ flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) _flat_send_IPI_mask(mask, vector); } -static unsigned int flat_get_apic_id(unsigned long x) +static u32 flat_get_apic_id(u32 x) { return (x >> 24) & 0xFF; } -static u32 set_apic_id(unsigned int id) +static u32 set_apic_id(u32 id) { return (id & 0xFF) << 24; } -static int flat_phys_pkg_id(int initial_apic_id, int index_msb) +static u32 flat_phys_pkg_id(u32 initial_apic_id, int index_msb) { return initial_apic_id >> index_msb; } @@ 
-158,8 +158,6 @@ static struct apic apic_physflat __ro_after_init = { .disable_esr = 0, - .check_apicid_used = NULL, - .ioapic_phys_id_map = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .phys_pkg_id = flat_phys_pkg_id, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 966d7cf10b95..b00d52ae84fa 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -18,6 +18,8 @@ #include <asm/apic.h> +#include "local.h" + static void noop_send_IPI(int cpu, int vector) { } static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { } static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { } @@ -25,10 +27,10 @@ static void noop_send_IPI_allbutself(int vector) { } static void noop_send_IPI_all(int vector) { } static void noop_send_IPI_self(int vector) { } static void noop_apic_icr_write(u32 low, u32 id) { } -static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip) { return -1; } +static int noop_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip) { return -1; } static u64 noop_apic_icr_read(void) { return 0; } -static int noop_phys_pkg_id(int cpuid_apic, int index_msb) { return 0; } -static unsigned int noop_get_apic_id(unsigned long x) { return 0; } +static u32 noop_phys_pkg_id(u32 cpuid_apic, int index_msb) { return 0; } +static u32 noop_get_apic_id(u32 apicid) { return 0; } static void noop_apic_eoi(void) { } static u32 noop_apic_read(u32 reg) diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 63f3d7be9dc7..456a14c44f67 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -25,7 +25,7 @@ static const struct apic apic_numachip1; static const struct apic apic_numachip2; static void (*numachip_apic_icr_write)(int apicid, unsigned int val) __read_mostly; -static unsigned int numachip1_get_apic_id(unsigned long x) +static u32 numachip1_get_apic_id(u32 x) { unsigned long value; unsigned int id = (x >> 24) & 0xff; @@ -38,12 +38,12 @@ static unsigned int numachip1_get_apic_id(unsigned long x) return id; } -static u32 numachip1_set_apic_id(unsigned int id) +static u32 numachip1_set_apic_id(u32 id) { return (id & 0xff) << 24; } -static unsigned int numachip2_get_apic_id(unsigned long x) +static u32 numachip2_get_apic_id(u32 x) { u64 mcfg; @@ -51,12 +51,12 @@ static unsigned int numachip2_get_apic_id(unsigned long x) return ((mcfg >> (28 - 8)) & 0xfff00) | (x >> 24); } -static u32 numachip2_set_apic_id(unsigned int id) +static u32 numachip2_set_apic_id(u32 id) { return id << 24; } -static int numachip_phys_pkg_id(int initial_apic_id, int index_msb) +static u32 numachip_phys_pkg_id(u32 initial_apic_id, int index_msb) { return initial_apic_id >> index_msb; } @@ -71,7 +71,7 @@ static void numachip2_apic_icr_write(int apicid, unsigned int val) numachip2_write32_lcsr(NUMACHIP2_APIC_ICR, (apicid << 12) | val); } -static int numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip) +static int numachip_wakeup_secondary(u32 phys_apicid, unsigned long start_rip) { numachip_apic_icr_write(phys_apicid, APIC_DM_INIT); numachip_apic_icr_write(phys_apicid, APIC_DM_STARTUP | @@ -161,7 +161,7 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) u64 val; u32 nodes = 1; - this_cpu_write(cpu_llc_id, node); + c->topo.llc_id = node; /* Account for nodes per socket in multi-core-module processors */ if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) { @@ -169,7 +169,7 @@ static void 
fixup_cpu_id(struct cpuinfo_x86 *c, int node) nodes = ((val >> 3) & 7) + 1; } - c->phys_proc_id = node / nodes; + c->topo.pkg_id = node / nodes; } static int __init numachip_system_init(void) diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 0e5535add4b5..7ee3c486cb33 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -13,12 +13,12 @@ #include "local.h" -static unsigned bigsmp_get_apic_id(unsigned long x) +static u32 bigsmp_get_apic_id(u32 x) { return (x >> 24) & 0xFF; } -static bool bigsmp_check_apicid_used(physid_mask_t *map, int apicid) +static bool bigsmp_check_apicid_used(physid_mask_t *map, u32 apicid) { return false; } @@ -29,7 +29,7 @@ static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *re physids_promote(0xFFL, retmap); } -static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) +static u32 bigsmp_phys_pkg_id(u32 cpuid_apic, int index_msb) { return cpuid_apic >> index_msb; } diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index a44ba7209ef3..0078730a512e 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -281,7 +281,7 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector) } #ifdef CONFIG_SMP -static int convert_apicid_to_cpu(int apic_id) +static int convert_apicid_to_cpu(u32 apic_id) { int i; @@ -294,7 +294,8 @@ static int convert_apicid_to_cpu(int apic_id) int safe_smp_processor_id(void) { - int apicid, cpuid; + u32 apicid; + int cpuid; if (!boot_cpu_has(X86_FEATURE_APIC)) return 0; diff --git a/arch/x86/kernel/apic/local.h b/arch/x86/kernel/apic/local.h index ec219c659c7d..9ea6186ea88c 100644 --- a/arch/x86/kernel/apic/local.h +++ b/arch/x86/kernel/apic/local.h @@ -15,9 +15,9 @@ /* X2APIC */ void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest); -unsigned int x2apic_get_apic_id(unsigned long id); -u32 x2apic_set_apic_id(unsigned int id); -int x2apic_phys_pkg_id(int initial_apicid, int index_msb); +u32 x2apic_get_apic_id(u32 id); +u32 x2apic_set_apic_id(u32 id); +u32 x2apic_phys_pkg_id(u32 initial_apicid, int index_msb); void x2apic_send_IPI_all(int vector); void x2apic_send_IPI_allbutself(int vector); @@ -64,6 +64,7 @@ void default_send_IPI_all(int vector); void default_send_IPI_self(int vector); bool default_apic_id_registered(void); +bool default_check_apicid_used(physid_mask_t *map, u32 apicid); #ifdef CONFIG_X86_32 void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector); diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 6b6b711678fe..d9651f15ae4f 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -55,14 +55,14 @@ msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force) * caused by the non-atomic update of the address/data pair. * * Direct update is possible when: - * - The MSI is maskable (remapped MSI does not use this code path)). - * The quirk bit is not set in this case. + * - The MSI is maskable (remapped MSI does not use this code path). + * The reservation mode bit is set in this case. 
* - The new vector is the same as the old vector * - The old vector is MANAGED_IRQ_SHUTDOWN_VECTOR (interrupt starts up) * - The interrupt is not yet started up * - The new destination CPU is the same as the old destination CPU */ - if (!irqd_msi_nomask_quirk(irqd) || + if (!irqd_can_reserve(irqd) || cfg->vector == old_cfg.vector || old_cfg.vector == MANAGED_IRQ_SHUTDOWN_VECTOR || !irqd_is_started(irqd) || @@ -215,8 +215,6 @@ static bool x86_init_dev_msi_info(struct device *dev, struct irq_domain *domain, if (WARN_ON_ONCE(domain != real_parent)) return false; info->chip->irq_set_affinity = msi_set_affinity; - /* See msi_set_affinity() for the gory details */ - info->flags |= MSI_FLAG_NOMASK_QUIRK; break; case DOMAIN_BUS_DMAR: case DOMAIN_BUS_AMDVI: diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 9a06df6cdd68..5eb3fbe472da 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -18,11 +18,21 @@ #include "local.h" -static int default_phys_pkg_id(int cpuid_apic, int index_msb) +static u32 default_phys_pkg_id(u32 cpuid_apic, int index_msb) { return cpuid_apic >> index_msb; } +static u32 default_get_apic_id(u32 x) +{ + unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); + + if (APIC_XAPIC(ver) || boot_cpu_has(X86_FEATURE_EXTD_APICID)) + return (x >> 24) & 0xFF; + else + return (x >> 24) & 0x0F; +} + /* should be called last. */ static int probe_default(void) { diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 788cdb4ee394..7c9fe28f742f 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -124,17 +124,17 @@ static int x2apic_phys_probe(void) return apic == &apic_x2apic_phys; } -unsigned int x2apic_get_apic_id(unsigned long id) +u32 x2apic_get_apic_id(u32 id) { return id; } -u32 x2apic_set_apic_id(unsigned int id) +u32 x2apic_set_apic_id(u32 id) { return id; } -int x2apic_phys_pkg_id(int initial_apicid, int index_msb) +u32 x2apic_phys_pkg_id(u32 initial_apicid, int index_msb) { return initial_apicid >> index_msb; } diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 205cee567629..1b0d7336a28f 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -110,7 +110,7 @@ static void __init early_get_pnodeid(void) } else if (UVH_RH_GAM_ADDR_MAP_CONFIG) { union uvh_rh_gam_addr_map_config_u m_n_config; - m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_ADDR_MAP_CONFIG); + m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_ADDR_MAP_CONFIG); uv_cpuid.n_skt = m_n_config.s.n_skt; if (is_uv(UV3)) uv_cpuid.m_skt = m_n_config.s3.m_skt; @@ -701,7 +701,7 @@ static __init void build_uv_gr_table(void) } } -static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) +static int uv_wakeup_secondary(u32 phys_apicid, unsigned long start_rip) { unsigned long val; int pnode; @@ -779,7 +779,7 @@ static void uv_send_IPI_all(int vector) uv_send_IPI_mask(cpu_online_mask, vector); } -static u32 set_apic_id(unsigned int id) +static u32 set_apic_id(u32 id) { return id; } @@ -789,7 +789,7 @@ static unsigned int uv_read_apic_id(void) return x2apic_get_apic_id(apic_read(APIC_ID)); } -static int uv_phys_pkg_id(int initial_apicid, int index_msb) +static u32 uv_phys_pkg_id(u32 initial_apicid, int index_msb) { return uv_read_apic_id() >> index_msb; } diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index faa9f2299848..e9ad518a5003 100644 --- a/arch/x86/kernel/callthunks.c +++ 
b/arch/x86/kernel/callthunks.c @@ -48,11 +48,6 @@ EXPORT_SYMBOL_GPL(__x86_call_count); extern s32 __call_sites[], __call_sites_end[]; -struct thunk_desc { - void *template; - unsigned int template_size; -}; - struct core_text { unsigned long base; unsigned long end; diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 4350f6bfc064..93eabf544031 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -54,6 +54,8 @@ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o obj-$(CONFIG_ACRN_GUEST) += acrn.o +obj-$(CONFIG_DEBUG_FS) += debugfs.o + quiet_cmd_mkcapflags = MKCAP $@ cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $@ $^ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index ece2b5b7b0fe..a7eab05e5f29 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -382,7 +382,7 @@ static int nearby_node(int apicid) #endif /* - * Fix up cpu_core_id for pre-F17h systems to be in the + * Fix up topo::core_id for pre-F17h systems to be in the * [0 .. cores_per_node - 1] range. Not really needed but * kept so as not to break existing setups. */ @@ -394,7 +394,7 @@ static void legacy_fixup_core_id(struct cpuinfo_x86 *c) return; cus_per_node = c->x86_max_cores / nodes_per_socket; - c->cpu_core_id %= cus_per_node; + c->topo.core_id %= cus_per_node; } /* @@ -405,8 +405,6 @@ static void legacy_fixup_core_id(struct cpuinfo_x86 *c) */ static void amd_get_topology(struct cpuinfo_x86 *c) { - int cpu = smp_processor_id(); - /* get information required for multi-node processors */ if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { int err; @@ -414,13 +412,13 @@ static void amd_get_topology(struct cpuinfo_x86 *c) cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); - c->cpu_die_id = ecx & 0xff; + c->topo.die_id = ecx & 0xff; if (c->x86 == 0x15) - c->cu_id = ebx & 0xff; + c->topo.cu_id = ebx & 0xff; if (c->x86 >= 0x17) { - c->cpu_core_id = ebx & 0xff; + c->topo.core_id = ebx & 0xff; if (smp_num_siblings > 1) c->x86_max_cores /= smp_num_siblings; @@ -434,15 +432,14 @@ static void amd_get_topology(struct cpuinfo_x86 *c) if (!err) c->x86_coreid_bits = get_count_order(c->x86_max_cores); - cacheinfo_amd_init_llc_id(c, cpu); + cacheinfo_amd_init_llc_id(c); } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { u64 value; rdmsrl(MSR_FAM10H_NODE_ID, value); - c->cpu_die_id = value & 7; - - per_cpu(cpu_llc_id, cpu) = c->cpu_die_id; + c->topo.die_id = value & 7; + c->topo.llc_id = c->topo.die_id; } else return; @@ -459,15 +456,14 @@ static void amd_get_topology(struct cpuinfo_x86 *c) static void amd_detect_cmp(struct cpuinfo_x86 *c) { unsigned bits; - int cpu = smp_processor_id(); bits = c->x86_coreid_bits; /* Low order bits define the core id (index of core in socket) */ - c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); + c->topo.core_id = c->topo.initial_apicid & ((1 << bits)-1); /* Convert the initial APIC ID into the socket ID */ - c->phys_proc_id = c->initial_apicid >> bits; + c->topo.pkg_id = c->topo.initial_apicid >> bits; /* use socket ID also for last level cache */ - per_cpu(cpu_llc_id, cpu) = c->cpu_die_id = c->phys_proc_id; + c->topo.llc_id = c->topo.die_id = c->topo.pkg_id; } u32 amd_get_nodes_per_socket(void) @@ -481,11 +477,11 @@ static void srat_detect_node(struct cpuinfo_x86 *c) #ifdef CONFIG_NUMA int cpu = smp_processor_id(); int node; - unsigned apicid = c->apicid; + unsigned apicid = c->topo.apicid; node = numa_cpu_node(cpu); if (node == NUMA_NO_NODE) - node 
= get_llc_id(cpu); + node = per_cpu_llc_id(cpu); /* * On multi-fabric platform (e.g. Numascale NumaChip) a @@ -515,7 +511,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c) * through CPU mapping may alter the outcome, directly * access __apicid_to_node[]. */ - int ht_nodeid = c->initial_apicid; + int ht_nodeid = c->topo.initial_apicid; if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE) node = __apicid_to_node[ht_nodeid]; @@ -1014,7 +1010,6 @@ static bool cpu_has_zenbleed_microcode(void) default: return false; - break; } if (boot_cpu_data.microcode < good_rev) @@ -1044,6 +1039,8 @@ static void zenbleed_check(struct cpuinfo_x86 *c) static void init_amd(struct cpuinfo_x86 *c) { + u64 vm_cr; + early_init_amd(c); /* @@ -1060,7 +1057,7 @@ static void init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_FSRS); /* get apicid instead of initial apic id from cpuid */ - c->apicid = read_apic_id(); + c->topo.apicid = read_apic_id(); /* K6s reports MCEs but don't actually have all the MSRs */ if (c->x86 < 6) @@ -1095,6 +1092,14 @@ static void init_amd(struct cpuinfo_x86 *c) init_amd_cacheinfo(c); + if (cpu_has(c, X86_FEATURE_SVM)) { + rdmsrl(MSR_VM_CR, vm_cr); + if (vm_cr & SVM_VM_CR_SVM_DIS_MASK) { + pr_notice_once("SVM disabled (by BIOS) in MSR_VM_CR\n"); + clear_cpu_cap(c, X86_FEATURE_SVM); + } + } + if (!cpu_has(c, X86_FEATURE_LFENCE_RDTSC) && cpu_has(c, X86_FEATURE_XMM2)) { /* * Use LFENCE for execution serialization. On families which diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 10499bcd4e39..bb0ab8466b91 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -63,7 +63,7 @@ EXPORT_SYMBOL_GPL(x86_pred_cmd); static DEFINE_MUTEX(spec_ctrl_mutex); -void (*x86_return_thunk)(void) __ro_after_init = &__x86_return_thunk; +void (*x86_return_thunk)(void) __ro_after_init = __x86_return_thunk; /* Update SPEC_CTRL MSR and its cached copy unconditionally */ static void update_spec_ctrl(u64 val) @@ -717,7 +717,7 @@ void update_gds_msr(void) case GDS_MITIGATION_UCODE_NEEDED: case GDS_MITIGATION_HYPERVISOR: return; - }; + } wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); @@ -1019,7 +1019,6 @@ static void __init retbleed_select_mitigation(void) do_cmd_auto: case RETBLEED_CMD_AUTO: - default: if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY)) @@ -1042,8 +1041,7 @@ do_cmd_auto: setup_force_cpu_cap(X86_FEATURE_RETHUNK); setup_force_cpu_cap(X86_FEATURE_UNRET); - if (IS_ENABLED(CONFIG_RETHUNK)) - x86_return_thunk = retbleed_return_thunk; + x86_return_thunk = retbleed_return_thunk; if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) @@ -1061,7 +1059,8 @@ do_cmd_auto: case RETBLEED_MITIGATION_STUFF: setup_force_cpu_cap(X86_FEATURE_RETHUNK); setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH); - x86_set_skl_return_thunk(); + + x86_return_thunk = call_depth_return_thunk; break; default: @@ -1290,6 +1289,8 @@ spectre_v2_user_select_mitigation(void) spectre_v2_user_ibpb = mode; switch (cmd) { + case SPECTRE_V2_USER_CMD_NONE: + break; case SPECTRE_V2_USER_CMD_FORCE: case SPECTRE_V2_USER_CMD_PRCTL_IBPB: case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: @@ -1301,8 +1302,6 @@ spectre_v2_user_select_mitigation(void) case SPECTRE_V2_USER_CMD_SECCOMP: static_branch_enable(&switch_mm_cond_ibpb); break; - default: - break; } pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n", @@ -2160,6 +2159,10 @@ static int l1d_flush_prctl_get(struct task_struct 
*task) static int ssb_prctl_get(struct task_struct *task) { switch (ssb_mode) { + case SPEC_STORE_BYPASS_NONE: + if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) + return PR_SPEC_ENABLE; + return PR_SPEC_NOT_AFFECTED; case SPEC_STORE_BYPASS_DISABLE: return PR_SPEC_DISABLE; case SPEC_STORE_BYPASS_SECCOMP: @@ -2171,11 +2174,8 @@ static int ssb_prctl_get(struct task_struct *task) if (task_spec_ssb_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_DISABLE; return PR_SPEC_PRCTL | PR_SPEC_ENABLE; - default: - if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) - return PR_SPEC_ENABLE; - return PR_SPEC_NOT_AFFECTED; } + BUG(); } static int ib_prctl_get(struct task_struct *task) @@ -2353,6 +2353,8 @@ early_param("l1tf", l1tf_cmdline); enum srso_mitigation { SRSO_MITIGATION_NONE, + SRSO_MITIGATION_UCODE_NEEDED, + SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED, SRSO_MITIGATION_MICROCODE, SRSO_MITIGATION_SAFE_RET, SRSO_MITIGATION_IBPB, @@ -2368,11 +2370,13 @@ enum srso_mitigation_cmd { }; static const char * const srso_strings[] = { - [SRSO_MITIGATION_NONE] = "Vulnerable", - [SRSO_MITIGATION_MICROCODE] = "Mitigation: microcode", - [SRSO_MITIGATION_SAFE_RET] = "Mitigation: safe RET", - [SRSO_MITIGATION_IBPB] = "Mitigation: IBPB", - [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only" + [SRSO_MITIGATION_NONE] = "Vulnerable", + [SRSO_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", + [SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED] = "Vulnerable: Safe RET, no microcode", + [SRSO_MITIGATION_MICROCODE] = "Vulnerable: Microcode, no safe RET", + [SRSO_MITIGATION_SAFE_RET] = "Mitigation: Safe RET", + [SRSO_MITIGATION_IBPB] = "Mitigation: IBPB", + [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only" }; static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_NONE; @@ -2406,34 +2410,44 @@ static void __init srso_select_mitigation(void) { bool has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE); - if (!boot_cpu_has_bug(X86_BUG_SRSO) || cpu_mitigations_off()) - goto pred_cmd; + if (cpu_mitigations_off()) + return; - if (!has_microcode) { - pr_warn("IBPB-extending microcode not applied!\n"); - pr_warn(SRSO_NOTICE); - } else { + if (!boot_cpu_has_bug(X86_BUG_SRSO)) { + if (boot_cpu_has(X86_FEATURE_SBPB)) + x86_pred_cmd = PRED_CMD_SBPB; + return; + } + + if (has_microcode) { /* * Zen1/2 with SMT off aren't vulnerable after the right * IBPB microcode has been applied. + * + * Zen1/2 don't have SBPB, no need to try to enable it here. 
*/ if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) { setup_force_cpu_cap(X86_FEATURE_SRSO_NO); return; } - } - if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { - if (has_microcode) { - pr_err("Retbleed IBPB mitigation enabled, using same for SRSO\n"); + if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { srso_mitigation = SRSO_MITIGATION_IBPB; - goto pred_cmd; + goto out; } + } else { + pr_warn("IBPB-extending microcode not applied!\n"); + pr_warn(SRSO_NOTICE); + + /* may be overwritten by SRSO_CMD_SAFE_RET below */ + srso_mitigation = SRSO_MITIGATION_UCODE_NEEDED; } switch (srso_cmd) { case SRSO_CMD_OFF: - goto pred_cmd; + if (boot_cpu_has(X86_FEATURE_SBPB)) + x86_pred_cmd = PRED_CMD_SBPB; + return; case SRSO_CMD_MICROCODE: if (has_microcode) { @@ -2458,10 +2472,12 @@ static void __init srso_select_mitigation(void) setup_force_cpu_cap(X86_FEATURE_SRSO); x86_return_thunk = srso_return_thunk; } - srso_mitigation = SRSO_MITIGATION_SAFE_RET; + if (has_microcode) + srso_mitigation = SRSO_MITIGATION_SAFE_RET; + else + srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED; } else { pr_err("WARNING: kernel not compiled with CPU_SRSO.\n"); - goto pred_cmd; } break; @@ -2473,7 +2489,6 @@ static void __init srso_select_mitigation(void) } } else { pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n"); - goto pred_cmd; } break; @@ -2485,20 +2500,12 @@ static void __init srso_select_mitigation(void) } } else { pr_err("WARNING: kernel not compiled with CPU_SRSO.\n"); - goto pred_cmd; } break; - - default: - break; } - pr_info("%s%s\n", srso_strings[srso_mitigation], (has_microcode ? "" : ", no microcode")); - -pred_cmd: - if ((boot_cpu_has(X86_FEATURE_SRSO_NO) || srso_cmd == SRSO_CMD_OFF) && - boot_cpu_has(X86_FEATURE_SBPB)) - x86_pred_cmd = PRED_CMD_SBPB; +out: + pr_info("%s\n", srso_strings[srso_mitigation]); } #undef pr_fmt @@ -2704,9 +2711,7 @@ static ssize_t srso_show_state(char *buf) if (boot_cpu_has(X86_FEATURE_SRSO_NO)) return sysfs_emit(buf, "Mitigation: SMT disabled\n"); - return sysfs_emit(buf, "%s%s\n", - srso_strings[srso_mitigation], - boot_cpu_has(X86_FEATURE_IBPB_BRTYPE) ? "" : ", no microcode"); + return sysfs_emit(buf, "%s\n", srso_strings[srso_mitigation]); } static ssize_t gds_show_state(char *buf) diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index 8f86eacf69f7..c131c412db89 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -661,7 +661,7 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c) return i; } -void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu) +void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c) { /* * We may have multiple LLCs if L3 caches exist, so check if we @@ -672,13 +672,13 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu) if (c->x86 < 0x17) { /* LLC is at the node level. */ - per_cpu(cpu_llc_id, cpu) = c->cpu_die_id; + c->topo.llc_id = c->topo.die_id; } else if (c->x86 == 0x17 && c->x86_model <= 0x1F) { /* * LLC is at the core complex level. * Core complex ID is ApicId[3] for these processors. 
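In the cacheinfo.c hunks here, cacheinfo_amd_init_llc_id() stores into c->topo.llc_id instead of the removed cpu_llc_id per-CPU variable, which is why its cpu argument disappears. The derivation itself is a shift of the APIC ID by the order of the cache share count; a compilable sketch of the modern branch, where count_order mirrors the kernel's get_count_order() for this use:

#include <stdio.h>

/* Smallest b such that (1 << b) >= n -- mirrors get_count_order() here. */
static int count_order(unsigned int n)
{
    int b = 0;
    while ((1u << b) < n)
        b++;
    return b;
}

int main(void)
{
    unsigned int apicid = 0x1a;
    unsigned int num_sharing_cache = 16;  /* threads per L3, from CPUID 0x8000001d */
    int bits = count_order(num_sharing_cache);

    /* APIC IDs differing only in the low 'bits' share one LLC ID. */
    printf("llc_id = %#x\n", apicid >> bits);
    return 0;
}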
*/ - per_cpu(cpu_llc_id, cpu) = c->apicid >> 3; + c->topo.llc_id = c->topo.apicid >> 3; } else { /* * LLC ID is calculated from the number of threads sharing the @@ -694,12 +694,12 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu) if (num_sharing_cache) { int bits = get_count_order(num_sharing_cache); - per_cpu(cpu_llc_id, cpu) = c->apicid >> bits; + c->topo.llc_id = c->topo.apicid >> bits; } } } -void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu) +void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c) { /* * We may have multiple LLCs if L3 caches exist, so check if we @@ -712,7 +712,7 @@ void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu) * LLC is at the core complex level. * Core complex ID is ApicId[3] for these processors. */ - per_cpu(cpu_llc_id, cpu) = c->apicid >> 3; + c->topo.llc_id = c->topo.apicid >> 3; } void init_amd_cacheinfo(struct cpuinfo_x86 *c) @@ -740,9 +740,6 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; -#ifdef CONFIG_SMP - unsigned int cpu = c->cpu_index; -#endif if (c->cpuid_level > 3) { static int is_initialized; @@ -776,13 +773,13 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) new_l2 = this_leaf.size/1024; num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; index_msb = get_count_order(num_threads_sharing); - l2_id = c->apicid & ~((1 << index_msb) - 1); + l2_id = c->topo.apicid & ~((1 << index_msb) - 1); break; case 3: new_l3 = this_leaf.size/1024; num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; index_msb = get_count_order(num_threads_sharing); - l3_id = c->apicid & ~((1 << index_msb) - 1); + l3_id = c->topo.apicid & ~((1 << index_msb) - 1); break; default: break; @@ -856,30 +853,24 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) if (new_l2) { l2 = new_l2; -#ifdef CONFIG_SMP - per_cpu(cpu_llc_id, cpu) = l2_id; - per_cpu(cpu_l2c_id, cpu) = l2_id; -#endif + c->topo.llc_id = l2_id; + c->topo.l2c_id = l2_id; } if (new_l3) { l3 = new_l3; -#ifdef CONFIG_SMP - per_cpu(cpu_llc_id, cpu) = l3_id; -#endif + c->topo.llc_id = l3_id; } -#ifdef CONFIG_SMP /* - * If cpu_llc_id is not yet set, this means cpuid_level < 4 which in + * If llc_id is not yet set, this means cpuid_level < 4 which in * turns means that the only possibility is SMT (as indicated in * cpuid1). Since cpuid2 doesn't specify shared caches, and we know * that SMT shares all caches, we can unconditionally set cpu_llc_id to - * c->phys_proc_id. + * c->topo.pkg_id. */ - if (per_cpu(cpu_llc_id, cpu) == BAD_APICID) - per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; -#endif + if (c->topo.llc_id == BAD_APICID) + c->topo.llc_id = c->topo.pkg_id; c->x86_cache_size = l3 ? l3 : (l2 ? 
l2 : (l1i+l1d)); @@ -915,7 +906,7 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, unsigned int apicid, nshared, first, last; nshared = base->eax.split.num_threads_sharing + 1; - apicid = cpu_data(cpu).apicid; + apicid = cpu_data(cpu).topo.apicid; first = apicid - (apicid % nshared); last = first + nshared - 1; @@ -924,14 +915,14 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, if (!this_cpu_ci->info_list) continue; - apicid = cpu_data(i).apicid; + apicid = cpu_data(i).topo.apicid; if ((apicid < first) || (apicid > last)) continue; this_leaf = this_cpu_ci->info_list + index; for_each_online_cpu(sibling) { - apicid = cpu_data(sibling).apicid; + apicid = cpu_data(sibling).topo.apicid; if ((apicid < first) || (apicid > last)) continue; cpumask_set_cpu(sibling, @@ -969,7 +960,7 @@ static void __cache_cpumap_setup(unsigned int cpu, int index, index_msb = get_count_order(num_threads_sharing); for_each_online_cpu(i) - if (cpu_data(i).apicid >> index_msb == c->apicid >> index_msb) { + if (cpu_data(i).topo.apicid >> index_msb == c->topo.apicid >> index_msb) { struct cpu_cacheinfo *sib_cpu_ci = get_cpu_cacheinfo(i); if (i == cpu || !sib_cpu_ci->info_list) @@ -1024,7 +1015,7 @@ static void get_cache_id(int cpu, struct _cpuid4_info_regs *id4_regs) num_threads_sharing = 1 + id4_regs->eax.split.num_threads_sharing; index_msb = get_count_order(num_threads_sharing); - id4_regs->id = c->apicid >> index_msb; + id4_regs->id = c->topo.apicid >> index_msb; } int populate_cache_leaves(unsigned int cpu) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4e5ffc8b0e46..5d9591146244 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -62,6 +62,7 @@ #include <asm/intel-family.h> #include <asm/cpu_device_id.h> #include <asm/uv/uv.h> +#include <asm/ia32.h> #include <asm/set_memory.h> #include <asm/traps.h> #include <asm/sev.h> @@ -74,18 +75,6 @@ u32 elf_hwcap2 __read_mostly; int smp_num_siblings = 1; EXPORT_SYMBOL(smp_num_siblings); -/* Last level cache ID of each logical CPU */ -DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID; - -u16 get_llc_id(unsigned int cpu) -{ - return per_cpu(cpu_llc_id, cpu); -} -EXPORT_SYMBOL_GPL(get_llc_id); - -/* L2 cache ID of each logical CPU */ -DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id) = BAD_APICID; - static struct ppin_info { int feature; int msr_ppin_ctl; @@ -914,7 +903,7 @@ void detect_ht(struct cpuinfo_x86 *c) return; index_msb = get_count_order(smp_num_siblings); - c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb); + c->topo.pkg_id = apic->phys_pkg_id(c->topo.initial_apicid, index_msb); smp_num_siblings = smp_num_siblings / c->x86_max_cores; @@ -922,8 +911,8 @@ void detect_ht(struct cpuinfo_x86 *c) core_bits = get_count_order(c->x86_max_cores); - c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) & - ((1 << core_bits) - 1); + c->topo.core_id = apic->phys_pkg_id(c->topo.initial_apicid, index_msb) & + ((1 << core_bits) - 1); #endif } @@ -1114,18 +1103,34 @@ void get_cpu_cap(struct cpuinfo_x86 *c) void get_cpu_address_sizes(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; + bool vp_bits_from_cpuid = true; - if (c->extended_cpuid_level >= 0x80000008) { + if (!cpu_has(c, X86_FEATURE_CPUID) || + (c->extended_cpuid_level < 0x80000008)) + vp_bits_from_cpuid = false; + + if (vp_bits_from_cpuid) { cpuid(0x80000008, &eax, &ebx, &ecx, &edx); c->x86_virt_bits = (eax >> 8) & 0xff; c->x86_phys_bits = eax & 0xff; + } else { + if (IS_ENABLED(CONFIG_X86_64)) { + 
c->x86_clflush_size = 64; + c->x86_phys_bits = 36; + c->x86_virt_bits = 48; + } else { + c->x86_clflush_size = 32; + c->x86_virt_bits = 32; + c->x86_phys_bits = 32; + + if (cpu_has(c, X86_FEATURE_PAE) || + cpu_has(c, X86_FEATURE_PSE36)) + c->x86_phys_bits = 36; + } } -#ifdef CONFIG_X86_32 - else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36)) - c->x86_phys_bits = 36; -#endif c->x86_cache_bits = c->x86_phys_bits; + c->x86_cache_alignment = c->x86_clflush_size; } static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) @@ -1579,17 +1584,6 @@ static void __init cpu_parse_early_param(void) */ static void __init early_identify_cpu(struct cpuinfo_x86 *c) { -#ifdef CONFIG_X86_64 - c->x86_clflush_size = 64; - c->x86_phys_bits = 36; - c->x86_virt_bits = 48; -#else - c->x86_clflush_size = 32; - c->x86_phys_bits = 32; - c->x86_virt_bits = 32; -#endif - c->x86_cache_alignment = c->x86_clflush_size; - memset(&c->x86_capability, 0, sizeof(c->x86_capability)); c->extended_cpuid_level = 0; @@ -1601,7 +1595,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) cpu_detect(c); get_cpu_vendor(c); get_cpu_cap(c); - get_cpu_address_sizes(c); setup_force_cpu_cap(X86_FEATURE_CPUID); cpu_parse_early_param(); @@ -1617,6 +1610,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_clear_cpu_cap(X86_FEATURE_CPUID); } + get_cpu_address_sizes(c); + setup_force_cpu_cap(X86_FEATURE_ALWAYS); cpu_set_bug_bits(c); @@ -1761,15 +1756,15 @@ static void generic_identify(struct cpuinfo_x86 *c) get_cpu_address_sizes(c); if (c->cpuid_level >= 0x00000001) { - c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; + c->topo.initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; #ifdef CONFIG_X86_32 # ifdef CONFIG_SMP - c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); + c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0); # else - c->apicid = c->initial_apicid; + c->topo.apicid = c->topo.initial_apicid; # endif #endif - c->phys_proc_id = c->initial_apicid; + c->topo.pkg_id = c->topo.initial_apicid; } get_model_name(c); /* Default name */ @@ -1799,18 +1794,19 @@ static void generic_identify(struct cpuinfo_x86 *c) static void validate_apic_and_package_id(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP - unsigned int apicid, cpu = smp_processor_id(); + unsigned int cpu = smp_processor_id(); + u32 apicid; apicid = apic->cpu_present_to_apicid(cpu); - if (apicid != c->apicid) { + if (apicid != c->topo.apicid) { pr_err(FW_BUG "CPU%u: APIC id mismatch. 
Firmware: %x APIC: %x\n", - cpu, apicid, c->initial_apicid); + cpu, apicid, c->topo.initial_apicid); } - BUG_ON(topology_update_package_map(c->phys_proc_id, cpu)); - BUG_ON(topology_update_die_map(c->cpu_die_id, cpu)); + BUG_ON(topology_update_package_map(c->topo.pkg_id, cpu)); + BUG_ON(topology_update_die_map(c->topo.die_id, cpu)); #else - c->logical_proc_id = 0; + c->topo.logical_pkg_id = 0; #endif } @@ -1829,7 +1825,9 @@ static void identify_cpu(struct cpuinfo_x86 *c) c->x86_model_id[0] = '\0'; /* Unset */ c->x86_max_cores = 1; c->x86_coreid_bits = 0; - c->cu_id = 0xff; + c->topo.cu_id = 0xff; + c->topo.llc_id = BAD_APICID; + c->topo.l2c_id = BAD_APICID; #ifdef CONFIG_X86_64 c->x86_clflush_size = 64; c->x86_phys_bits = 36; @@ -1855,7 +1853,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) apply_forced_caps(c); #ifdef CONFIG_X86_64 - c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); + c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0); #endif /* @@ -2074,24 +2072,24 @@ void syscall_init(void) wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); -#ifdef CONFIG_IA32_EMULATION - wrmsrl_cstar((unsigned long)entry_SYSCALL_compat); - /* - * This only works on Intel CPUs. - * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. - * This does not cause SYSENTER to jump to the wrong location, because - * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). - */ - wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, - (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1)); - wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); -#else - wrmsrl_cstar((unsigned long)ignore_sysret); - wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); - wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); -#endif + if (ia32_enabled()) { + wrmsrl_cstar((unsigned long)entry_SYSCALL_compat); + /* + * This only works on Intel CPUs. + * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. + * This does not cause SYSENTER to jump to the wrong location, because + * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). 
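The syscall_init() change just above swaps the compile-time CONFIG_IA32_EMULATION #ifdef for a runtime ia32_enabled() test, so a single kernel image can either program the compat entry MSRs or stub them out (CSTAR pointing at entry_SYSCALL32_ignore). A reduced sketch of that conversion pattern; ia32_on is a stand-in for the real predicate, not the kernel API:

#include <stdbool.h>
#include <stdio.h>

static bool ia32_on = true;  /* stand-in for ia32_enabled(); fixed at boot in the kernel */

static void setup_compat_syscalls(void)
{
    if (ia32_on) {
        /* kernel: wrmsrl_cstar(entry_SYSCALL_compat) + SYSENTER MSR setup */
        puts("compat entry points installed");
    } else {
        /* kernel: CSTAR -> entry_SYSCALL32_ignore, SYSENTER CS invalidated */
        puts("compat entry points stubbed out");
    }
}

int main(void) { setup_compat_syscalls(); return 0; }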
+ */ + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, + (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1)); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); + } else { + wrmsrl_cstar((unsigned long)entry_SYSCALL32_ignore); + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); + } /* * Flags to clear on syscall; clear as much as possible diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 1dcd7d4e38ef..885281ae79a5 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -78,6 +78,9 @@ extern int detect_ht_early(struct cpuinfo_x86 *c); extern void detect_ht(struct cpuinfo_x86 *c); extern void check_null_seg_clears_base(struct cpuinfo_x86 *c); +void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c); +void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c); + unsigned int aperfmperf_get_khz(int cpu); void cpu_select_mitigations(void); diff --git a/arch/x86/kernel/cpu/debugfs.c b/arch/x86/kernel/cpu/debugfs.c new file mode 100644 index 000000000000..0c179d684b3b --- /dev/null +++ b/arch/x86/kernel/cpu/debugfs.c @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/debugfs.h> + +#include <asm/apic.h> +#include <asm/processor.h> + +static int cpu_debug_show(struct seq_file *m, void *p) +{ + unsigned long cpu = (unsigned long)m->private; + struct cpuinfo_x86 *c = per_cpu_ptr(&cpu_info, cpu); + + seq_printf(m, "online: %d\n", cpu_online(cpu)); + if (!c->initialized) + return 0; + + seq_printf(m, "initial_apicid: %x\n", c->topo.initial_apicid); + seq_printf(m, "apicid: %x\n", c->topo.apicid); + seq_printf(m, "pkg_id: %u\n", c->topo.pkg_id); + seq_printf(m, "die_id: %u\n", c->topo.die_id); + seq_printf(m, "cu_id: %u\n", c->topo.cu_id); + seq_printf(m, "core_id: %u\n", c->topo.core_id); + seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id); + seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id); + seq_printf(m, "llc_id: %u\n", c->topo.llc_id); + seq_printf(m, "l2c_id: %u\n", c->topo.l2c_id); + seq_printf(m, "max_cores: %u\n", c->x86_max_cores); + seq_printf(m, "max_die_per_pkg: %u\n", __max_die_per_package); + seq_printf(m, "smp_num_siblings: %u\n", smp_num_siblings); + return 0; +} + +static int cpu_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, cpu_debug_show, inode->i_private); +} + +static const struct file_operations dfs_cpu_ops = { + .open = cpu_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static __init int cpu_init_debugfs(void) +{ + struct dentry *dir, *base = debugfs_create_dir("topo", arch_debugfs_dir); + unsigned long id; + char name[24]; + + dir = debugfs_create_dir("cpus", base); + for_each_possible_cpu(id) { + sprintf(name, "%lu", id); + debugfs_create_file(name, 0444, dir, (void *)id, &dfs_cpu_ops); + } + return 0; +} +late_initcall(cpu_init_debugfs); diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index defdc594be14..6f247d66758d 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -63,8 +63,6 @@ static void hygon_get_topology_early(struct cpuinfo_x86 *c) */ static void hygon_get_topology(struct cpuinfo_x86 *c) { - int cpu = smp_processor_id(); - /* get information required for multi-node processors */ if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { int err; @@ -72,9 +70,9 @@ static void hygon_get_topology(struct cpuinfo_x86 *c) 
cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); - c->cpu_die_id = ecx & 0xff; + c->topo.die_id = ecx & 0xff; - c->cpu_core_id = ebx & 0xff; + c->topo.core_id = ebx & 0xff; if (smp_num_siblings > 1) c->x86_max_cores /= smp_num_siblings; @@ -87,17 +85,20 @@ static void hygon_get_topology(struct cpuinfo_x86 *c) if (!err) c->x86_coreid_bits = get_count_order(c->x86_max_cores); - /* Socket ID is ApicId[6] for these processors. */ - c->phys_proc_id = c->apicid >> APICID_SOCKET_ID_BIT; + /* + * Socket ID is ApicId[6] for the processors with model <= 0x3 + * when running on host. + */ + if (!boot_cpu_has(X86_FEATURE_HYPERVISOR) && c->x86_model <= 0x3) + c->topo.pkg_id = c->topo.apicid >> APICID_SOCKET_ID_BIT; - cacheinfo_hygon_init_llc_id(c, cpu); + cacheinfo_hygon_init_llc_id(c); } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { u64 value; rdmsrl(MSR_FAM10H_NODE_ID, value); - c->cpu_die_id = value & 7; - - per_cpu(cpu_llc_id, cpu) = c->cpu_die_id; + c->topo.die_id = value & 7; + c->topo.llc_id = c->topo.die_id; } else return; @@ -112,15 +113,14 @@ static void hygon_get_topology(struct cpuinfo_x86 *c) static void hygon_detect_cmp(struct cpuinfo_x86 *c) { unsigned int bits; - int cpu = smp_processor_id(); bits = c->x86_coreid_bits; /* Low order bits define the core id (index of core in socket) */ - c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); + c->topo.core_id = c->topo.initial_apicid & ((1 << bits)-1); /* Convert the initial APIC ID into the socket ID */ - c->phys_proc_id = c->initial_apicid >> bits; - /* use socket ID also for last level cache */ - per_cpu(cpu_llc_id, cpu) = c->cpu_die_id = c->phys_proc_id; + c->topo.pkg_id = c->topo.initial_apicid >> bits; + /* Use package ID also for last level cache */ + c->topo.llc_id = c->topo.die_id = c->topo.pkg_id; } static void srat_detect_node(struct cpuinfo_x86 *c) @@ -128,11 +128,11 @@ static void srat_detect_node(struct cpuinfo_x86 *c) #ifdef CONFIG_NUMA int cpu = smp_processor_id(); int node; - unsigned int apicid = c->apicid; + unsigned int apicid = c->topo.apicid; node = numa_cpu_node(cpu); if (node == NUMA_NO_NODE) - node = per_cpu(cpu_llc_id, cpu); + node = c->topo.llc_id; /* * On multi-fabric platform (e.g. Numascale NumaChip) a @@ -161,7 +161,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c) * through CPU mapping may alter the outcome, directly * access __apicid_to_node[]. */ - int ht_nodeid = c->initial_apicid; + int ht_nodeid = c->topo.initial_apicid; if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE) node = __apicid_to_node[ht_nodeid]; @@ -290,6 +290,8 @@ static void early_init_hygon(struct cpuinfo_x86 *c) static void init_hygon(struct cpuinfo_x86 *c) { + u64 vm_cr; + early_init_hygon(c); /* @@ -301,7 +303,7 @@ static void init_hygon(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_REP_GOOD); /* get apicid instead of initial apic id from cpuid */ - c->apicid = read_apic_id(); + c->topo.apicid = read_apic_id(); /* * XXX someone from Hygon needs to confirm this DTRT @@ -320,6 +322,14 @@ static void init_hygon(struct cpuinfo_x86 *c) init_hygon_cacheinfo(c); + if (cpu_has(c, X86_FEATURE_SVM)) { + rdmsrl(MSR_VM_CR, vm_cr); + if (vm_cr & SVM_VM_CR_SVM_DIS_MASK) { + pr_notice_once("SVM disabled (by BIOS) in MSR_VM_CR\n"); + clear_cpu_cap(c, X86_FEATURE_SVM); + } + } + if (cpu_has(c, X86_FEATURE_XMM2)) { /* * Use LFENCE for execution serialization. 
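Both init_amd() and init_hygon() gain the identical guard shown above: read MSR_VM_CR and, if the BIOS set the SVM-disable lock bit, clear X86_FEATURE_SVM so later code never attempts to enable virtualization. Reduced to a testable predicate (the real read needs ring 0; the mask matches the kernel's SVM_VM_CR_SVM_DIS_MASK, bit 4):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SVM_VM_CR_SVM_DIS_MASK (1ULL << 4)  /* SVMDIS in MSR_VM_CR (C001_0114) */

static bool svm_disabled_by_bios(uint64_t vm_cr)
{
    return vm_cr & SVM_VM_CR_SVM_DIS_MASK;
}

int main(void)
{
    uint64_t vm_cr = SVM_VM_CR_SVM_DIS_MASK;  /* value rdmsrl() would have returned */

    if (svm_disabled_by_bios(vm_cr))
        puts("SVM disabled (by BIOS) in MSR_VM_CR");
    return 0;
}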
On families which diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index be4045628fd3..55efadb0e998 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -314,19 +314,6 @@ static void early_init_intel(struct cpuinfo_x86 *c) setup_clear_cpu_cap(X86_FEATURE_PGE); } - if (c->cpuid_level >= 0x00000001) { - u32 eax, ebx, ecx, edx; - - cpuid(0x00000001, &eax, &ebx, &ecx, &edx); - /* - * If HTT (EDX[28]) is set EBX[16:23] contain the number of - * apicids which are reserved per package. Store the resulting - * shift value for the package management code. - */ - if (edx & (1U << 28)) - c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff); - } - check_memory_type_self_snoop_errata(c); /* diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index c267f43de39e..f3517b8a8e91 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -713,17 +713,75 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) deferred_error_interrupt_enable(c); } -bool amd_mce_is_memory_error(struct mce *m) +/* + * DRAM ECC errors are reported in the Northbridge (bank 4) with + * Extended Error Code 8. + */ +static bool legacy_mce_is_memory_error(struct mce *m) +{ + return m->bank == 4 && XEC(m->status, 0x1f) == 8; +} + +/* + * DRAM ECC errors are reported in Unified Memory Controllers with + * Extended Error Code 0. + */ +static bool smca_mce_is_memory_error(struct mce *m) { enum smca_bank_types bank_type; - /* ErrCodeExt[20:16] */ - u8 xec = (m->status >> 16) & 0x1f; + + if (XEC(m->status, 0x3f)) + return false; bank_type = smca_get_bank_type(m->extcpu, m->bank); + + return bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2; +} + +bool amd_mce_is_memory_error(struct mce *m) +{ if (mce_flags.smca) - return (bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2) && xec == 0x0; + return smca_mce_is_memory_error(m); + else + return legacy_mce_is_memory_error(m); +} + +/* + * AMD systems do not have an explicit indicator that the value in MCA_ADDR is + * a system physical address. Therefore, individual cases need to be detected. + * Future cases and checks will be added as needed. + * + * 1) General case + * a) Assume address is not usable. + * 2) Poison errors + * a) Indicated by MCA_STATUS[43]: poison. Defined for all banks except legacy + * northbridge (bank 4). + * b) Refers to poison consumption in the core. Does not include "no action", + * "action optional", or "deferred" error severities. + * c) Will include a usable address so that immediate action can be taken. + * 3) Northbridge DRAM ECC errors + * a) Reported in legacy bank 4 with extended error code (XEC) 8. + * b) MCA_STATUS[43] is *not* defined as poison in legacy bank 4. Therefore, + * this bit should not be checked. + * + * NOTE: SMCA UMC memory errors fall into case #1. + */ +bool amd_mce_usable_address(struct mce *m) +{ + /* Check special northbridge case 3) first. */ + if (!mce_flags.smca) { + if (legacy_mce_is_memory_error(m)) + return true; + else if (m->bank == 4) + return false; + } - return m->bank == 4 && xec == 0x8; + /* Check poison bit for all other bank types. */ + if (m->status & MCI_STATUS_POISON) + return true; + + /* Assume address is not usable for all others. 
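The new amd_mce_usable_address() above turns the three commented cases into two early checks plus a poison test. The same decision tree in standalone form, using the bit positions the comment names (POISON is MCA_STATUS[43]; the extended error code starts at bit 16, as in the kernel's XEC() helper):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MCI_STATUS_POISON (1ULL << 43)
#define XEC(status, mask) (((status) >> 16) & (mask))

struct mce_sketch { int bank; uint64_t status; };

/* Legacy northbridge DRAM ECC: bank 4, extended error code 8. */
static bool legacy_mce_is_memory_error(const struct mce_sketch *m)
{
    return m->bank == 4 && XEC(m->status, 0x1f) == 8;
}

static bool usable_address(const struct mce_sketch *m, bool smca)
{
    if (!smca) {
        if (legacy_mce_is_memory_error(m))
            return true;   /* case 3: NB DRAM ECC address is usable */
        if (m->bank == 4)
            return false;  /* bit 43 is not defined as poison in legacy bank 4 */
    }
    return m->status & MCI_STATUS_POISON;  /* case 2; otherwise case 1: not usable */
}

int main(void)
{
    struct mce_sketch m = { .bank = 4, .status = 8ULL << 16 };
    printf("usable: %d\n", usable_address(&m, false));
    return 0;
}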
*/ + return false; } static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c index 8ed341714686..7f7309ff67d0 100644 --- a/arch/x86/kernel/cpu/mce/apei.c +++ b/arch/x86/kernel/cpu/mce/apei.c @@ -103,9 +103,9 @@ int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) m.socketid = -1; for_each_possible_cpu(cpu) { - if (cpu_data(cpu).initial_apicid == lapic_id) { + if (cpu_data(cpu).topo.initial_apicid == lapic_id) { m.extcpu = cpu; - m.socketid = cpu_data(m.extcpu).phys_proc_id; + m.socketid = cpu_data(m.extcpu).topo.pkg_id; break; } } diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 6f35f724cc14..7b397370b4d6 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -123,8 +123,8 @@ void mce_setup(struct mce *m) m->time = __ktime_get_real_seconds(); m->cpuvendor = boot_cpu_data.x86_vendor; m->cpuid = cpuid_eax(1); - m->socketid = cpu_data(m->extcpu).phys_proc_id; - m->apicid = cpu_data(m->extcpu).initial_apicid; + m->socketid = cpu_data(m->extcpu).topo.pkg_id; + m->apicid = cpu_data(m->extcpu).topo.initial_apicid; m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP); m->ppin = cpu_data(m->extcpu).ppin; m->microcode = boot_cpu_data.microcode; @@ -453,32 +453,22 @@ static void mce_irq_work_cb(struct irq_work *entry) mce_schedule_work(); } -/* - * Check if the address reported by the CPU is in a format we can parse. - * It would be possible to add code for most other cases, but all would - * be somewhat complicated (e.g. segment offset would require an instruction - * parser). So only support physical addresses up to page granularity for now. - */ -int mce_usable_address(struct mce *m) +bool mce_usable_address(struct mce *m) { if (!(m->status & MCI_STATUS_ADDRV)) - return 0; - - /* Checks after this one are Intel/Zhaoxin-specific: */ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL && - boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN) - return 1; - - if (!(m->status & MCI_STATUS_MISCV)) - return 0; + return false; - if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT) - return 0; + switch (m->cpuvendor) { + case X86_VENDOR_AMD: + return amd_mce_usable_address(m); - if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS) - return 0; + case X86_VENDOR_INTEL: + case X86_VENDOR_ZHAOXIN: + return intel_mce_usable_address(m); - return 1; + default: + return true; + } } EXPORT_SYMBOL_GPL(mce_usable_address); diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index f5323551c1a9..52bce533ddcc 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -536,3 +536,23 @@ bool intel_filter_mce(struct mce *m) return false; } + +/* + * Check if the address reported by the CPU is in a format we can parse. + * It would be possible to add code for most other cases, but all would + * be somewhat complicated (e.g. segment offset would require an instruction + * parser). So only support physical addresses up to page granularity for now. 
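intel_mce_usable_address() is the Intel/Zhaoxin half of the old mce_usable_address() body, moved so core.c can dispatch per vendor (the ADDRV check stays in the core). What it decodes from MCi_MISC, as a standalone sketch using the kernel's field layout (address LSB in bits 5:0, address mode in bits 8:6, mode 2 = physical):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT            12
#define MCI_MISC_ADDR_LSB(m)  ((m) & 0x3f)      /* recoverable-address LSB */
#define MCI_MISC_ADDR_MODE(m) (((m) >> 6) & 7)  /* address mode field */
#define MCI_MISC_ADDR_PHYS    2                 /* physical-address mode */

static bool misc_addr_usable(uint64_t misc)
{
    if (MCI_MISC_ADDR_LSB(misc) > PAGE_SHIFT)   /* coarser than one page */
        return false;
    return MCI_MISC_ADDR_MODE(misc) == MCI_MISC_ADDR_PHYS;
}

int main(void)
{
    uint64_t misc = (2ULL << 6) | 12;  /* physical mode, page-granular */
    printf("usable: %d\n", misc_addr_usable(misc));
    return 0;
}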
+ */ +bool intel_mce_usable_address(struct mce *m) +{ + if (!(m->status & MCI_STATUS_MISCV)) + return false; + + if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT) + return false; + + if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS) + return false; + + return true; +} diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index bcf1b3c66c9c..e13a26c9c0ac 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -49,6 +49,7 @@ void intel_init_cmci(void); void intel_init_lmce(void); void intel_clear_lmce(void); bool intel_filter_mce(struct mce *m); +bool intel_mce_usable_address(struct mce *m); #else # define cmci_intel_adjust_timer mce_adjust_timer_default static inline bool mce_intel_cmci_poll(void) { return false; } @@ -58,6 +59,7 @@ static inline void intel_init_cmci(void) { } static inline void intel_init_lmce(void) { } static inline void intel_clear_lmce(void) { } static inline bool intel_filter_mce(struct mce *m) { return false; } +static inline bool intel_mce_usable_address(struct mce *m) { return false; } #endif void mce_timer_kick(unsigned long interval); @@ -210,6 +212,7 @@ extern bool filter_mce(struct mce *m); #ifdef CONFIG_X86_MCE_AMD extern bool amd_filter_mce(struct mce *m); +bool amd_mce_usable_address(struct mce *m); /* * If MCA_CONFIG[McaLsbInStatusSupported] is set, extract ErrAddr in bits @@ -237,6 +240,7 @@ static __always_inline void smca_extract_err_addr(struct mce *m) #else static inline bool amd_filter_mce(struct mce *m) { return false; } +static inline bool amd_mce_usable_address(struct mce *m) { return false; } static inline void smca_extract_err_addr(struct mce *m) { } #endif diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 31c0e68f6227..e65fae63660e 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -20,13 +20,13 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c, unsigned int cpu) { #ifdef CONFIG_SMP - seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); + seq_printf(m, "physical id\t: %d\n", c->topo.pkg_id); seq_printf(m, "siblings\t: %d\n", cpumask_weight(topology_core_cpumask(cpu))); - seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); + seq_printf(m, "core id\t\t: %d\n", c->topo.core_id); seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); - seq_printf(m, "apicid\t\t: %d\n", c->apicid); - seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid); + seq_printf(m, "apicid\t\t: %d\n", c->topo.apicid); + seq_printf(m, "initial apicid\t: %d\n", c->topo.initial_apicid); #endif } diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 030d3b409768..19e0681f0435 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -152,6 +152,7 @@ static inline void cache_alloc_hsw_probe(void) r->cache.cbm_len = 20; r->cache.shareable_bits = 0xc0000; r->cache.min_cbm_bits = 2; + r->cache.arch_has_sparse_bitmasks = false; r->alloc_capable = true; rdt_alloc_capable = true; @@ -267,15 +268,18 @@ static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r) { struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); union cpuid_0x10_1_eax eax; + union cpuid_0x10_x_ecx ecx; union cpuid_0x10_x_edx edx; - u32 ebx, ecx; + u32 ebx; - cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx, &edx.full); + cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx.full, &edx.full); hw_res->num_closid = edx.split.cos_max + 1; r->cache.cbm_len = eax.split.cbm_len + 1; r->default_ctrl = 
BIT_MASK(eax.split.cbm_len + 1) - 1; r->cache.shareable_bits = ebx & r->default_ctrl; r->data_width = (r->cache.cbm_len + 3) / 4; + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + r->cache.arch_has_sparse_bitmasks = ecx.split.noncont; r->alloc_capable = true; } @@ -872,7 +876,6 @@ static __init void rdt_init_res_defs_intel(void) if (r->rid == RDT_RESOURCE_L3 || r->rid == RDT_RESOURCE_L2) { - r->cache.arch_has_sparse_bitmaps = false; r->cache.arch_has_per_cpu_cfg = false; r->cache.min_cbm_bits = 1; } else if (r->rid == RDT_RESOURCE_MBA) { @@ -892,7 +895,7 @@ static __init void rdt_init_res_defs_amd(void) if (r->rid == RDT_RESOURCE_L3 || r->rid == RDT_RESOURCE_L2) { - r->cache.arch_has_sparse_bitmaps = true; + r->cache.arch_has_sparse_bitmasks = true; r->cache.arch_has_per_cpu_cfg = true; r->cache.min_cbm_bits = 0; } else if (r->rid == RDT_RESOURCE_MBA) { diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index b44c487727d4..beccb0e87ba7 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -87,10 +87,12 @@ int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, /* * Check whether a cache bit mask is valid. - * For Intel the SDM says: - * Please note that all (and only) contiguous '1' combinations - * are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.). - * Additionally Haswell requires at least two bits set. + * On Intel CPUs, non-contiguous 1s value support is indicated by CPUID: + * - CPUID.0x10.1:ECX[3]: L3 non-contiguous 1s value supported if 1 + * - CPUID.0x10.2:ECX[3]: L2 non-contiguous 1s value supported if 1 + * + * Haswell does not support a non-contiguous 1s value and additionally + * requires at least two bits set. * AMD allows non-contiguous bitmasks. */ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) @@ -113,8 +115,8 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) first_bit = find_first_bit(&val, cbm_len); zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); - /* Are non-contiguous bitmaps allowed? */ - if (!r->cache.arch_has_sparse_bitmaps && + /* Are non-contiguous bitmasks allowed? 
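The contiguity rule that cbm_validate() enforces when arch_has_sparse_bitmasks is clear can be stated in two lines of bit arithmetic. A self-contained sketch, using the GCC/Clang __builtin_ctzll builtin instead of the kernel's find_*_bit helpers:

#include <stdbool.h>
#include <stdint.h>

/* A capacity bitmask has contiguous 1s iff, after stripping its
 * trailing zeros, it has the form 2^n - 1. */
static bool cbm_contiguous(uint64_t val)
{
	if (!val)
		return true;
	val >>= __builtin_ctzll(val);	/* drop trailing zeros */
	return (val & (val + 1)) == 0;	/* 0xff0 passes, 0xb fails */
}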
*/ + if (!r->cache.arch_has_sparse_bitmasks && (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) { rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val); return false; diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 85ceaf9a31ac..a4f1aa15f0a2 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -59,6 +59,7 @@ struct rdt_fs_context { bool enable_cdpl2; bool enable_cdpl3; bool enable_mba_mbps; + bool enable_debug; }; static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) @@ -243,18 +244,17 @@ struct rdtgroup { */ #define RFTYPE_INFO BIT(0) #define RFTYPE_BASE BIT(1) -#define RF_CTRLSHIFT 4 -#define RF_MONSHIFT 5 -#define RF_TOPSHIFT 6 -#define RFTYPE_CTRL BIT(RF_CTRLSHIFT) -#define RFTYPE_MON BIT(RF_MONSHIFT) -#define RFTYPE_TOP BIT(RF_TOPSHIFT) +#define RFTYPE_CTRL BIT(4) +#define RFTYPE_MON BIT(5) +#define RFTYPE_TOP BIT(6) #define RFTYPE_RES_CACHE BIT(8) #define RFTYPE_RES_MB BIT(9) -#define RF_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) -#define RF_MON_INFO (RFTYPE_INFO | RFTYPE_MON) -#define RF_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP) -#define RF_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) +#define RFTYPE_DEBUG BIT(10) +#define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) +#define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON) +#define RFTYPE_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP) +#define RFTYPE_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) +#define RFTYPE_MON_BASE (RFTYPE_BASE | RFTYPE_MON) /* List of all resource groups */ extern struct list_head rdt_all_groups; @@ -270,7 +270,7 @@ void __exit rdtgroup_exit(void); * @mode: Access mode * @kf_ops: File operations * @flags: File specific RFTYPE_FLAGS_* flags - * @fflags: File specific RF_* or RFTYPE_* flags + * @fflags: File specific RFTYPE_* flags * @seq_show: Show content of the file * @write: Write to the file */ @@ -492,6 +492,15 @@ union cpuid_0x10_3_eax { unsigned int full; }; +/* CPUID.(EAX=10H, ECX=ResID).ECX */ +union cpuid_0x10_x_ecx { + struct { + unsigned int reserved:3; + unsigned int noncont:1; + } split; + unsigned int full; +}; + /* CPUID.(EAX=10H, ECX=ResID).EDX */ union cpuid_0x10_x_edx { struct { diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 725344048f85..69a1de92384a 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -54,8 +54,13 @@ static struct kernfs_node *kn_mondata; static struct seq_buf last_cmd_status; static char last_cmd_status_buf[512]; +static int rdtgroup_setup_root(struct rdt_fs_context *ctx); +static void rdtgroup_destroy_root(void); + struct dentry *debugfs_resctrl; +static bool resctrl_debug; + void rdt_last_cmd_clear(void) { lockdep_assert_held(&rdtgroup_mutex); @@ -696,11 +701,10 @@ static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct rdtgroup *rdtgrp; + char *pid_str; int ret = 0; pid_t pid; - if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) - return -EINVAL; rdtgrp = rdtgroup_kn_lock_live(of->kn); if (!rdtgrp) { rdtgroup_kn_unlock(of->kn); @@ -715,7 +719,27 @@ static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, goto unlock; } - ret = rdtgroup_move_task(pid, rdtgrp, of); + while (buf && buf[0] != '\0' && buf[0] != '\n') { + pid_str = strim(strsep(&buf, ",")); + + if (kstrtoint(pid_str, 0, &pid)) { + rdt_last_cmd_printf("Task list parsing error pid %s\n", pid_str); + ret = -EINVAL; + break; + } + + if (pid < 0) { + 
rdt_last_cmd_printf("Invalid pid %d\n", pid); + ret = -EINVAL; + break; + } + + ret = rdtgroup_move_task(pid, rdtgrp, of); + if (ret) { + rdt_last_cmd_printf("Error while processing task %d\n", pid); + break; + } + } unlock: rdtgroup_kn_unlock(of->kn); @@ -755,6 +779,38 @@ static int rdtgroup_tasks_show(struct kernfs_open_file *of, return ret; } +static int rdtgroup_closid_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (rdtgrp) + seq_printf(s, "%u\n", rdtgrp->closid); + else + ret = -ENOENT; + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +static int rdtgroup_rmid_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (rdtgrp) + seq_printf(s, "%u\n", rdtgrp->mon.rmid); + else + ret = -ENOENT; + rdtgroup_kn_unlock(of->kn); + + return ret; +} + #ifdef CONFIG_PROC_CPU_RESCTRL /* @@ -895,7 +951,7 @@ static int rdt_shareable_bits_show(struct kernfs_open_file *of, return 0; } -/** +/* * rdt_bit_usage_show - Display current usage of resources * * A domain is a shared resource that can now be allocated differently. Here @@ -1117,12 +1173,24 @@ static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type) } } +static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = of->kn->parent->priv; + struct rdt_resource *r = s->res; + + seq_printf(seq, "%u\n", r->cache.arch_has_sparse_bitmasks); + + return 0; +} + /** * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other * @r: Resource to which domain instance @d belongs. * @d: The domain instance for which @closid is being tested. * @cbm: Capacity bitmask being tested. * @closid: Intended closid for @cbm. + * @type: CDP type of @r. * @exclusive: Only check if overlaps with exclusive resource groups * * Checks if provided @cbm intended to be used for @closid on domain @@ -1209,6 +1277,7 @@ bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d, /** * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive + * @rdtgrp: Resource group identified through its closid. * * An exclusive resource group implies that there should be no sharing of * its allocated resources. At the time this group is considered to be @@ -1251,9 +1320,8 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) return true; } -/** +/* * rdtgroup_mode_write - Modify the resource group's mode - * */ static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) @@ -1357,12 +1425,11 @@ unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, return size; } -/** +/* * rdtgroup_size_show - Display size in bytes of allocated regions * * The "size" file mirrors the layout of the "schemata" file, printing the * size in bytes of each region instead of the capacity bitmask. 
- * */ static int rdtgroup_size_show(struct kernfs_open_file *of, struct seq_file *s, void *v) @@ -1686,77 +1753,77 @@ static struct rftype res_common_files[] = { .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_last_cmd_status_show, - .fflags = RF_TOP_INFO, + .fflags = RFTYPE_TOP_INFO, }, { .name = "num_closids", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_num_closids_show, - .fflags = RF_CTRL_INFO, + .fflags = RFTYPE_CTRL_INFO, }, { .name = "mon_features", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_mon_features_show, - .fflags = RF_MON_INFO, + .fflags = RFTYPE_MON_INFO, }, { .name = "num_rmids", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_num_rmids_show, - .fflags = RF_MON_INFO, + .fflags = RFTYPE_MON_INFO, }, { .name = "cbm_mask", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_default_ctrl_show, - .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, }, { .name = "min_cbm_bits", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_min_cbm_bits_show, - .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, }, { .name = "shareable_bits", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_shareable_bits_show, - .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, }, { .name = "bit_usage", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_bit_usage_show, - .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, }, { .name = "min_bandwidth", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_min_bw_show, - .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, }, { .name = "bandwidth_gran", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_bw_gran_show, - .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, }, { .name = "delay_linear", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_delay_linear_show, - .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, }, /* * Platform specific which (if any) capabilities are provided by @@ -1775,7 +1842,7 @@ static struct rftype res_common_files[] = { .kf_ops = &rdtgroup_kf_single_ops, .write = max_threshold_occ_write, .seq_show = max_threshold_occ_show, - .fflags = RF_MON_INFO | RFTYPE_RES_CACHE, + .fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE, }, { .name = "mbm_total_bytes_config", @@ -1817,12 +1884,19 @@ static struct rftype res_common_files[] = { .fflags = RFTYPE_BASE, }, { + .name = "mon_hw_id", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdtgroup_rmid_show, + .fflags = RFTYPE_MON_BASE | RFTYPE_DEBUG, + }, + { .name = "schemata", .mode = 0644, .kf_ops = &rdtgroup_kf_single_ops, .write = rdtgroup_schemata_write, .seq_show = rdtgroup_schemata_show, - .fflags = RF_CTRL_BASE, + .fflags = RFTYPE_CTRL_BASE, }, { .name = "mode", @@ -1830,14 +1904,28 @@ static struct rftype res_common_files[] = { .kf_ops = &rdtgroup_kf_single_ops, .write = rdtgroup_mode_write, .seq_show = rdtgroup_mode_show, - .fflags = RF_CTRL_BASE, + .fflags = RFTYPE_CTRL_BASE, }, { .name = "size", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdtgroup_size_show, - .fflags = RF_CTRL_BASE, + .fflags = RFTYPE_CTRL_BASE, + }, + { + .name = "sparse_masks", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = 
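The RF_* to RFTYPE_* rename above replaces the shift-based flag scheme with plain bits; rdtgroup_add_files() instantiates a file in a directory only if the directory's flag set is a superset of the file's. A small sketch with the flag values from internal.h:

#include <stdbool.h>

#define RFTYPE_INFO	(1u << 0)
#define RFTYPE_BASE	(1u << 1)
#define RFTYPE_CTRL	(1u << 4)
#define RFTYPE_MON	(1u << 5)
#define RFTYPE_DEBUG	(1u << 10)

/* A file is instantiated iff every flag it requires is present in the
 * directory's flags, the subset test rdtgroup_add_files() performs. */
static bool file_matches(unsigned long dir_flags, unsigned long file_flags)
{
	return file_flags && (dir_flags & file_flags) == file_flags;
}

Mounting with the new "debug" option ORs RFTYPE_DEBUG into the directory flags, which is why the new hardware-id files stay hidden on a default mount.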
rdt_has_sparse_bitmasks_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "ctrl_hw_id", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdtgroup_closid_show, + .fflags = RFTYPE_CTRL_BASE | RFTYPE_DEBUG, }, }; @@ -1852,6 +1940,9 @@ static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) lockdep_assert_held(&rdtgroup_mutex); + if (resctrl_debug) + fflags |= RFTYPE_DEBUG; + for (rft = rfts; rft < rfts + len; rft++) { if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) { ret = rdtgroup_add_file(kn, rft); @@ -1894,7 +1985,7 @@ void __init thread_throttle_mode_init(void) if (!rft) return; - rft->fflags = RF_CTRL_INFO | RFTYPE_RES_MB; + rft->fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB; } void __init mbm_config_rftype_init(const char *config) @@ -1903,7 +1994,7 @@ void __init mbm_config_rftype_init(const char *config) rft = rdtgroup_get_rftype_by_name(config); if (rft) - rft->fflags = RF_MON_INFO | RFTYPE_RES_CACHE; + rft->fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE; } /** @@ -2038,21 +2129,21 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) if (IS_ERR(kn_info)) return PTR_ERR(kn_info); - ret = rdtgroup_add_files(kn_info, RF_TOP_INFO); + ret = rdtgroup_add_files(kn_info, RFTYPE_TOP_INFO); if (ret) goto out_destroy; /* loop over enabled controls, these are all alloc_capable */ list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; - fflags = r->fflags | RF_CTRL_INFO; + fflags = r->fflags | RFTYPE_CTRL_INFO; ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags); if (ret) goto out_destroy; } for_each_mon_capable_rdt_resource(r) { - fflags = r->fflags | RF_MON_INFO; + fflags = r->fflags | RFTYPE_MON_INFO; sprintf(name, "%s_MON", r->name); ret = rdtgroup_mkdir_info_resdir(r, name, fflags); if (ret) @@ -2271,14 +2362,6 @@ int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable) return 0; } -static void cdp_disable_all(void) -{ - if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3)) - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); - if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); -} - /* * We don't allow rdtgroup directories to be created anywhere * except the root directory. 
Thus when looking for the rdtgroup @@ -2358,19 +2441,47 @@ static int mkdir_mondata_all(struct kernfs_node *parent_kn, struct rdtgroup *prgrp, struct kernfs_node **mon_data_kn); +static void rdt_disable_ctx(void) +{ + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); + set_mba_sc(false); + + resctrl_debug = false; +} + static int rdt_enable_ctx(struct rdt_fs_context *ctx) { int ret = 0; - if (ctx->enable_cdpl2) + if (ctx->enable_cdpl2) { ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true); + if (ret) + goto out_done; + } - if (!ret && ctx->enable_cdpl3) + if (ctx->enable_cdpl3) { ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true); + if (ret) + goto out_cdpl2; + } - if (!ret && ctx->enable_mba_mbps) + if (ctx->enable_mba_mbps) { ret = set_mba_sc(true); + if (ret) + goto out_cdpl3; + } + + if (ctx->enable_debug) + resctrl_debug = true; + return 0; + +out_cdpl3: + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); +out_cdpl2: + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); +out_done: return ret; } @@ -2463,6 +2574,7 @@ static void schemata_list_destroy(void) static int rdt_get_tree(struct fs_context *fc) { struct rdt_fs_context *ctx = rdt_fc2context(fc); + unsigned long flags = RFTYPE_CTRL_BASE; struct rdt_domain *dom; struct rdt_resource *r; int ret; @@ -2477,18 +2589,31 @@ static int rdt_get_tree(struct fs_context *fc) goto out; } + ret = rdtgroup_setup_root(ctx); + if (ret) + goto out; + ret = rdt_enable_ctx(ctx); - if (ret < 0) - goto out_cdp; + if (ret) + goto out_root; ret = schemata_list_create(); if (ret) { schemata_list_destroy(); - goto out_mba; + goto out_ctx; } closid_init(); + if (rdt_mon_capable) + flags |= RFTYPE_MON; + + ret = rdtgroup_add_files(rdtgroup_default.kn, flags); + if (ret) + goto out_schemata_free; + + kernfs_activate(rdtgroup_default.kn); + ret = rdtgroup_create_info_dir(rdtgroup_default.kn); if (ret < 0) goto out_schemata_free; @@ -2543,11 +2668,10 @@ out_info: kernfs_remove(kn_info); out_schemata_free: schemata_list_destroy(); -out_mba: - if (ctx->enable_mba_mbps) - set_mba_sc(false); -out_cdp: - cdp_disable_all(); +out_ctx: + rdt_disable_ctx(); +out_root: + rdtgroup_destroy_root(); out: rdt_last_cmd_clear(); mutex_unlock(&rdtgroup_mutex); @@ -2559,6 +2683,7 @@ enum rdt_param { Opt_cdp, Opt_cdpl2, Opt_mba_mbps, + Opt_debug, nr__rdt_params }; @@ -2566,6 +2691,7 @@ static const struct fs_parameter_spec rdt_fs_parameters[] = { fsparam_flag("cdp", Opt_cdp), fsparam_flag("cdpl2", Opt_cdpl2), fsparam_flag("mba_MBps", Opt_mba_mbps), + fsparam_flag("debug", Opt_debug), {} }; @@ -2591,6 +2717,9 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) return -EINVAL; ctx->enable_mba_mbps = true; return 0; + case Opt_debug: + ctx->enable_debug = true; + return 0; } return -EINVAL; @@ -2618,7 +2747,6 @@ static int rdt_init_fs_context(struct fs_context *fc) if (!ctx) return -ENOMEM; - ctx->kfc.root = rdt_root; ctx->kfc.magic = RDTGROUP_SUPER_MAGIC; fc->fs_private = &ctx->kfc; fc->ops = &rdt_fs_context_ops; @@ -2779,16 +2907,16 @@ static void rdt_kill_sb(struct super_block *sb) cpus_read_lock(); mutex_lock(&rdtgroup_mutex); - set_mba_sc(false); + rdt_disable_ctx(); /*Put everything back to default values. 
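rdt_enable_ctx() above now unwinds on failure with a goto ladder: each feature that was enabled is undone, in reverse order, when a later step fails. The generic shape of the pattern, with stub enable/disable functions standing in for resctrl_arch_set_cdp_enabled() and set_mba_sc():

static int enable_a(void) { return 0; }	/* stubs; return -errno on failure */
static int enable_b(void) { return 0; }
static void disable_a(void) { }

static int enable_ctx(int want_a, int want_b)
{
	int ret = 0;

	if (want_a) {
		ret = enable_a();
		if (ret)
			goto out;
	}
	if (want_b) {
		ret = enable_b();
		if (ret)
			goto out_a;
	}
	return 0;

out_a:
	if (want_a)
		disable_a();
out:
	return ret;
}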
*/ for_each_alloc_capable_rdt_resource(r) reset_all_ctrls(r); - cdp_disable_all(); rmdir_all_sub(); rdt_pseudo_lock_release(); rdtgroup_default.mode = RDT_MODE_SHAREABLE; schemata_list_destroy(); + rdtgroup_destroy_root(); static_branch_disable_cpuslocked(&rdt_alloc_enable_key); static_branch_disable_cpuslocked(&rdt_mon_enable_key); static_branch_disable_cpuslocked(&rdt_enable_key); @@ -3170,8 +3298,8 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, enum rdt_group_type rtype, struct rdtgroup **r) { struct rdtgroup *prdtgrp, *rdtgrp; + unsigned long files = 0; struct kernfs_node *kn; - uint files = 0; int ret; prdtgrp = rdtgroup_kn_lock_live(parent_kn); @@ -3223,7 +3351,14 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, goto out_destroy; } - files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype); + if (rtype == RDTCTRL_GROUP) { + files = RFTYPE_BASE | RFTYPE_CTRL; + if (rdt_mon_capable) + files |= RFTYPE_MON; + } else { + files = RFTYPE_BASE | RFTYPE_MON; + } + ret = rdtgroup_add_files(kn, files); if (ret) { rdt_last_cmd_puts("kernfs fill error\n"); @@ -3656,6 +3791,9 @@ static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) if (is_mba_sc(&rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl)) seq_puts(seq, ",mba_MBps"); + if (resctrl_debug) + seq_puts(seq, ",debug"); + return 0; } @@ -3666,10 +3804,8 @@ static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = { .show_options = rdtgroup_show_options, }; -static int __init rdtgroup_setup_root(void) +static int rdtgroup_setup_root(struct rdt_fs_context *ctx) { - int ret; - rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops, KERNFS_ROOT_CREATE_DEACTIVATED | KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK, @@ -3677,6 +3813,20 @@ static int __init rdtgroup_setup_root(void) if (IS_ERR(rdt_root)) return PTR_ERR(rdt_root); + ctx->kfc.root = rdt_root; + rdtgroup_default.kn = kernfs_root_to_node(rdt_root); + + return 0; +} + +static void rdtgroup_destroy_root(void) +{ + kernfs_destroy_root(rdt_root); + rdtgroup_default.kn = NULL; +} + +static void __init rdtgroup_setup_default(void) +{ mutex_lock(&rdtgroup_mutex); rdtgroup_default.closid = 0; @@ -3686,19 +3836,7 @@ static int __init rdtgroup_setup_root(void) list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); - ret = rdtgroup_add_files(kernfs_root_to_node(rdt_root), RF_CTRL_BASE); - if (ret) { - kernfs_destroy_root(rdt_root); - goto out; - } - - rdtgroup_default.kn = kernfs_root_to_node(rdt_root); - kernfs_activate(rdtgroup_default.kn); - -out: mutex_unlock(&rdtgroup_mutex); - - return ret; } static void domain_destroy_mon_state(struct rdt_domain *d) @@ -3820,13 +3958,11 @@ int __init rdtgroup_init(void) seq_buf_init(&last_cmd_status, last_cmd_status_buf, sizeof(last_cmd_status_buf)); - ret = rdtgroup_setup_root(); - if (ret) - return ret; + rdtgroup_setup_default(); ret = sysfs_create_mount_point(fs_kobj, "resctrl"); if (ret) - goto cleanup_root; + return ret; ret = register_filesystem(&rdt_fs_type); if (ret) @@ -3859,8 +3995,6 @@ int __init rdtgroup_init(void) cleanup_mountpoint: sysfs_remove_mount_point(fs_kobj, "resctrl"); -cleanup_root: - kernfs_destroy_root(rdt_root); return ret; } @@ -3870,5 +4004,4 @@ void __exit rdtgroup_exit(void) debugfs_remove_recursive(debugfs_resctrl); unregister_filesystem(&rdt_fs_type); sysfs_remove_mount_point(fs_kobj, "resctrl"); - kernfs_destroy_root(rdt_root); } diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 0270925fe013..dc136703566f 100644 --- a/arch/x86/kernel/cpu/topology.c +++ 
b/arch/x86/kernel/cpu/topology.c @@ -78,7 +78,7 @@ int detect_extended_topology_early(struct cpuinfo_x86 *c) /* * initial apic id, which also represents 32-bit extended x2apic id. */ - c->initial_apicid = edx; + c->topo.initial_apicid = edx; smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx)); #endif return 0; @@ -108,7 +108,7 @@ int detect_extended_topology(struct cpuinfo_x86 *c) * Populate HT related information from sub-leaf level 0. */ cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx); - c->initial_apicid = edx; + c->topo.initial_apicid = edx; core_level_siblings = LEVEL_MAX_SIBLINGS(ebx); smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx)); core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); @@ -146,20 +146,19 @@ int detect_extended_topology(struct cpuinfo_x86 *c) die_select_mask = (~(-1 << die_plus_mask_width)) >> core_plus_mask_width; - c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, + c->topo.core_id = apic->phys_pkg_id(c->topo.initial_apicid, ht_mask_width) & core_select_mask; if (die_level_present) { - c->cpu_die_id = apic->phys_pkg_id(c->initial_apicid, + c->topo.die_id = apic->phys_pkg_id(c->topo.initial_apicid, core_plus_mask_width) & die_select_mask; } - c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, - pkg_mask_width); + c->topo.pkg_id = apic->phys_pkg_id(c->topo.initial_apicid, pkg_mask_width); /* * Reinit the apicid, now that we have extended initial_apicid. */ - c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); + c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0); c->x86_max_cores = (core_level_siblings / smp_num_siblings); __max_die_per_package = (die_level_siblings / core_level_siblings); diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c index 05fa4ef63490..415564a6523b 100644 --- a/arch/x86/kernel/cpu/zhaoxin.c +++ b/arch/x86/kernel/cpu/zhaoxin.c @@ -65,20 +65,6 @@ static void early_init_zhaoxin(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); } - - if (c->cpuid_level >= 0x00000001) { - u32 eax, ebx, ecx, edx; - - cpuid(0x00000001, &eax, &ebx, &ecx, &edx); - /* - * If HTT (EDX[28]) is set EBX[16:23] contain the number of - * apicids which are reserved per package. Store the resulting - * shift value for the package management code. 
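The detect_extended_topology() changes above keep the same derivation, just stored in the new topo fields: each topology level's ID is the initial APIC ID shifted right by that level's mask width, optionally masked with a select mask. A simplified sketch with illustrative width parameters:

#include <stdint.h>

static uint32_t core_id(uint32_t initial_apicid, int ht_mask_width,
			uint32_t core_select_mask)
{
	return (initial_apicid >> ht_mask_width) & core_select_mask;
}

static uint32_t pkg_id(uint32_t initial_apicid, int pkg_mask_width)
{
	return initial_apicid >> pkg_mask_width;
}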
- */ - if (edx & (1U << 28)) - c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff); - } - } static void init_zhaoxin(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 87d38f17ff5c..afd09924094e 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -278,7 +278,7 @@ static void __init dtb_apic_setup(void) } #ifdef CONFIG_OF_EARLY_FLATTREE -static void __init x86_flattree_get_config(void) +void __init x86_flattree_get_config(void) { u32 size, map_len; void *dt; @@ -300,14 +300,10 @@ static void __init x86_flattree_get_config(void) unflatten_and_copy_device_tree(); early_memunmap(dt, map_len); } -#else -static inline void x86_flattree_get_config(void) { } #endif void __init x86_dtb_init(void) { - x86_flattree_get_config(); - if (!of_have_populated_dt()) return; diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index ef6906107c54..117e74c44e75 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1736,7 +1736,6 @@ EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm); /** * fpu_xstate_prctl - xstate permission operations - * @tsk: Redundant pointer to current * @option: A subfunction of arch_prctl() * @arg2: option argument * Return: 0 if successful; otherwise, an error code diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S index 24c1175a47e2..58d9ed50fe61 100644 --- a/arch/x86/kernel/ftrace_32.S +++ b/arch/x86/kernel/ftrace_32.S @@ -3,10 +3,10 @@ * Copyright (C) 2017 Steven Rostedt, VMware Inc. */ +#include <linux/export.h> #include <linux/linkage.h> #include <asm/page_types.h> #include <asm/segment.h> -#include <asm/export.h> #include <asm/ftrace.h> #include <asm/nospec-branch.h> #include <asm/frame.h> diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 945cfa5f7239..214f30e9f0c0 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -3,12 +3,12 @@ * Copyright (C) 2014 Steven Rostedt, Red Hat Inc */ +#include <linux/export.h> #include <linux/cfi_types.h> #include <linux/linkage.h> #include <asm/asm-offsets.h> #include <asm/ptrace.h> #include <asm/ftrace.h> -#include <asm/export.h> #include <asm/nospec-branch.h> #include <asm/unwind_hints.h> #include <asm/frame.h> diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 49f7629b17f7..05a110c97111 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -41,6 +41,7 @@ #include <asm/trapnr.h> #include <asm/sev.h> #include <asm/tdx.h> +#include <asm/init.h> /* * Manage page tables very early on. @@ -69,7 +70,7 @@ EXPORT_SYMBOL(vmemmap_base); /* * GDT used on the boot CPU before switching to virtual addresses. */ -static struct desc_struct startup_gdt[GDT_ENTRIES] = { +static struct desc_struct startup_gdt[GDT_ENTRIES] __initdata = { [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff), [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff), [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff), @@ -79,13 +80,11 @@ static struct desc_struct startup_gdt[GDT_ENTRIES] = { * Address needs to be set at runtime because it references the startup_gdt * while the kernel still uses a direct mapping. 
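The startup_gdt_descr change in this hunk fixes an off-by-one: the size field of a descriptor-table pointer is a limit, i.e. the offset of the last valid byte, so it must be sizeof(table) - 1 rather than sizeof(table). A minimal sketch of the structure involved:

#include <stdint.h>

struct desc_ptr {
	uint16_t size;		/* limit: offset of the last valid byte */
	uint64_t address;
} __attribute__((packed));

static struct desc_ptr gdt_descr(void *gdt, unsigned long bytes)
{
	return (struct desc_ptr){
		.size    = (uint16_t)(bytes - 1),	/* not the full size */
		.address = (uint64_t)(uintptr_t)gdt,
	};
}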
*/ -static struct desc_ptr startup_gdt_descr = { - .size = sizeof(startup_gdt), +static struct desc_ptr startup_gdt_descr __initdata = { + .size = sizeof(startup_gdt)-1, .address = 0, }; -#define __head __section(".head.text") - static void __head *fixup_pointer(void *ptr, unsigned long physaddr) { return ptr - (void *)_text + (void *)physaddr; @@ -211,7 +210,7 @@ unsigned long __head __startup_64(unsigned long physaddr, /* Fixup the physical addresses in the page table */ - pgd = fixup_pointer(&early_top_pgt, physaddr); + pgd = fixup_pointer(early_top_pgt, physaddr); p = pgd + pgd_index(__START_KERNEL_map); if (la57) *p = (unsigned long)level4_kernel_pgt; @@ -220,11 +219,11 @@ unsigned long __head __startup_64(unsigned long physaddr, *p += _PAGE_TABLE_NOENC - __START_KERNEL_map + load_delta; if (la57) { - p4d = fixup_pointer(&level4_kernel_pgt, physaddr); + p4d = fixup_pointer(level4_kernel_pgt, physaddr); p4d[511] += load_delta; } - pud = fixup_pointer(&level3_kernel_pgt, physaddr); + pud = fixup_pointer(level3_kernel_pgt, physaddr); pud[510] += load_delta; pud[511] += load_delta; @@ -588,7 +587,7 @@ static void set_bringup_idt_handler(gate_desc *idt, int n, void *handler) } /* This runs while still in the direct mapping */ -static void startup_64_load_idt(unsigned long physbase) +static void __head startup_64_load_idt(unsigned long physbase) { struct desc_ptr *desc = fixup_pointer(&bringup_idt_descr, physbase); gate_desc *idt = fixup_pointer(bringup_idt_table, physbase); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index c9318993f959..b6554212b7c7 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -8,6 +8,7 @@ */ .text +#include <linux/export.h> #include <linux/threads.h> #include <linux/init.h> #include <linux/linkage.h> @@ -25,7 +26,6 @@ #include <asm/nops.h> #include <asm/nospec-branch.h> #include <asm/bootparam.h> -#include <asm/export.h> #include <asm/pgtable_32.h> /* Physical address */ diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index ea6995920b7a..086a2c3aaaa0 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -9,7 +9,7 @@ * Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com> */ - +#include <linux/export.h> #include <linux/linkage.h> #include <linux/threads.h> #include <linux/init.h> @@ -22,7 +22,6 @@ #include <asm/percpu.h> #include <asm/nops.h> #include "../entry/calling.h" -#include <asm/export.h> #include <asm/nospec-branch.h> #include <asm/apicdef.h> #include <asm/fixmap.h> @@ -180,8 +179,8 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) movl $0, %ecx #endif - /* Enable PAE mode, PGE and LA57 */ - orl $(X86_CR4_PAE | X86_CR4_PGE), %ecx + /* Enable PAE mode, PSE, PGE and LA57 */ + orl $(X86_CR4_PAE | X86_CR4_PSE | X86_CR4_PGE), %ecx #ifdef CONFIG_X86_5LEVEL testl $1, __pgtable_l5_enabled(%rip) jz 1f diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 1648aa0204d9..41eecf180b7f 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -52,7 +52,7 @@ unsigned long hpet_address; u8 hpet_blockid; /* OS timer block num */ bool hpet_msi_disable; -#ifdef CONFIG_GENERIC_MSI_IRQ +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_GENERIC_MSI_IRQ) static DEFINE_PER_CPU(struct hpet_channel *, cpu_hpet_channel); static struct irq_domain *hpet_domain; #endif @@ -469,7 +469,7 @@ static void __init hpet_legacy_clockevent_register(struct hpet_channel *hc) /* * HPET MSI Support */ -#ifdef CONFIG_GENERIC_MSI_IRQ +#if defined(CONFIG_X86_LOCAL_APIC) && 
defined(CONFIG_GENERIC_MSI_IRQ) static void hpet_msi_unmask(struct irq_data *data) { struct hpet_channel *hc = irq_data_get_irq_handler_data(data); diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 30a55207c000..c20d1832c481 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -32,6 +32,7 @@ */ static void init_8259A(int auto_eoi); +static bool pcat_compat __ro_after_init; static int i8259A_auto_eoi; DEFINE_RAW_SPINLOCK(i8259A_lock); @@ -299,15 +300,32 @@ static void unmask_8259A(void) static int probe_8259A(void) { + unsigned char new_val, probe_val = ~(1 << PIC_CASCADE_IR); unsigned long flags; - unsigned char probe_val = ~(1 << PIC_CASCADE_IR); - unsigned char new_val; + + /* + * If MADT has the PCAT_COMPAT flag set, then do not bother probing + * for the PIC. Some BIOSes leave the PIC uninitialized and probing + * fails. + * + * Right now this causes problems as quite some code depends on + * nr_legacy_irqs() > 0 or has_legacy_pic() == true. This is silly + * when the system has an IO/APIC because then PIC is not required + * at all, except for really old machines where the timer interrupt + * must be routed through the PIC. So just pretend that the PIC is + * there and let legacy_pic->init() initialize it for nothing. + * + * Alternatively this could just try to initialize the PIC and + * repeat the probe, but for cases where there is no PIC that's + * just pointless. + */ + if (pcat_compat) + return nr_legacy_irqs(); + /* - * Check to see if we have a PIC. - * Mask all except the cascade and read - * back the value we just wrote. If we don't - * have a PIC, we will read 0xff as opposed to the - * value we wrote. + * Check to see if we have a PIC. Mask all except the cascade and + * read back the value we just wrote. If we don't have a PIC, we + * will read 0xff as opposed to the value we wrote. 
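The probe that the PCAT_COMPAT path now skips is a simple write/read-back test on the master PIC's mask register: on a machine without a PIC the bus floats and reads back 0xff. A userspace approximation, assuming x86 Linux with ioperm() access to port 0x21 (root required; illustrative only):

#include <stdbool.h>
#include <sys/io.h>	/* glibc inb()/outb(); needs ioperm(0x21, 1, 1) */

#define PIC_MASTER_IMR	0x21	/* master PIC interrupt mask register */
#define PIC_CASCADE_IR	2	/* IRQ2 cascades to the slave PIC */

static bool pic_present(void)
{
	unsigned char probe = (unsigned char)~(1 << PIC_CASCADE_IR);

	outb(probe, PIC_MASTER_IMR);		/* mask all but the cascade */
	return inb(PIC_MASTER_IMR) == probe;	/* floating bus reads 0xff */
}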
*/ raw_spin_lock_irqsave(&i8259A_lock, flags); @@ -429,5 +447,9 @@ static int __init i8259A_init_ops(void) return 0; } - device_initcall(i8259A_init_ops); + +void __init legacy_pic_pcat_compat(void) +{ + pcat_compat = true; +} diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index b786d48f5a0f..8857abc706e4 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -10,6 +10,7 @@ #include <asm/proto.h> #include <asm/desc.h> #include <asm/hw_irq.h> +#include <asm/ia32.h> #include <asm/idtentry.h> #define DPL0 0x0 @@ -116,6 +117,9 @@ static const __initconst struct idt_data def_idts[] = { #endif SYSG(X86_TRAP_OF, asm_exc_overflow), +}; + +static const struct idt_data ia32_idt[] __initconst = { #if defined(CONFIG_IA32_EMULATION) SYSG(IA32_SYSCALL_VECTOR, entry_INT80_compat), #elif defined(CONFIG_X86_32) @@ -225,6 +229,9 @@ void __init idt_setup_early_traps(void) void __init idt_setup_traps(void) { idt_setup_from_table(idt_table, def_idts, ARRAY_SIZE(def_idts), true); + + if (ia32_enabled()) + idt_setup_from_table(idt_table, ia32_idt, ARRAY_SIZE(ia32_idt), true); } #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S index aaf9e776f323..7f542a7799cb 100644 --- a/arch/x86/kernel/irqflags.S +++ b/arch/x86/kernel/irqflags.S @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include <asm/asm.h> -#include <asm/export.h> +#include <linux/export.h> #include <linux/linkage.h> /* diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index b8ab9ee5896c..0ddb3bd0f1aa 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -500,13 +500,13 @@ static bool pv_sched_yield_supported(void) static void __send_ipi_mask(const struct cpumask *mask, int vector) { unsigned long flags; - int cpu, apic_id, icr; - int min = 0, max = 0; + int cpu, min = 0, max = 0; #ifdef CONFIG_X86_64 __uint128_t ipi_bitmap = 0; #else u64 ipi_bitmap = 0; #endif + u32 apic_id, icr; long ret; if (cpumask_empty(mask)) @@ -1028,8 +1028,8 @@ arch_initcall(activate_jump_labels); /* Kick a cpu by its apicid. 
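__send_ipi_mask() above now packs destination APIC IDs, widened to u32, into a bitmap relative to the smallest ID, so one hypercall can cover up to 128 destinations on 64-bit. A sketch of the packing only, ignoring the overflow handling the real function does when IDs span more than the bitmap width; the hypercall hand-off is merely indicated in a comment:

#include <stdint.h>

static void build_ipi_bitmap(const uint32_t *apic_ids, int n)
{
	unsigned __int128 bitmap = 0;	/* GCC/Clang 128-bit integer */
	uint32_t min = apic_ids[0];	/* assumes n > 0 */
	int i;

	for (i = 1; i < n; i++)
		if (apic_ids[i] < min)
			min = apic_ids[i];

	for (i = 0; i < n; i++)
		bitmap |= (unsigned __int128)1 << (apic_ids[i] - min);

	/* low/high halves of bitmap plus min would be handed to the
	 * KVM send-IPI hypercall here */
	(void)bitmap;
}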
Used to wake up a halted vcpu */ static void kvm_kick_cpu(int cpu) { - int apicid; unsigned long flags = 0; + u32 apicid; apicid = per_cpu(x86_cpu_to_apicid, cpu); kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid); diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index a0c551846b35..4766b6bed443 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -507,12 +507,13 @@ DEFINE_IDTENTRY_RAW(exc_nmi) } this_cpu_write(nmi_state, NMI_EXECUTING); this_cpu_write(nmi_cr2, read_cr2()); + +nmi_restart: if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) { WRITE_ONCE(nsp->idt_seq, nsp->idt_seq + 1); WARN_ON_ONCE(!(nsp->idt_seq & 0x1)); WRITE_ONCE(nsp->recv_jiffies, jiffies); } -nmi_restart: /* * Needs to happen before DR7 is accessed, because the hypervisor can @@ -548,16 +549,16 @@ nmi_restart: if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) write_cr2(this_cpu_read(nmi_cr2)); - if (this_cpu_dec_return(nmi_state)) - goto nmi_restart; - - if (user_mode(regs)) - mds_user_clear_cpu_buffers(); if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) { WRITE_ONCE(nsp->idt_seq, nsp->idt_seq + 1); WARN_ON_ONCE(nsp->idt_seq & 0x1); WRITE_ONCE(nsp->recv_jiffies, jiffies); } + if (this_cpu_dec_return(nmi_state)) + goto nmi_restart; + + if (user_mode(regs)) + mds_user_clear_cpu_buffers(); } #if IS_ENABLED(CONFIG_KVM_INTEL) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b098b1fa2470..ccd3ad29a1dc 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1120,7 +1120,7 @@ void __init setup_arch(char **cmdline_p) * Needs to run after memblock setup because it needs the physical * memory size. */ - sev_setup_arch(); + mem_encrypt_setup_arch(); efi_fake_memmap(); efi_find_mirror(); @@ -1217,6 +1217,8 @@ void __init setup_arch(char **cmdline_p) early_acpi_boot_init(); + x86_flattree_get_config(); + initmem_init(); dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT); diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 6395bfd87b68..70472eebe719 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -966,7 +966,7 @@ static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa) free_page((unsigned long)vmsa); } -static int wakeup_cpu_via_vmgexit(int apic_id, unsigned long start_ip) +static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip) { struct sev_es_save_area *cur_vmsa, *vmsa; struct ghcb_state state; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2a187c0cbd5b..c4aca66f0902 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -87,6 +87,7 @@ #include <asm/hw_irq.h> #include <asm/stackprotector.h> #include <asm/sev.h> +#include <asm/spec-ctrl.h> /* representing HT siblings of each logical CPU */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); @@ -124,7 +125,20 @@ struct mwait_cpu_dead { */ static DEFINE_PER_CPU_ALIGNED(struct mwait_cpu_dead, mwait_cpu_dead); -/* Logical package management. We might want to allocate that dynamically */ +/* Logical package management. 
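The smpboot.c hunks here introduce a per-CPU logical_maps table that decouples the phys-to-logical package/die lookup from cpu_data(); the lookup itself is a linear scan over possible CPUs. A standalone sketch with a plain array standing in for per-CPU storage (the GCC range initializer mirrors the U32_MAX defaults):

#include <stdint.h>

#define NR_CPUS		8
#define ID_INVALID	UINT32_MAX

struct logical_map { uint32_t phys_pkg, logical_pkg; };

static struct logical_map maps[NR_CPUS] = {
	[0 ... NR_CPUS - 1] = { ID_INVALID, ID_INVALID },
};

static int phys_to_logical_pkg(uint32_t phys_pkg)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		if (maps[cpu].phys_pkg == phys_pkg)
			return (int)maps[cpu].logical_pkg;
	return -1;	/* unknown physical package */
}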
*/ +struct logical_maps { + u32 phys_pkg_id; + u32 phys_die_id; + u32 logical_pkg_id; + u32 logical_die_id; +}; + +/* Temporary workaround until the full topology mechanics is in place */ +static DEFINE_PER_CPU_READ_MOSTLY(struct logical_maps, logical_maps) = { + .phys_pkg_id = U32_MAX, + .phys_die_id = U32_MAX, +}; + unsigned int __max_logical_packages __read_mostly; EXPORT_SYMBOL(__max_logical_packages); static unsigned int logical_packages __read_mostly; @@ -288,7 +302,7 @@ static void notrace start_secondary(void *unused) cpu_init(); fpu__init_cpu(); - rcu_cpu_starting(raw_smp_processor_id()); + rcutree_report_cpu_starting(raw_smp_processor_id()); x86_cpuinit.early_percpu_clock_init(); ap_starting(); @@ -337,10 +351,8 @@ int topology_phys_to_logical_pkg(unsigned int phys_pkg) int cpu; for_each_possible_cpu(cpu) { - struct cpuinfo_x86 *c = &cpu_data(cpu); - - if (c->initialized && c->phys_proc_id == phys_pkg) - return c->logical_proc_id; + if (per_cpu(logical_maps.phys_pkg_id, cpu) == phys_pkg) + return per_cpu(logical_maps.logical_pkg_id, cpu); } return -1; } @@ -355,14 +367,12 @@ EXPORT_SYMBOL(topology_phys_to_logical_pkg); */ static int topology_phys_to_logical_die(unsigned int die_id, unsigned int cur_cpu) { - int cpu, proc_id = cpu_data(cur_cpu).phys_proc_id; + int cpu, proc_id = cpu_data(cur_cpu).topo.pkg_id; for_each_possible_cpu(cpu) { - struct cpuinfo_x86 *c = &cpu_data(cpu); - - if (c->initialized && c->cpu_die_id == die_id && - c->phys_proc_id == proc_id) - return c->logical_die_id; + if (per_cpu(logical_maps.phys_pkg_id, cpu) == proc_id && + per_cpu(logical_maps.phys_die_id, cpu) == die_id) + return per_cpu(logical_maps.logical_die_id, cpu); } return -1; } @@ -387,7 +397,9 @@ int topology_update_package_map(unsigned int pkg, unsigned int cpu) cpu, pkg, new); } found: - cpu_data(cpu).logical_proc_id = new; + per_cpu(logical_maps.phys_pkg_id, cpu) = pkg; + per_cpu(logical_maps.logical_pkg_id, cpu) = new; + cpu_data(cpu).topo.logical_pkg_id = new; return 0; } /** @@ -410,7 +422,9 @@ int topology_update_die_map(unsigned int die, unsigned int cpu) cpu, die, new); } found: - cpu_data(cpu).logical_die_id = new; + per_cpu(logical_maps.phys_die_id, cpu) = die; + per_cpu(logical_maps.logical_die_id, cpu) = new; + cpu_data(cpu).topo.logical_die_id = new; return 0; } @@ -421,8 +435,8 @@ static void __init smp_store_boot_cpu_info(void) *c = boot_cpu_data; c->cpu_index = id; - topology_update_package_map(c->phys_proc_id, id); - topology_update_die_map(c->cpu_die_id, id); + topology_update_package_map(c->topo.pkg_id, id); + topology_update_die_map(c->topo.die_id, id); c->initialized = true; } @@ -476,21 +490,21 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { int cpu1 = c->cpu_index, cpu2 = o->cpu_index; - if (c->phys_proc_id == o->phys_proc_id && - c->cpu_die_id == o->cpu_die_id && - per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2)) { - if (c->cpu_core_id == o->cpu_core_id) + if (c->topo.pkg_id == o->topo.pkg_id && + c->topo.die_id == o->topo.die_id && + per_cpu_llc_id(cpu1) == per_cpu_llc_id(cpu2)) { + if (c->topo.core_id == o->topo.core_id) return topology_sane(c, o, "smt"); - if ((c->cu_id != 0xff) && - (o->cu_id != 0xff) && - (c->cu_id == o->cu_id)) + if ((c->topo.cu_id != 0xff) && + (o->topo.cu_id != 0xff) && + (c->topo.cu_id == o->topo.cu_id)) return topology_sane(c, o, "smt"); } - } else if (c->phys_proc_id == o->phys_proc_id && - c->cpu_die_id == o->cpu_die_id && - c->cpu_core_id == o->cpu_core_id) { + } else if 
(c->topo.pkg_id == o->topo.pkg_id && + c->topo.die_id == o->topo.die_id && + c->topo.core_id == o->topo.core_id) { return topology_sane(c, o, "smt"); } @@ -499,8 +513,8 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { - if (c->phys_proc_id == o->phys_proc_id && - c->cpu_die_id == o->cpu_die_id) + if (c->topo.pkg_id == o->topo.pkg_id && + c->topo.die_id == o->topo.die_id) return true; return false; } @@ -510,11 +524,11 @@ static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) int cpu1 = c->cpu_index, cpu2 = o->cpu_index; /* If the arch didn't set up l2c_id, fall back to SMT */ - if (per_cpu(cpu_l2c_id, cpu1) == BAD_APICID) + if (per_cpu_l2c_id(cpu1) == BAD_APICID) return match_smt(c, o); /* Do not match if L2 cache id does not match: */ - if (per_cpu(cpu_l2c_id, cpu1) != per_cpu(cpu_l2c_id, cpu2)) + if (per_cpu_l2c_id(cpu1) != per_cpu_l2c_id(cpu2)) return false; return topology_sane(c, o, "l2c"); @@ -527,7 +541,7 @@ static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) */ static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { - if (c->phys_proc_id == o->phys_proc_id) + if (c->topo.pkg_id == o->topo.pkg_id) return true; return false; } @@ -560,11 +574,11 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) bool intel_snc = id && id->driver_data; /* Do not match if we do not have a valid APICID for cpu: */ - if (per_cpu(cpu_llc_id, cpu1) == BAD_APICID) + if (per_cpu_llc_id(cpu1) == BAD_APICID) return false; /* Do not match if LLC id does not match: */ - if (per_cpu(cpu_llc_id, cpu1) != per_cpu(cpu_llc_id, cpu2)) + if (per_cpu_llc_id(cpu1) != per_cpu_llc_id(cpu2)) return false; /* @@ -640,13 +654,13 @@ static void __init build_sched_topology(void) }; #endif /* - * When there is NUMA topology inside the package skip the DIE domain + * When there is NUMA topology inside the package skip the PKG domain * since the NUMA domains will auto-magically create the right spanning * domains based on the SLIT. */ if (!x86_has_numa_in_package) { x86_topology[i++] = (struct sched_domain_topology_level){ - cpu_cpu_mask, x86_die_flags, SD_INIT_NAME(DIE) + cpu_cpu_mask, x86_die_flags, SD_INIT_NAME(PKG) }; } @@ -809,7 +823,7 @@ static void __init smp_quirk_init_udelay(void) /* * Wake up AP by INIT, INIT, STARTUP sequence. */ -static void send_init_sequence(int phys_apicid) +static void send_init_sequence(u32 phys_apicid) { int maxlvt = lapic_get_maxlvt(); @@ -835,7 +849,7 @@ static void send_init_sequence(int phys_apicid) /* * Wake up AP by INIT, INIT, STARTUP sequence. */ -static int wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) +static int wakeup_secondary_cpu_via_init(u32 phys_apicid, unsigned long start_eip) { unsigned long send_status = 0, accept_status = 0; int num_starts, j, maxlvt; @@ -982,7 +996,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle) * Returns zero if startup was successfully sent, else error code from * ->wakeup_secondary_cpu. 
*/ -static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) +static int do_boot_cpu(u32 apicid, int cpu, struct task_struct *idle) { unsigned long start_ip = real_mode_header->trampoline_start; int ret; @@ -1050,7 +1064,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) int native_kick_ap(unsigned int cpu, struct task_struct *tidle) { - int apicid = apic->cpu_present_to_apicid(cpu); + u32 apicid = apic->cpu_present_to_apicid(cpu); int err; lockdep_assert_irqs_enabled(); @@ -1405,7 +1419,7 @@ static void remove_siblinginfo(int cpu) cpumask_clear(topology_sibling_cpumask(cpu)); cpumask_clear(topology_core_cpumask(cpu)); cpumask_clear(topology_die_cpumask(cpu)); - c->cpu_core_id = 0; + c->topo.core_id = 0; c->booted_cores = 0; cpumask_clear_cpu(cpu, cpu_sibling_setup_mask); recompute_smt_state(); @@ -1596,8 +1610,15 @@ void __noreturn hlt_play_dead(void) native_halt(); } +/* + * native_play_dead() is essentially a __noreturn function, but it can't + * be marked as such as the compiler may complain about it. + */ void native_play_dead(void) { + if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS)) + __update_spec_ctrl(0); + play_dead_common(); tboot_shutdown(TB_SHUTDOWN_WFS); diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index bbc440c93e08..1123ef3ccf90 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -15,6 +15,7 @@ * ( The serial nature of the boot logic and the CPU hotplug lock * protects against more than 2 CPUs entering this code. ) */ +#include <linux/workqueue.h> #include <linux/topology.h> #include <linux/spinlock.h> #include <linux/kernel.h> @@ -342,6 +343,13 @@ static inline unsigned int loop_timeout(int cpu) return (cpumask_weight(topology_core_cpumask(cpu)) > 1) ? 2 : 20; } +static void tsc_sync_mark_tsc_unstable(struct work_struct *work) +{ + mark_tsc_unstable("check_tsc_sync_source failed"); +} + +static DECLARE_WORK(tsc_sync_work, tsc_sync_mark_tsc_unstable); + /* * The freshly booted CPU initiates this via an async SMP function call. */ @@ -395,7 +403,7 @@ retry: "turning off TSC clock.\n", max_warp); if (random_warps) pr_warn("TSC warped randomly between CPUs\n"); - mark_tsc_unstable("check_tsc_sync_source failed"); + schedule_work(&tsc_sync_work); } /* diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 7e574cf3bf8a..d00c28aaa5be 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -85,7 +85,7 @@ static struct orc_entry *__orc_find(int *ip_table, struct orc_entry *u_table, { int *first = ip_table; int *last = ip_table + num_entries - 1; - int *mid = first, *found = first; + int *mid, *found = first; if (!num_entries) return NULL; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index f15fb71f280e..54a5596adaa6 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -139,10 +139,7 @@ SECTIONS STATIC_CALL_TEXT ALIGN_ENTRY_TEXT_BEGIN -#ifdef CONFIG_CPU_SRSO *(.text..__x86.rethunk_untrain) -#endif - ENTRY_TEXT #ifdef CONFIG_CPU_SRSO @@ -520,12 +517,12 @@ INIT_PER_CPU(irq_stack_backing_store); "fixed_percpu_data is not at start of per-cpu area"); #endif -#ifdef CONFIG_RETHUNK +#ifdef CONFIG_CPU_UNRET_ENTRY . = ASSERT((retbleed_return_thunk & 0x3f) == 0, "retbleed_return_thunk not cacheline-aligned"); -. = ASSERT((srso_safe_ret & 0x3f) == 0, "srso_safe_ret not cacheline-aligned"); #endif #ifdef CONFIG_CPU_SRSO +. 
= ASSERT((srso_safe_ret & 0x3f) == 0, "srso_safe_ret not cacheline-aligned"); /* * GNU ld cannot do XOR until 2.41. * https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=f6f78318fca803c4907fb8d7f6ded8295f1947b1 diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index 65e96b76c423..d3fc01770558 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -127,7 +127,7 @@ static void __init vsmp_cap_cpus(void) #endif } -static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) +static u32 apicid_phys_pkg_id(u32 initial_apic_id, int index_msb) { return read_apic_id() >> index_msb; }
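The tsc_sync.c hunk above defers mark_tsc_unstable() to a workqueue: check_tsc_sync_source() runs in a context where calling it directly is problematic, so the failure path only schedules work and the actual marking happens later in process context. The generic shape of that pattern (kernel-side sketch; module boilerplate omitted):

#include <linux/workqueue.h>

static void deferred_fn(struct work_struct *work)
{
	/* runs later in a kworker thread; may sleep and take mutexes */
}

static DECLARE_WORK(deferred_work, deferred_fn);

static void hot_path(void)
{
	/* safe from atomic context; queueing an already-pending work
	 * item is a no-op, so repeated calls coalesce */
	schedule_work(&deferred_work);
}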