Diffstat (limited to 'arch/x86/kernel')
32 files changed, 258 insertions, 198 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 04105574c8e9..82f2912155a5 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -17,19 +17,6 @@ CFLAGS_REMOVE_ftrace.o = -pg
 CFLAGS_REMOVE_early_printk.o = -pg
 endif
 
-#
-# vsyscalls (which work on the user stack) should have
-# no stack-protector checks:
-#
-nostackp := $(call cc-option, -fno-stack-protector)
-CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
-CFLAGS_hpet.o := $(nostackp)
-CFLAGS_paravirt.o := $(nostackp)
-GCOV_PROFILE_vsyscall_64.o := n
-GCOV_PROFILE_hpet.o := n
-GCOV_PROFILE_tsc.o := n
-GCOV_PROFILE_paravirt.o := n
-
 obj-y := process_$(BITS).o signal.o entry_$(BITS).o
 obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
 obj-y += time.o ioport.o ldt.o dumpstack.o
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 5812404a0d4c..f50e7fb2a201 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -149,6 +149,29 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
 }
 EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
 
+/*
+ * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
+ * which can obviate IPI to trigger checking of need_resched.
+ * We execute MONITOR against need_resched and enter optimized wait state
+ * through MWAIT. Whenever someone changes need_resched, we would be woken
+ * up from MWAIT (without an IPI).
+ *
+ * New with Core Duo processors, MWAIT can take some hints based on CPU
+ * capability.
+ */
+void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
+{
+	if (!need_resched()) {
+		if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
+			clflush((void *)&current_thread_info()->flags);
+
+		__monitor((void *)&current_thread_info()->flags, 0, 0);
+		smp_mb();
+		if (!need_resched())
+			__mwait(ax, cx);
+	}
+}
+
 void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
 {
 	unsigned int cpu = smp_processor_id();
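For context: the ax argument of mwait_idle_with_hints() carries the MWAIT C-state hint (bits 7:4 select the target C-state minus one, bits 3:0 a sub-state) and cx carries extension flags. A minimal, hypothetical caller -- not part of this patch -- entering C1 might look like the sketch below; MWAIT_ECX_INTERRUPT_BREAK is assumed to be the usual "wake on interrupt even with interrupts masked" flag.

    /* Illustrative sketch only, not from the patch. */
    #define MWAIT_ECX_INTERRUPT_BREAK 0x1 /* break out on interrupt, even if IF=0 */

    static void enter_c1_with_mwait(void)
    {
            /* eax = 0x00 requests C1, sub-state 0 (see the Intel SDM hint layout) */
            mwait_idle_with_hints(0x00, MWAIT_ECX_INTERRUPT_BREAK);
    }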
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index b117efd24f71..8a439d364b94 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -30,7 +30,7 @@
 #include <linux/syscore_ops.h>
 #include <linux/io.h>
 #include <linux/gfp.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <asm/mtrr.h>
 #include <asm/pgtable.h>
 #include <asm/proto.h>
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index b24be38c8cf8..52fa56399a50 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -38,7 +38,7 @@
 #include <asm/perf_event.h>
 #include <asm/x86_init.h>
 #include <asm/pgalloc.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <asm/mpspec.h>
 #include <asm/i8259.h>
 #include <asm/proto.h>
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 9536b3fe43f8..5d513bc47b6b 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -48,7 +48,7 @@
 #include <linux/io.h>
 
 #include <asm/apicdef.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <asm/fixmap.h>
 #include <asm/mpspec.h>
 #include <asm/setup.h>
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index adc66c3a1fef..34b18594e724 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -207,7 +207,6 @@ static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_ri
 		    ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
 		    APIC_DM_INIT;
 		uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
-		mdelay(10);
 
 		val = (1UL << UVH_IPI_INT_SEND_SHFT) |
 		    (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 22a073d7fbff..62184390a601 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -21,7 +21,7 @@
 #include <linux/topology.h>
 #include <linux/cpumask.h>
 #include <asm/pgtable.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <asm/proto.h>
 #include <asm/setup.h>
 #include <asm/apic.h>
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 08119a37e53c..6b96110bb0c3 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -149,7 +149,6 @@ struct set_mtrr_data {
  */
 static int mtrr_rendezvous_handler(void *info)
 {
-#ifdef CONFIG_SMP
 	struct set_mtrr_data *data = info;
 
 	/*
@@ -171,7 +170,6 @@ static int mtrr_rendezvous_handler(void *info)
 	} else if (mtrr_aps_delayed_init ||
 		   !cpu_online(smp_processor_id())) {
 		mtrr_if->set_all();
 	}
-#endif
 	return 0;
 }
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 4ee3abf20ed6..cfa62ec090ec 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1900,6 +1900,9 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 
 	perf_callchain_store(entry, regs->ip);
 
+	if (!current->mm)
+		return;
+
 	if (perf_callchain_user32(regs, entry))
 		return;
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 45fbb8f7f549..f88af2c2a561 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1590,6 +1590,7 @@ static __init int intel_pmu_init(void)
 		break;
 
 	case 42: /* SandyBridge */
+	case 45: /* SandyBridge, "Romely-EP" */
 		memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 5c1a91974918..f3f6f5344001 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -54,6 +54,7 @@
 #include <asm/ftrace.h>
 #include <asm/irq_vectors.h>
 #include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
 
 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
 #include <linux/elf-em.h>
@@ -873,12 +874,7 @@ ENTRY(simd_coprocessor_error)
 661:	pushl_cfi $do_general_protection
 662:
 .section .altinstructions,"a"
-	.balign 4
-	.long 661b
-	.long 663f
-	.word X86_FEATURE_XMM
-	.byte 662b-661b
-	.byte 664f-663f
+	altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f
 .previous
 .section .altinstr_replacement,"ax"
663:	pushl $do_simd_coprocessor_error
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e13329d800c8..6419bb05ecd5 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1111,7 +1111,6 @@ zeroentry spurious_interrupt_bug do_spurious_interrupt_bug
 zeroentry coprocessor_error do_coprocessor_error
 errorentry alignment_check do_alignment_check
 zeroentry simd_coprocessor_error do_simd_coprocessor_error
-zeroentry emulate_vsyscall do_emulate_vsyscall
 
 /* Reload gs selector with exception handling */
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 12aff2537682..739d8598f789 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -321,7 +321,7 @@ static inline unsigned short twd_i387_to_fxsr(unsigned short twd)
 	return tmp;
 }
 
-#define FPREG_ADDR(f, n)	((void *)&(f)->st_space + (n) * 16);
+#define FPREG_ADDR(f, n)	((void *)&(f)->st_space + (n) * 16)
 #define FP_EXP_TAG_VALID	0
 #define FP_EXP_TAG_ZERO		1
 #define FP_EXP_TAG_SPECIAL	2
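The i387.c change drops a stray trailing semicolon from FPREG_ADDR. A function-like macro that expands to an expression must not end in ';', or it stops being usable inside larger expressions. A small illustration (not from the patch):

    /* Why the trailing ';' is a bug -- illustration only. */
    #define BAD_ADDR(p, n)  ((void *)(p) + (n) * 16);  /* expands with a ';' */
    #define GOOD_ADDR(p, n) ((void *)(p) + (n) * 16)

    void *demo(char *base, int idx)
    {
            /* "return BAD_ADDR(base, idx);" would expand to "return ...;;",
             * and "f(BAD_ADDR(base, idx))" would not compile at all. */
            return GOOD_ADDR(base, idx);
    }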
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 65b8f5c2eebf..610485223bdb 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -14,7 +14,7 @@
 #include <linux/io.h>
 #include <linux/delay.h>
 
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <asm/system.h>
 #include <asm/timer.h>
 #include <asm/hw_irq.h>
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index f09d4bbe2d2d..b3300e6bacef 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -15,7 +15,7 @@
 #include <linux/io.h>
 #include <linux/delay.h>
 
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <asm/system.h>
 #include <asm/timer.h>
 #include <asm/hw_irq.h>
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 33c07b0b122e..a9c2116001d6 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -51,6 +51,15 @@ static int parse_no_kvmapf(char *arg)
 
 early_param("no-kvmapf", parse_no_kvmapf);
 
+static int steal_acc = 1;
+static int parse_no_stealacc(char *arg)
+{
+	steal_acc = 0;
+	return 0;
+}
+
+early_param("no-steal-acc", parse_no_stealacc);
+
 struct kvm_para_state {
 	u8 mmu_queue[MMU_QUEUE_SIZE];
 	int mmu_queue_len;
@@ -58,6 +67,8 @@ struct kvm_para_state {
 
 static DEFINE_PER_CPU(struct kvm_para_state, para_state);
 static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
+static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
+static int has_steal_clock = 0;
 
 static struct kvm_para_state *kvm_para_state(void)
 {
@@ -441,6 +452,21 @@ static void __init paravirt_ops_setup(void)
 #endif
 }
 
+static void kvm_register_steal_time(void)
+{
+	int cpu = smp_processor_id();
+	struct kvm_steal_time *st = &per_cpu(steal_time, cpu);
+
+	if (!has_steal_clock)
+		return;
+
+	memset(st, 0, sizeof(*st));
+
+	wrmsrl(MSR_KVM_STEAL_TIME, (__pa(st) | KVM_MSR_ENABLED));
+	printk(KERN_INFO "kvm-stealtime: cpu %d, msr %lx\n",
+		cpu, __pa(st));
+}
+
 void __cpuinit kvm_guest_cpu_init(void)
 {
 	if (!kvm_para_available())
@@ -457,6 +483,9 @@ void __cpuinit kvm_guest_cpu_init(void)
 		printk(KERN_INFO"KVM setup async PF for cpu %d\n",
 		       smp_processor_id());
 	}
+
+	if (has_steal_clock)
+		kvm_register_steal_time();
 }
 
 static void kvm_pv_disable_apf(void *unused)
@@ -483,6 +512,31 @@ static struct notifier_block kvm_pv_reboot_nb = {
 	.notifier_call = kvm_pv_reboot_notify,
 };
 
+static u64 kvm_steal_clock(int cpu)
+{
+	u64 steal;
+	struct kvm_steal_time *src;
+	int version;
+
+	src = &per_cpu(steal_time, cpu);
+	do {
+		version = src->version;
+		rmb();
+		steal = src->steal;
+		rmb();
+	} while ((version & 1) || (version != src->version));
+
+	return steal;
+}
+
+void kvm_disable_steal_time(void)
+{
+	if (!has_steal_clock)
+		return;
+
+	wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
+}
+
 #ifdef CONFIG_SMP
 static void __init kvm_smp_prepare_boot_cpu(void)
 {
@@ -500,6 +554,7 @@ static void __cpuinit kvm_guest_cpu_online(void *dummy)
 
 static void kvm_guest_cpu_offline(void *dummy)
 {
+	kvm_disable_steal_time();
 	kvm_pv_disable_apf(NULL);
 	apf_task_wake_all();
 }
@@ -548,6 +603,11 @@ void __init kvm_guest_init(void)
 	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
 		x86_init.irqs.trap_init = kvm_apf_trap_init;
 
+	if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
+		has_steal_clock = 1;
+		pv_time_ops.steal_clock = kvm_steal_clock;
+	}
+
 #ifdef CONFIG_SMP
 	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
 	register_cpu_notifier(&kvm_cpu_notifier);
@@ -555,3 +615,15 @@ void __init kvm_guest_init(void)
 	kvm_guest_cpu_init();
 #endif
 }
+
+static __init int activate_jump_labels(void)
+{
+	if (has_steal_clock) {
+		jump_label_inc(&paravirt_steal_enabled);
+		if (steal_acc)
+			jump_label_inc(&paravirt_steal_rq_enabled);
+	}
+
+	return 0;
+}
+arch_initcall(activate_jump_labels);
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 6389a6bca11b..c1a0188e29ae 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -160,6 +160,7 @@ static void __cpuinit kvm_setup_secondary_clock(void)
 static void kvm_crash_shutdown(struct pt_regs *regs)
 {
 	native_write_msr(msr_kvm_system_time, 0, 0);
+	kvm_disable_steal_time();
 	native_machine_crash_shutdown(regs);
 }
 #endif
@@ -167,6 +168,7 @@ static void kvm_crash_shutdown(struct pt_regs *regs)
 static void kvm_shutdown(void)
 {
 	native_write_msr(msr_kvm_system_time, 0, 0);
+	kvm_disable_steal_time();
 	native_machine_shutdown();
 }
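kvm_steal_clock() above is the guest half of a seqcount-like protocol: the host bumps version to an odd value, updates the record, then bumps it even again, and the guest rereads until it observes the same even version before and after. A simplified, hypothetical sketch of the writer side (in reality the host performs this update on the guest's registered page; names here are illustrative):

    /* Hypothetical writer side of the protocol kvm_steal_clock() reads. */
    struct kvm_steal_time_demo {
            u64 steal;   /* accumulated steal time, nanoseconds */
            u32 version; /* odd: update in flight; even: stable */
    };

    static void publish_steal(struct kvm_steal_time_demo *st, u64 steal)
    {
            st->version++;  /* now odd: concurrent readers retry */
            smp_wmb();      /* order version bump before the data write */
            st->steal = steal;
            smp_wmb();      /* order the data write before the final bump */
            st->version++;  /* even again: record is consistent */
    }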
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 52f256f2cc81..925179f871de 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -45,21 +45,6 @@ void *module_alloc(unsigned long size)
 				-1, __builtin_return_address(0));
 }
 
-/* Free memory returned from module_alloc */
-void module_free(struct module *mod, void *module_region)
-{
-	vfree(module_region);
-}
-
-/* We don't need anything special. */
-int module_frob_arch_sections(Elf_Ehdr *hdr,
-			      Elf_Shdr *sechdrs,
-			      char *secstrings,
-			      struct module *mod)
-{
-	return 0;
-}
-
 #ifdef CONFIG_X86_32
 int apply_relocate(Elf32_Shdr *sechdrs,
 		   const char *strtab,
@@ -100,17 +85,6 @@ int apply_relocate(Elf32_Shdr *sechdrs,
 	}
 	return 0;
 }
-
-int apply_relocate_add(Elf32_Shdr *sechdrs,
-		       const char *strtab,
-		       unsigned int symindex,
-		       unsigned int relsec,
-		       struct module *me)
-{
-	printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n",
-	       me->name);
-	return -ENOEXEC;
-}
 #else /*X86_64*/
 int apply_relocate_add(Elf64_Shdr *sechdrs,
 		   const char *strtab,
@@ -181,17 +155,6 @@ overflow:
 	       me->name);
 	return -ENOEXEC;
 }
-
-int apply_relocate(Elf_Shdr *sechdrs,
-		   const char *strtab,
-		   unsigned int symindex,
-		   unsigned int relsec,
-		   struct module *me)
-{
-	printk(KERN_ERR "non add relocation not supported\n");
-	return -ENOSYS;
-}
-
 #endif
 
 int module_finalize(const Elf_Ehdr *hdr,
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 869e1aeeb71b..d90272e6bc40 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -202,6 +202,14 @@ static void native_flush_tlb_single(unsigned long addr)
 	__native_flush_tlb_single(addr);
 }
 
+struct jump_label_key paravirt_steal_enabled;
+struct jump_label_key paravirt_steal_rq_enabled;
+
+static u64 native_steal_clock(int cpu)
+{
+	return 0;
+}
+
 /* These are in entry.S */
 extern void native_iret(void);
 extern void native_irq_enable_sysexit(void);
@@ -299,6 +307,10 @@ struct pv_info pv_info = {
 	.paravirt_enabled = 0,
 	.kernel_rpl = 0,
 	.shared_kernel_pmd = 1,	/* Only used when CONFIG_X86_PAE is set */
+
+#ifdef CONFIG_X86_64
+	.extra_user_64bit_cs = __USER_CS,
+#endif
 };
 
 struct pv_init_ops pv_init_ops = {
@@ -307,6 +319,7 @@ struct pv_init_ops pv_init_ops = {
 
 struct pv_time_ops pv_time_ops = {
 	.sched_clock = native_sched_clock,
+	.steal_clock = native_steal_clock,
 };
 
 struct pv_irq_ops pv_irq_ops = {
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index e8c33a302006..726494b58345 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -1553,7 +1553,7 @@ static void __init calgary_fixup_one_tce_space(struct pci_dev *dev)
 			continue;
 
 		/* cover the whole region */
-		npages = (r->end - r->start) >> PAGE_SHIFT;
+		npages = resource_size(r) >> PAGE_SHIFT;
 		npages++;
 
 		iommu_range_reserve(tbl, r->start, npages);
diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c
index ba0a4cce53be..63228035f9d7 100644
--- a/arch/x86/kernel/probe_roms.c
+++ b/arch/x86/kernel/probe_roms.c
@@ -234,7 +234,7 @@ void __init probe_roms(void)
 	/* check for extension rom (ignore length byte!) */
 	rom = isa_bus_to_virt(extension_rom_resource.start);
 	if (romsignature(rom)) {
-		length = extension_rom_resource.end - extension_rom_resource.start + 1;
+		length = resource_size(&extension_rom_resource);
 		if (romchecksum(rom, length)) {
 			request_resource(&iomem_resource, &extension_rom_resource);
 			upper = extension_rom_resource.start;
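Both conversions above replace open-coded size arithmetic with resource_size() from <linux/ioport.h>. Resource ranges are inclusive, so the size is end - start + 1; the old Calgary code dropped the "+ 1" and under-counted by one byte (masked in practice by the npages++ that follows). The helper itself is a one-liner:

    /* From <linux/ioport.h>: resource ranges are inclusive, so the size
     * is end - start + 1 -- the "+ 1" the old calgary code was missing. */
    static inline resource_size_t resource_size(const struct resource *res)
    {
            return res->end - res->start + 1;
    }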
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index e1ba8cb24e4e..e7e3b019c439 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -438,29 +438,6 @@ void cpu_idle_wait(void)
 }
 EXPORT_SYMBOL_GPL(cpu_idle_wait);
 
-/*
- * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
- * which can obviate IPI to trigger checking of need_resched.
- * We execute MONITOR against need_resched and enter optimized wait state
- * through MWAIT. Whenever someone changes need_resched, we would be woken
- * up from MWAIT (without an IPI).
- *
- * New with Core Duo processors, MWAIT can take some hints based on CPU
- * capability.
- */
-void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
-{
-	if (!need_resched()) {
-		if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
-			clflush((void *)&current_thread_info()->flags);
-
-		__monitor((void *)&current_thread_info()->flags, 0, 0);
-		smp_mb();
-		if (!need_resched())
-			__mwait(ax, cx);
-	}
-}
-
 /* Default MONITOR/MWAIT with no hints, used for default C1 state */
 static void mwait_idle(void)
 {
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index a3d0dc59067b..7a3b65107a27 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -38,6 +38,7 @@
 #include <linux/uaccess.h>
 #include <linux/io.h>
 #include <linux/kdebug.h>
+#include <linux/cpuidle.h>
 
 #include <asm/pgtable.h>
 #include <asm/system.h>
@@ -109,7 +110,8 @@ void cpu_idle(void)
 			local_irq_disable();
 			/* Don't trace irqs off for idle */
 			stop_critical_timings();
-			pm_idle();
+			if (cpuidle_idle_call())
+				pm_idle();
 			start_critical_timings();
 		}
 		tick_nohz_restart_sched_tick();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ca6f7ab8df33..f693e44e1bf6 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -37,6 +37,7 @@
 #include <linux/uaccess.h>
 #include <linux/io.h>
 #include <linux/ftrace.h>
+#include <linux/cpuidle.h>
 
 #include <asm/pgtable.h>
 #include <asm/system.h>
@@ -136,7 +137,8 @@ void cpu_idle(void)
 			enter_idle();
 			/* Don't trace irqs off for idle */
 			stop_critical_timings();
-			pm_idle();
+			if (cpuidle_idle_call())
+				pm_idle();
 			start_critical_timings();
 
 			/* In many cases the interrupt that ended idle
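The idle-loop change gives cpuidle first claim on the CPU: cpuidle_idle_call() returns 0 once it has entered (and returned from) an idle state, and nonzero when no cpuidle driver or governor is available, in which case the legacy pm_idle pointer still runs. As a sketch, the contract both loops now assume (simplified; see drivers/cpuidle/cpuidle.c for the real semantics):

    /* Sketch of the fallback contract assumed by the new idle loops. */
    static void idle_step(void)
    {
            if (cpuidle_idle_call())  /* nonzero: cpuidle could not idle */
                    pm_idle();        /* fall back to the legacy x86 routine */
    }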
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 3f2ad2640d85..ccdbc16b8941 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -42,8 +42,11 @@ int mach_set_rtc_mmss(unsigned long nowtime)
 {
 	int real_seconds, real_minutes, cmos_minutes;
 	unsigned char save_control, save_freq_select;
+	unsigned long flags;
 	int retval = 0;
 
+	spin_lock_irqsave(&rtc_lock, flags);
+
 	/* tell the clock it's being set */
 	save_control = CMOS_READ(RTC_CONTROL);
 	CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL);
@@ -93,12 +96,17 @@ int mach_set_rtc_mmss(unsigned long nowtime)
 	CMOS_WRITE(save_control, RTC_CONTROL);
 	CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
 
+	spin_unlock_irqrestore(&rtc_lock, flags);
+
 	return retval;
 }
 
 unsigned long mach_get_cmos_time(void)
 {
 	unsigned int status, year, mon, day, hour, min, sec, century = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rtc_lock, flags);
 
 	/*
 	 * If UIP is clear, then we have >= 244 microseconds before
@@ -125,6 +133,8 @@ unsigned long mach_get_cmos_time(void)
 	status = CMOS_READ(RTC_CONTROL);
 	WARN_ON_ONCE(RTC_ALWAYS_BCD && (status & RTC_DM_BINARY));
 
+	spin_unlock_irqrestore(&rtc_lock, flags);
+
 	if (RTC_ALWAYS_BCD || !(status & RTC_DM_BINARY)) {
 		sec = bcd2bin(sec);
 		min = bcd2bin(min);
@@ -169,24 +179,15 @@ EXPORT_SYMBOL(rtc_cmos_write);
 
 int update_persistent_clock(struct timespec now)
 {
-	unsigned long flags;
-	int retval;
-
-	spin_lock_irqsave(&rtc_lock, flags);
-	retval = x86_platform.set_wallclock(now.tv_sec);
-	spin_unlock_irqrestore(&rtc_lock, flags);
-
-	return retval;
+	return x86_platform.set_wallclock(now.tv_sec);
 }
 
 /* not static: needed by APM */
 void read_persistent_clock(struct timespec *ts)
 {
-	unsigned long retval, flags;
+	unsigned long retval;
 
-	spin_lock_irqsave(&rtc_lock, flags);
 	retval = x86_platform.get_wallclock();
-	spin_unlock_irqrestore(&rtc_lock, flags);
 
 	ts->tv_sec = retval;
 	ts->tv_nsec = 0;
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 7977f0cfe339..c346d1161488 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -74,7 +74,7 @@ static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
 
 #ifdef CONFIG_X86_64
 	case 0x40 ... 0x4f:
-		if (regs->cs != __USER_CS)
+		if (!user_64bit_mode(regs))
 			/* 32-bit mode: register increment */
 			return 0;
 		/* 64-bit mode: REX prefix */
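step.c can no longer assume that 64-bit user code runs with cs == __USER_CS, because a paravirt hypervisor may run 64-bit userspace on a selector of its own; that is what the new pv_info.extra_user_64bit_cs field (set to __USER_CS natively, in the paravirt.c hunk above) exists for. A rough sketch of the check a user_64bit_mode() helper plausibly performs -- the real definition lives in <asm/ptrace.h>, this is an assumption-laden reconstruction:

    /* Sketch only; see <asm/ptrace.h> for the authoritative helper. */
    static inline bool user_64bit_mode_sketch(struct pt_regs *regs)
    {
    #ifdef CONFIG_PARAVIRT
            /* native 64-bit CS, or the hypervisor's alternate 64-bit CS */
            return regs->cs == __USER_CS ||
                   regs->cs == pv_info.extra_user_64bit_cs;
    #else
            return regs->cs == __USER_CS;
    #endif
    }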
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index fbb0a045a1a2..bc19be332bc9 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -168,7 +168,7 @@ ENTRY(sys_call_table)
 	.long ptregs_vm86
 	.long sys_ni_syscall	/* Old sys_query_module */
 	.long sys_poll
-	.long sys_nfsservctl
+	.long sys_ni_syscall	/* Old nfsservctl */
 	.long sys_setresgid16	/* 170 */
 	.long sys_getresgid16
 	.long sys_prctl
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index fbc097a085ca..6913369c234c 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -49,7 +49,7 @@
 #include <asm/stacktrace.h>
 #include <asm/processor.h>
 #include <asm/debugreg.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <asm/system.h>
 #include <asm/traps.h>
 #include <asm/desc.h>
@@ -872,12 +872,6 @@ void __init trap_init(void)
 	set_bit(SYSCALL_VECTOR, used_vectors);
 #endif
 
-#ifdef CONFIG_X86_64
-	BUG_ON(test_bit(VSYSCALL_EMU_VECTOR, used_vectors));
-	set_system_intr_gate(VSYSCALL_EMU_VECTOR, &emulate_vsyscall);
-	set_bit(VSYSCALL_EMU_VECTOR, used_vectors);
-#endif
-
 	/*
 	 * Should be a barrier for any external CPU state:
 	 */
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 4aa9c54a9b76..0f703f10901a 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -71,7 +71,6 @@ PHDRS {
 	text PT_LOAD FLAGS(5);	/* R_E */
 	data PT_LOAD FLAGS(6);	/* RW_ */
 #ifdef CONFIG_X86_64
-	user PT_LOAD FLAGS(5);	/* R_E */
 #ifdef CONFIG_SMP
 	percpu PT_LOAD FLAGS(6);	/* RW_ */
 #endif
@@ -154,44 +153,16 @@ SECTIONS
 
 #ifdef CONFIG_X86_64
 
-#define VSYSCALL_ADDR (-10*1024*1024)
-
-#define VLOAD_OFFSET (VSYSCALL_ADDR - __vsyscall_0 + LOAD_OFFSET)
-#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
-
-#define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0)
-#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
-
-	. = ALIGN(4096);
-	__vsyscall_0 = .;
-
-	. = VSYSCALL_ADDR;
-	.vsyscall : AT(VLOAD(.vsyscall)) {
-		*(.vsyscall_0)
-
-		. = 1024;
-		*(.vsyscall_1)
-
-		. = 2048;
-		*(.vsyscall_2)
-
-		. = 4096; /* Pad the whole page. */
-	} :user =0xcc
-	. = ALIGN(__vsyscall_0 + PAGE_SIZE, PAGE_SIZE);
-
-#undef VSYSCALL_ADDR
-#undef VLOAD_OFFSET
-#undef VLOAD
-#undef VVIRT_OFFSET
-#undef VVIRT
-
+	. = ALIGN(PAGE_SIZE);
 	__vvar_page = .;
 
 	.vvar : AT(ADDR(.vvar) - LOAD_OFFSET) {
+		/* work around gold bug 13023 */
+		__vvar_beginning_hack = .;
 
-		/* Place all vvars at the offsets in asm/vvar.h. */
-#define EMIT_VVAR(name, offset) \
-	. = offset;		\
+		/* Place all vvars at the offsets in asm/vvar.h. */
+#define EMIT_VVAR(name, offset)			\
+	. = __vvar_beginning_hack + offset;	\
 	*(.vvar_ ## name)
 #define __VVAR_KERNEL_LDS
 #include <asm/vvar.h>
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index dda7dff9cef7..18ae83dd1cd7 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -18,9 +18,6 @@
  * use the vDSO.
  */
 
-/* Disable profiling for userspace code: */
-#define DISABLE_BRANCH_PROFILING
-
 #include <linux/time.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -50,12 +47,36 @@
 #include <asm/vgtod.h>
 #include <asm/traps.h>
 
+#define CREATE_TRACE_POINTS
+#include "vsyscall_trace.h"
+
 DEFINE_VVAR(int, vgetcpu_mode);
 DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
 {
 	.lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
 };
 
+static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;
+
+static int __init vsyscall_setup(char *str)
+{
+	if (str) {
+		if (!strcmp("emulate", str))
+			vsyscall_mode = EMULATE;
+		else if (!strcmp("native", str))
+			vsyscall_mode = NATIVE;
+		else if (!strcmp("none", str))
+			vsyscall_mode = NONE;
+		else
+			return -EINVAL;
+
+		return 0;
+	}
+
+	return -EINVAL;
+}
+early_param("vsyscall", vsyscall_setup);
+
 void update_vsyscall_tz(void)
 {
 	unsigned long flags;
@@ -100,7 +121,7 @@ static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
 
 	printk("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
 	       level, tsk->comm, task_pid_nr(tsk),
-	       message, regs->ip - 2, regs->cs,
+	       message, regs->ip, regs->cs,
 	       regs->sp, regs->ax, regs->si, regs->di);
 }
 
@@ -118,46 +139,39 @@ static int addr_to_vsyscall_nr(unsigned long addr)
 	return nr;
 }
 
-void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code)
+bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 {
 	struct task_struct *tsk;
 	unsigned long caller;
 	int vsyscall_nr;
 	long ret;
 
-	local_irq_enable();
-
 	/*
-	 * Real 64-bit user mode code has cs == __USER_CS.  Anything else
-	 * is bogus.
+	 * No point in checking CS -- the only way to get here is a user mode
+	 * trap to a high address, which means that we're in 64-bit user code.
 	 */
-	if (regs->cs != __USER_CS) {
-		/*
-		 * If we trapped from kernel mode, we might as well OOPS now
-		 * instead of returning to some random address and OOPSing
-		 * then.
-		 */
-		BUG_ON(!user_mode(regs));
-
-		/* Compat mode and non-compat 32-bit CS should both segfault. */
-		warn_bad_vsyscall(KERN_WARNING, regs,
-				  "illegal int 0xcc from 32-bit mode");
-		goto sigsegv;
+	WARN_ON_ONCE(address != regs->ip);
+
+	if (vsyscall_mode == NONE) {
+		warn_bad_vsyscall(KERN_INFO, regs,
+				  "vsyscall attempted with vsyscall=none");
+		return false;
 	}
 
-	/*
-	 * x86-ism here: regs->ip points to the instruction after the int 0xcc,
-	 * and int 0xcc is two bytes long.
-	 */
-	vsyscall_nr = addr_to_vsyscall_nr(regs->ip - 2);
+	vsyscall_nr = addr_to_vsyscall_nr(address);
+
+	trace_emulate_vsyscall(vsyscall_nr);
+
 	if (vsyscall_nr < 0) {
 		warn_bad_vsyscall(KERN_WARNING, regs,
-				  "illegal int 0xcc (exploit attempt?)");
+				  "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround");
 		goto sigsegv;
 	}
 
 	if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
-		warn_bad_vsyscall(KERN_WARNING, regs, "int 0xcc with bad stack (exploit attempt?)");
+		warn_bad_vsyscall(KERN_WARNING, regs,
+				  "vsyscall with bad stack (exploit attempt?)");
 		goto sigsegv;
 	}
 
@@ -202,13 +216,11 @@ void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code)
 	regs->ip = caller;
 	regs->sp += 8;
 
-	local_irq_disable();
-	return;
+	return true;
 
 sigsegv:
-	regs->ip -= 2;  /* The faulting instruction should be the int 0xcc. */
 	force_sig(SIGSEGV, current);
-	local_irq_disable();
+	return true;
 }
 
 /*
@@ -256,15 +268,21 @@ cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
 
 void __init map_vsyscall(void)
 {
-	extern char __vsyscall_0;
-	unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
+	extern char __vsyscall_page;
+	unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
 	extern char __vvar_page;
 	unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page);
 
-	/* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
-	__set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
+	__set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_vsyscall,
+		     vsyscall_mode == NATIVE
+		     ? PAGE_KERNEL_VSYSCALL
+		     : PAGE_KERNEL_VVAR);
+	BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_FIRST_PAGE) !=
+		     (unsigned long)VSYSCALL_START);
+
 	__set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR);
-	BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != (unsigned long)VVAR_ADDRESS);
+	BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) !=
+		     (unsigned long)VVAR_ADDRESS);
 }
 
 static int __init vsyscall_init(void)
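Taken together: the legacy vsyscall page keeps three entries at fixed 1024-byte offsets from VSYSCALL_START (0xffffffffff600000); see vsyscall_emu_64.S below for their new bodies. In the default vsyscall=emulate mode the page is mapped with PAGE_KERNEL_VVAR (readable but not executable), so any call faults and is handled by emulate_vsyscall(); vsyscall=native keeps the page executable, and vsyscall=none refuses the ABI entirely. The mode is chosen on the kernel command line, e.g. "vsyscall=native". The diff does not show the body of addr_to_vsyscall_nr(); a sketch of the mapping it performs, assuming the page layout above:

    /* Hedged sketch of the address -> vsyscall-number mapping the fault
     * handler relies on; the real helper is addr_to_vsyscall_nr() in
     * vsyscall_64.c. */
    static int addr_to_vsyscall_nr_sketch(unsigned long addr)
    {
            if ((addr & ~0xc00UL) != 0xffffffffff600000UL)
                    return -EINVAL;        /* not one of the three entries */
            return (addr & 0xc00UL) >> 10; /* 0: gettimeofday, 1: time, 2: getcpu */
    }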
diff --git a/arch/x86/kernel/vsyscall_emu_64.S b/arch/x86/kernel/vsyscall_emu_64.S
index ffa845eae5ca..c9596a9af159 100644
--- a/arch/x86/kernel/vsyscall_emu_64.S
+++ b/arch/x86/kernel/vsyscall_emu_64.S
@@ -7,21 +7,31 @@
  */
 
 #include <linux/linkage.h>
+
 #include <asm/irq_vectors.h>
+#include <asm/page_types.h>
+#include <asm/unistd_64.h>
+
+__PAGE_ALIGNED_DATA
+	.globl __vsyscall_page
+	.balign PAGE_SIZE, 0xcc
+	.type __vsyscall_page, @object
+__vsyscall_page:
+
+	mov $__NR_gettimeofday, %rax
+	syscall
+	ret
 
-/* The unused parts of the page are filled with 0xcc by the linker script. */
+	.balign 1024, 0xcc
+	mov $__NR_time, %rax
+	syscall
+	ret
 
-.section .vsyscall_0, "a"
-ENTRY(vsyscall_0)
-	int $VSYSCALL_EMU_VECTOR
-END(vsyscall_0)
+	.balign 1024, 0xcc
+	mov $__NR_getcpu, %rax
+	syscall
+	ret
 
-.section .vsyscall_1, "a"
-ENTRY(vsyscall_1)
-	int $VSYSCALL_EMU_VECTOR
-END(vsyscall_1)
+	.balign 4096, 0xcc
 
-.section .vsyscall_2, "a"
-ENTRY(vsyscall_2)
-	int $VSYSCALL_EMU_VECTOR
-END(vsyscall_2)
+	.size __vsyscall_page, 4096
diff --git a/arch/x86/kernel/vsyscall_trace.h b/arch/x86/kernel/vsyscall_trace.h
new file mode 100644
index 000000000000..a8b2edec54fe
--- /dev/null
+++ b/arch/x86/kernel/vsyscall_trace.h
@@ -0,0 +1,29 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM vsyscall
+
+#if !defined(__VSYSCALL_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __VSYSCALL_TRACE_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(emulate_vsyscall,
+
+	    TP_PROTO(int nr),
+
+	    TP_ARGS(nr),
+
+	    TP_STRUCT__entry(__field(int, nr)),
+
+	    TP_fast_assign(
+			   __entry->nr = nr;
+			   ),
+
+	    TP_printk("nr = %d", __entry->nr)
+);
+
+#endif
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH ../../arch/x86/kernel
+#define TRACE_INCLUDE_FILE vsyscall_trace
+#include <trace/define_trace.h>
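Since emulate_vsyscall() now fires the vsyscall:emulate_vsyscall tracepoint on every legacy call, remaining users are visible from ftrace or perf (the event can be enabled under events/vsyscall/emulate_vsyscall/ in the tracing directory). A hypothetical userspace probe -- not part of this patch -- that exercises the path by calling the time() entry at its frozen address:

    /* Hypothetical x86-64 userspace test, illustration only.  The address
     * is the second 1024-byte slot of the fixed vsyscall page. */
    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
            time_t (*vtime)(time_t *) =
                    (time_t (*)(time_t *))0xffffffffff600400UL;
            printf("time() via vsyscall page: %ld\n", (long)vtime(NULL));
            return 0;
    }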