From 265baba316ea258ca015aa79bc6f107cd9fce2b3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:26 +0200 Subject: [PATCH] Update defconfig Signed-off-by: Andi Kleen --- arch/x86_64/defconfig | 109 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 78 insertions(+), 31 deletions(-) diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig index 5fb970715941..647610ecb580 100644 --- a/arch/x86_64/defconfig +++ b/arch/x86_64/defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.18-rc4 -# Thu Aug 24 21:05:55 2006 +# Linux kernel version: 2.6.18-git5 +# Tue Sep 26 09:30:47 2006 # CONFIG_X86_64=y CONFIG_64BIT=y @@ -19,6 +19,7 @@ CONFIG_GENERIC_ISA_DMA=y CONFIG_GENERIC_IOMAP=y CONFIG_ARCH_MAY_HAVE_PC_FDC=y CONFIG_DMI=y +CONFIG_AUDIT_ARCH=y CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" # @@ -38,16 +39,16 @@ CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y # CONFIG_BSD_PROCESS_ACCT is not set # CONFIG_TASKSTATS is not set -CONFIG_SYSCTL=y # CONFIG_AUDIT is not set CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y # CONFIG_CPUSETS is not set # CONFIG_RELAY is not set CONFIG_INITRAMFS_SOURCE="" -CONFIG_UID16=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y # CONFIG_EMBEDDED is not set +CONFIG_UID16=y +CONFIG_SYSCTL=y CONFIG_KALLSYMS=y CONFIG_KALLSYMS_ALL=y # CONFIG_KALLSYMS_EXTRA_PASS is not set @@ -56,12 +57,12 @@ CONFIG_PRINTK=y CONFIG_BUG=y CONFIG_ELF_CORE=y CONFIG_BASE_FULL=y -CONFIG_RT_MUTEXES=y CONFIG_FUTEX=y CONFIG_EPOLL=y CONFIG_SHMEM=y CONFIG_SLAB=y CONFIG_VM_EVENT_COUNTERS=y +CONFIG_RT_MUTEXES=y # CONFIG_TINY_SHMEM is not set CONFIG_BASE_SMALL=0 # CONFIG_SLOB is not set @@ -160,6 +161,7 @@ CONFIG_X86_MCE_AMD=y # CONFIG_CRASH_DUMP is not set CONFIG_PHYSICAL_START=0x200000 CONFIG_SECCOMP=y +# CONFIG_CC_STACKPROTECTOR is not set # CONFIG_HZ_100 is not set CONFIG_HZ_250=y # CONFIG_HZ_1000 is not set @@ -307,18 +309,23 @@ CONFIG_IP_PNP_DHCP=y CONFIG_INET_DIAG=y CONFIG_INET_TCP_DIAG=y # CONFIG_TCP_CONG_ADVANCED is not set -CONFIG_TCP_CONG_BIC=y +CONFIG_TCP_CONG_CUBIC=y +CONFIG_DEFAULT_TCP_CONG="cubic" CONFIG_IPV6=y # CONFIG_IPV6_PRIVACY is not set # CONFIG_IPV6_ROUTER_PREF is not set # CONFIG_INET6_AH is not set # CONFIG_INET6_ESP is not set # CONFIG_INET6_IPCOMP is not set +# CONFIG_IPV6_MIP6 is not set # CONFIG_INET6_XFRM_TUNNEL is not set # CONFIG_INET6_TUNNEL is not set # CONFIG_INET6_XFRM_MODE_TRANSPORT is not set # CONFIG_INET6_XFRM_MODE_TUNNEL is not set +# CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set # CONFIG_IPV6_TUNNEL is not set +# CONFIG_IPV6_SUBTREES is not set +# CONFIG_IPV6_MULTIPLE_TABLES is not set # CONFIG_NETWORK_SECMARK is not set # CONFIG_NETFILTER is not set @@ -345,7 +352,6 @@ CONFIG_IPV6=y # CONFIG_ATALK is not set # CONFIG_X25 is not set # CONFIG_LAPB is not set -# CONFIG_NET_DIVERT is not set # CONFIG_ECONET is not set # CONFIG_WAN_ROUTER is not set @@ -487,6 +493,7 @@ CONFIG_IDEDMA_AUTO=y # # CONFIG_RAID_ATTRS is not set CONFIG_SCSI=y +CONFIG_SCSI_NETLINK=y # CONFIG_SCSI_PROC_FS is not set # @@ -508,12 +515,13 @@ CONFIG_SCSI_CONSTANTS=y # CONFIG_SCSI_LOGGING is not set # -# SCSI Transport Attributes +# SCSI Transports # CONFIG_SCSI_SPI_ATTRS=y CONFIG_SCSI_FC_ATTRS=y # CONFIG_SCSI_ISCSI_ATTRS is not set CONFIG_SCSI_SAS_ATTRS=y +# CONFIG_SCSI_SAS_LIBSAS is not set # # SCSI low-level drivers @@ -532,29 +540,14 @@ CONFIG_AIC79XX_RESET_DELAY_MS=4000 # CONFIG_AIC79XX_DEBUG_ENABLE is not set CONFIG_AIC79XX_DEBUG_MASK=0 # CONFIG_AIC79XX_REG_PRETTY_PRINT is not set +# CONFIG_SCSI_AIC94XX is not set +# CONFIG_SCSI_ARCMSR is not set CONFIG_MEGARAID_NEWGEN=y CONFIG_MEGARAID_MM=y CONFIG_MEGARAID_MAILBOX=y # CONFIG_MEGARAID_LEGACY is not set CONFIG_MEGARAID_SAS=y -CONFIG_SCSI_SATA=y -CONFIG_SCSI_SATA_AHCI=y -CONFIG_SCSI_SATA_SVW=y -CONFIG_SCSI_ATA_PIIX=y -# CONFIG_SCSI_SATA_MV is not set -CONFIG_SCSI_SATA_NV=y -# CONFIG_SCSI_PDC_ADMA is not set # CONFIG_SCSI_HPTIOP is not set -# CONFIG_SCSI_SATA_QSTOR is not set -# CONFIG_SCSI_SATA_PROMISE is not set -# CONFIG_SCSI_SATA_SX4 is not set -CONFIG_SCSI_SATA_SIL=y -# CONFIG_SCSI_SATA_SIL24 is not set -# CONFIG_SCSI_SATA_SIS is not set -# CONFIG_SCSI_SATA_ULI is not set -CONFIG_SCSI_SATA_VIA=y -# CONFIG_SCSI_SATA_VITESSE is not set -CONFIG_SCSI_SATA_INTEL_COMBINED=y # CONFIG_SCSI_BUSLOGIC is not set # CONFIG_SCSI_DMX3191D is not set # CONFIG_SCSI_EATA is not set @@ -563,6 +556,7 @@ CONFIG_SCSI_SATA_INTEL_COMBINED=y # CONFIG_SCSI_IPS is not set # CONFIG_SCSI_INITIO is not set # CONFIG_SCSI_INIA100 is not set +# CONFIG_SCSI_STEX is not set # CONFIG_SCSI_SYM53C8XX_2 is not set # CONFIG_SCSI_IPR is not set # CONFIG_SCSI_QLOGIC_1280 is not set @@ -572,6 +566,62 @@ CONFIG_SCSI_SATA_INTEL_COMBINED=y # CONFIG_SCSI_DC390T is not set # CONFIG_SCSI_DEBUG is not set +# +# Serial ATA (prod) and Parallel ATA (experimental) drivers +# +CONFIG_ATA=y +CONFIG_SATA_AHCI=y +CONFIG_SATA_SVW=y +CONFIG_ATA_PIIX=y +# CONFIG_SATA_MV is not set +CONFIG_SATA_NV=y +# CONFIG_PDC_ADMA is not set +# CONFIG_SATA_QSTOR is not set +# CONFIG_SATA_PROMISE is not set +# CONFIG_SATA_SX4 is not set +CONFIG_SATA_SIL=y +# CONFIG_SATA_SIL24 is not set +# CONFIG_SATA_SIS is not set +# CONFIG_SATA_ULI is not set +CONFIG_SATA_VIA=y +# CONFIG_SATA_VITESSE is not set +CONFIG_SATA_INTEL_COMBINED=y +# CONFIG_PATA_ALI is not set +# CONFIG_PATA_AMD is not set +# CONFIG_PATA_ARTOP is not set +# CONFIG_PATA_ATIIXP is not set +# CONFIG_PATA_CMD64X is not set +# CONFIG_PATA_CS5520 is not set +# CONFIG_PATA_CS5530 is not set +# CONFIG_PATA_CYPRESS is not set +# CONFIG_PATA_EFAR is not set +# CONFIG_ATA_GENERIC is not set +# CONFIG_PATA_HPT366 is not set +# CONFIG_PATA_HPT37X is not set +# CONFIG_PATA_HPT3X2N is not set +# CONFIG_PATA_HPT3X3 is not set +# CONFIG_PATA_IT821X is not set +# CONFIG_PATA_JMICRON is not set +# CONFIG_PATA_LEGACY is not set +# CONFIG_PATA_TRIFLEX is not set +# CONFIG_PATA_MPIIX is not set +# CONFIG_PATA_OLDPIIX is not set +# CONFIG_PATA_NETCELL is not set +# CONFIG_PATA_NS87410 is not set +# CONFIG_PATA_OPTI is not set +# CONFIG_PATA_OPTIDMA is not set +# CONFIG_PATA_PDC_OLD is not set +# CONFIG_PATA_QDI is not set +# CONFIG_PATA_RADISYS is not set +# CONFIG_PATA_RZ1000 is not set +# CONFIG_PATA_SC1200 is not set +# CONFIG_PATA_SERVERWORKS is not set +# CONFIG_PATA_PDC2027X is not set +# CONFIG_PATA_SIL680 is not set +# CONFIG_PATA_SIS is not set +# CONFIG_PATA_VIA is not set +# CONFIG_PATA_WINBOND is not set + # # Multi-device support (RAID and LVM) # @@ -678,6 +728,7 @@ CONFIG_NET_PCI=y # CONFIG_ADAPTEC_STARFIRE is not set CONFIG_B44=y CONFIG_FORCEDETH=y +# CONFIG_FORCEDETH_NAPI is not set # CONFIG_DGRS is not set # CONFIG_EEPRO100 is not set CONFIG_E100=y @@ -714,6 +765,7 @@ CONFIG_E1000=y # CONFIG_VIA_VELOCITY is not set CONFIG_TIGON3=y CONFIG_BNX2=y +# CONFIG_QLA3XXX is not set # # Ethernet (10000 Mbit) @@ -1036,6 +1088,7 @@ CONFIG_SOUND=y # Open Sound System # CONFIG_SOUND_PRIME=y +CONFIG_OSS_OBSOLETE_DRIVER=y # CONFIG_SOUND_BT878 is not set # CONFIG_SOUND_EMU10K1 is not set # CONFIG_SOUND_FUSION is not set @@ -1046,7 +1099,6 @@ CONFIG_SOUND_ICH=y # CONFIG_SOUND_MSNDPIN is not set # CONFIG_SOUND_VIA82CXXX is not set # CONFIG_SOUND_OSS is not set -# CONFIG_SOUND_TVMIXER is not set # # USB support @@ -1203,7 +1255,6 @@ CONFIG_USB_MON=y # InfiniBand support # # CONFIG_INFINIBAND is not set -# CONFIG_IPATH_CORE is not set # # EDAC - error detection and reporting (RAS) (EXPERIMENTAL) @@ -1448,10 +1499,6 @@ CONFIG_DEBUG_STACKOVERFLOW=y # # CONFIG_CRYPTO is not set -# -# Hardware crypto devices -# - # # Library routines # -- cgit v1.2.1 From 9142e0c8396b25ed4cb549b5efa636065768ebe0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:26 +0200 Subject: [PATCH] i386: Update defconfig This is based on the x86-64 defconfig which works on a wide range of systems. Signed-off-by: Andi Kleen --- arch/i386/defconfig | 1063 ++++++++++++++++++++++++--------------------------- 1 file changed, 501 insertions(+), 562 deletions(-) diff --git a/arch/i386/defconfig b/arch/i386/defconfig index 89ebb7a316ab..1a29bfa26d0c 100644 --- a/arch/i386/defconfig +++ b/arch/i386/defconfig @@ -1,41 +1,51 @@ # # Automatically generated make config: don't edit +# Linux kernel version: 2.6.18-git5 +# Tue Sep 26 09:30:47 2006 # CONFIG_X86_32=y +CONFIG_GENERIC_TIME=y +CONFIG_LOCKDEP_SUPPORT=y +CONFIG_STACKTRACE_SUPPORT=y CONFIG_SEMAPHORE_SLEEPERS=y CONFIG_X86=y CONFIG_MMU=y CONFIG_GENERIC_ISA_DMA=y CONFIG_GENERIC_IOMAP=y +CONFIG_GENERIC_HWEIGHT=y CONFIG_ARCH_MAY_HAVE_PC_FDC=y CONFIG_DMI=y +CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" # # Code maturity level options # CONFIG_EXPERIMENTAL=y -CONFIG_BROKEN_ON_SMP=y +CONFIG_LOCK_KERNEL=y CONFIG_INIT_ENV_ARG_LIMIT=32 # # General setup # CONFIG_LOCALVERSION="" -# CONFIG_LOCALVERSION_AUTO is not set +CONFIG_LOCALVERSION_AUTO=y CONFIG_SWAP=y CONFIG_SYSVIPC=y -# CONFIG_POSIX_MQUEUE is not set +CONFIG_POSIX_MQUEUE=y # CONFIG_BSD_PROCESS_ACCT is not set -CONFIG_SYSCTL=y +# CONFIG_TASKSTATS is not set # CONFIG_AUDIT is not set CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y +# CONFIG_CPUSETS is not set +# CONFIG_RELAY is not set CONFIG_INITRAMFS_SOURCE="" -CONFIG_UID16=y -CONFIG_VM86=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y # CONFIG_EMBEDDED is not set +CONFIG_UID16=y +CONFIG_SYSCTL=y CONFIG_KALLSYMS=y +CONFIG_KALLSYMS_ALL=y # CONFIG_KALLSYMS_EXTRA_PASS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y @@ -45,11 +55,9 @@ CONFIG_BASE_FULL=y CONFIG_FUTEX=y CONFIG_EPOLL=y CONFIG_SHMEM=y -CONFIG_CC_ALIGN_FUNCTIONS=0 -CONFIG_CC_ALIGN_LABELS=0 -CONFIG_CC_ALIGN_LOOPS=0 -CONFIG_CC_ALIGN_JUMPS=0 CONFIG_SLAB=y +CONFIG_VM_EVENT_COUNTERS=y +CONFIG_RT_MUTEXES=y # CONFIG_TINY_SHMEM is not set CONFIG_BASE_SMALL=0 # CONFIG_SLOB is not set @@ -60,41 +68,45 @@ CONFIG_BASE_SMALL=0 CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y -CONFIG_OBSOLETE_MODPARM=y # CONFIG_MODVERSIONS is not set # CONFIG_MODULE_SRCVERSION_ALL is not set # CONFIG_KMOD is not set +CONFIG_STOP_MACHINE=y # # Block layer # -# CONFIG_LBD is not set +CONFIG_LBD=y +# CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_LSF is not set # # IO Schedulers # CONFIG_IOSCHED_NOOP=y -# CONFIG_IOSCHED_AS is not set -# CONFIG_IOSCHED_DEADLINE is not set +CONFIG_IOSCHED_AS=y +CONFIG_IOSCHED_DEADLINE=y CONFIG_IOSCHED_CFQ=y -# CONFIG_DEFAULT_AS is not set +CONFIG_DEFAULT_AS=y # CONFIG_DEFAULT_DEADLINE is not set -CONFIG_DEFAULT_CFQ=y +# CONFIG_DEFAULT_CFQ is not set # CONFIG_DEFAULT_NOOP is not set -CONFIG_DEFAULT_IOSCHED="cfq" +CONFIG_DEFAULT_IOSCHED="anticipatory" # # Processor type and features # -CONFIG_X86_PC=y +CONFIG_SMP=y +# CONFIG_X86_PC is not set # CONFIG_X86_ELAN is not set # CONFIG_X86_VOYAGER is not set # CONFIG_X86_NUMAQ is not set # CONFIG_X86_SUMMIT is not set # CONFIG_X86_BIGSMP is not set # CONFIG_X86_VISWS is not set -# CONFIG_X86_GENERICARCH is not set +CONFIG_X86_GENERICARCH=y # CONFIG_X86_ES7000 is not set +CONFIG_X86_CYCLONE_TIMER=y # CONFIG_M386 is not set # CONFIG_M486 is not set # CONFIG_M586 is not set @@ -102,11 +114,11 @@ CONFIG_X86_PC=y # CONFIG_M586MMX is not set # CONFIG_M686 is not set # CONFIG_MPENTIUMII is not set -# CONFIG_MPENTIUMIII is not set +CONFIG_MPENTIUMIII=y # CONFIG_MPENTIUMM is not set # CONFIG_MPENTIUM4 is not set # CONFIG_MK6 is not set -CONFIG_MK7=y +# CONFIG_MK7 is not set # CONFIG_MK8 is not set # CONFIG_MCRUSOE is not set # CONFIG_MEFFICEON is not set @@ -117,10 +129,10 @@ CONFIG_MK7=y # CONFIG_MGEODE_LX is not set # CONFIG_MCYRIXIII is not set # CONFIG_MVIAC3_2 is not set -# CONFIG_X86_GENERIC is not set +CONFIG_X86_GENERIC=y CONFIG_X86_CMPXCHG=y CONFIG_X86_XADD=y -CONFIG_X86_L1_CACHE_SHIFT=6 +CONFIG_X86_L1_CACHE_SHIFT=7 CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_GENERIC_CALIBRATE_DELAY=y CONFIG_X86_WP_WORKS_OK=y @@ -131,26 +143,28 @@ CONFIG_X86_CMPXCHG64=y CONFIG_X86_GOOD_APIC=y CONFIG_X86_INTEL_USERCOPY=y CONFIG_X86_USE_PPRO_CHECKSUM=y -CONFIG_X86_USE_3DNOW=y CONFIG_X86_TSC=y -# CONFIG_HPET_TIMER is not set -# CONFIG_SMP is not set -CONFIG_PREEMPT_NONE=y -# CONFIG_PREEMPT_VOLUNTARY is not set +CONFIG_HPET_TIMER=y +CONFIG_HPET_EMULATE_RTC=y +CONFIG_NR_CPUS=32 +CONFIG_SCHED_SMT=y +CONFIG_SCHED_MC=y +# CONFIG_PREEMPT_NONE is not set +CONFIG_PREEMPT_VOLUNTARY=y # CONFIG_PREEMPT is not set -CONFIG_X86_UP_APIC=y -CONFIG_X86_UP_IOAPIC=y +CONFIG_PREEMPT_BKL=y CONFIG_X86_LOCAL_APIC=y CONFIG_X86_IO_APIC=y CONFIG_X86_MCE=y CONFIG_X86_MCE_NONFATAL=y -# CONFIG_X86_MCE_P4THERMAL is not set +CONFIG_X86_MCE_P4THERMAL=y +CONFIG_VM86=y # CONFIG_TOSHIBA is not set # CONFIG_I8K is not set # CONFIG_X86_REBOOTFIXUPS is not set -# CONFIG_MICROCODE is not set -# CONFIG_X86_MSR is not set -# CONFIG_X86_CPUID is not set +CONFIG_MICROCODE=y +CONFIG_X86_MSR=y +CONFIG_X86_CPUID=y # # Firmware Drivers @@ -158,68 +172,67 @@ CONFIG_X86_MCE_NONFATAL=y # CONFIG_EDD is not set # CONFIG_DELL_RBU is not set # CONFIG_DCDBAS is not set -CONFIG_NOHIGHMEM=y -# CONFIG_HIGHMEM4G is not set +# CONFIG_NOHIGHMEM is not set +CONFIG_HIGHMEM4G=y # CONFIG_HIGHMEM64G is not set -CONFIG_VMSPLIT_3G=y -# CONFIG_VMSPLIT_3G_OPT is not set -# CONFIG_VMSPLIT_2G is not set -# CONFIG_VMSPLIT_1G is not set CONFIG_PAGE_OFFSET=0xC0000000 -CONFIG_ARCH_FLATMEM_ENABLE=y -CONFIG_ARCH_SPARSEMEM_ENABLE=y -CONFIG_ARCH_SELECT_MEMORY_MODEL=y +CONFIG_HIGHMEM=y CONFIG_SELECT_MEMORY_MODEL=y CONFIG_FLATMEM_MANUAL=y # CONFIG_DISCONTIGMEM_MANUAL is not set # CONFIG_SPARSEMEM_MANUAL is not set CONFIG_FLATMEM=y CONFIG_FLAT_NODE_MEM_MAP=y -CONFIG_SPARSEMEM_STATIC=y +# CONFIG_SPARSEMEM_STATIC is not set CONFIG_SPLIT_PTLOCK_CPUS=4 +CONFIG_RESOURCES_64BIT=y +# CONFIG_HIGHPTE is not set # CONFIG_MATH_EMULATION is not set CONFIG_MTRR=y # CONFIG_EFI is not set +# CONFIG_IRQBALANCE is not set CONFIG_REGPARM=y -# CONFIG_SECCOMP is not set -CONFIG_HZ_100=y -# CONFIG_HZ_250 is not set +CONFIG_SECCOMP=y +# CONFIG_HZ_100 is not set +CONFIG_HZ_250=y # CONFIG_HZ_1000 is not set -CONFIG_HZ=100 +CONFIG_HZ=250 # CONFIG_KEXEC is not set +# CONFIG_CRASH_DUMP is not set CONFIG_PHYSICAL_START=0x100000 -CONFIG_DOUBLEFAULT=y +# CONFIG_HOTPLUG_CPU is not set +CONFIG_COMPAT_VDSO=y +CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y # # Power management options (ACPI, APM) # CONFIG_PM=y -# CONFIG_PM_LEGACY is not set +CONFIG_PM_LEGACY=y # CONFIG_PM_DEBUG is not set -CONFIG_SOFTWARE_SUSPEND=y -CONFIG_PM_STD_PARTITION="" # # ACPI (Advanced Configuration and Power Interface) Support # CONFIG_ACPI=y -# CONFIG_ACPI_SLEEP is not set -# CONFIG_ACPI_AC is not set -# CONFIG_ACPI_BATTERY is not set -# CONFIG_ACPI_BUTTON is not set +CONFIG_ACPI_AC=y +CONFIG_ACPI_BATTERY=y +CONFIG_ACPI_BUTTON=y # CONFIG_ACPI_VIDEO is not set # CONFIG_ACPI_HOTKEY is not set -# CONFIG_ACPI_FAN is not set -# CONFIG_ACPI_PROCESSOR is not set +CONFIG_ACPI_FAN=y +# CONFIG_ACPI_DOCK is not set +CONFIG_ACPI_PROCESSOR=y +CONFIG_ACPI_THERMAL=y # CONFIG_ACPI_ASUS is not set # CONFIG_ACPI_IBM is not set # CONFIG_ACPI_TOSHIBA is not set -CONFIG_ACPI_BLACKLIST_YEAR=0 -# CONFIG_ACPI_DEBUG is not set +CONFIG_ACPI_BLACKLIST_YEAR=2001 +CONFIG_ACPI_DEBUG=y CONFIG_ACPI_EC=y CONFIG_ACPI_POWER=y CONFIG_ACPI_SYSTEM=y -# CONFIG_X86_PM_TIMER is not set +CONFIG_X86_PM_TIMER=y # CONFIG_ACPI_CONTAINER is not set # @@ -230,7 +243,41 @@ CONFIG_ACPI_SYSTEM=y # # CPU Frequency scaling # -# CONFIG_CPU_FREQ is not set +CONFIG_CPU_FREQ=y +CONFIG_CPU_FREQ_TABLE=y +CONFIG_CPU_FREQ_DEBUG=y +CONFIG_CPU_FREQ_STAT=y +# CONFIG_CPU_FREQ_STAT_DETAILS is not set +CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y +# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set +CONFIG_CPU_FREQ_GOV_PERFORMANCE=y +# CONFIG_CPU_FREQ_GOV_POWERSAVE is not set +CONFIG_CPU_FREQ_GOV_USERSPACE=y +CONFIG_CPU_FREQ_GOV_ONDEMAND=y +# CONFIG_CPU_FREQ_GOV_CONSERVATIVE is not set + +# +# CPUFreq processor drivers +# +CONFIG_X86_ACPI_CPUFREQ=y +# CONFIG_X86_POWERNOW_K6 is not set +# CONFIG_X86_POWERNOW_K7 is not set +CONFIG_X86_POWERNOW_K8=y +CONFIG_X86_POWERNOW_K8_ACPI=y +# CONFIG_X86_GX_SUSPMOD is not set +# CONFIG_X86_SPEEDSTEP_CENTRINO is not set +# CONFIG_X86_SPEEDSTEP_ICH is not set +# CONFIG_X86_SPEEDSTEP_SMI is not set +# CONFIG_X86_P4_CLOCKMOD is not set +# CONFIG_X86_CPUFREQ_NFORCE2 is not set +# CONFIG_X86_LONGRUN is not set +# CONFIG_X86_LONGHAUL is not set + +# +# shared options +# +CONFIG_X86_ACPI_CPUFREQ_PROC_INTF=y +# CONFIG_X86_SPEEDSTEP_LIB is not set # # Bus options (PCI, PCMCIA, EISA, MCA, ISA) @@ -244,12 +291,13 @@ CONFIG_PCI_BIOS=y CONFIG_PCI_DIRECT=y CONFIG_PCI_MMCONFIG=y # CONFIG_PCIEPORTBUS is not set -# CONFIG_PCI_MSI is not set -# CONFIG_PCI_LEGACY_PROC is not set +CONFIG_PCI_MSI=y +# CONFIG_PCI_DEBUG is not set CONFIG_ISA_DMA_API=y # CONFIG_ISA is not set # CONFIG_MCA is not set # CONFIG_SCx200 is not set +CONFIG_K8_NB=y # # PCCARD (PCMCIA/CardBus) support @@ -278,93 +326,54 @@ CONFIG_NET=y # # CONFIG_NETDEBUG is not set CONFIG_PACKET=y -CONFIG_PACKET_MMAP=y +# CONFIG_PACKET_MMAP is not set CONFIG_UNIX=y +CONFIG_XFRM=y +# CONFIG_XFRM_USER is not set +# CONFIG_XFRM_SUB_POLICY is not set # CONFIG_NET_KEY is not set CONFIG_INET=y -# CONFIG_IP_MULTICAST is not set +CONFIG_IP_MULTICAST=y # CONFIG_IP_ADVANCED_ROUTER is not set CONFIG_IP_FIB_HASH=y -# CONFIG_IP_PNP is not set +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +# CONFIG_IP_PNP_BOOTP is not set +# CONFIG_IP_PNP_RARP is not set # CONFIG_NET_IPIP is not set # CONFIG_NET_IPGRE is not set +# CONFIG_IP_MROUTE is not set # CONFIG_ARPD is not set # CONFIG_SYN_COOKIES is not set # CONFIG_INET_AH is not set # CONFIG_INET_ESP is not set # CONFIG_INET_IPCOMP is not set +# CONFIG_INET_XFRM_TUNNEL is not set # CONFIG_INET_TUNNEL is not set -# CONFIG_INET_DIAG is not set +CONFIG_INET_XFRM_MODE_TRANSPORT=y +CONFIG_INET_XFRM_MODE_TUNNEL=y +CONFIG_INET_DIAG=y +CONFIG_INET_TCP_DIAG=y # CONFIG_TCP_CONG_ADVANCED is not set -CONFIG_TCP_CONG_BIC=y - -# -# IP: Virtual Server Configuration -# -# CONFIG_IP_VS is not set -# CONFIG_IPV6 is not set -CONFIG_NETFILTER=y -# CONFIG_NETFILTER_DEBUG is not set - -# -# Core Netfilter Configuration -# -# CONFIG_NETFILTER_NETLINK is not set -CONFIG_NETFILTER_XTABLES=y -# CONFIG_NETFILTER_XT_TARGET_CLASSIFY is not set -# CONFIG_NETFILTER_XT_TARGET_MARK is not set -# CONFIG_NETFILTER_XT_TARGET_NFQUEUE is not set -# CONFIG_NETFILTER_XT_MATCH_COMMENT is not set -# CONFIG_NETFILTER_XT_MATCH_CONNTRACK is not set -# CONFIG_NETFILTER_XT_MATCH_DCCP is not set -# CONFIG_NETFILTER_XT_MATCH_HELPER is not set -# CONFIG_NETFILTER_XT_MATCH_LENGTH is not set -CONFIG_NETFILTER_XT_MATCH_LIMIT=y -CONFIG_NETFILTER_XT_MATCH_MAC=y -# CONFIG_NETFILTER_XT_MATCH_MARK is not set -# CONFIG_NETFILTER_XT_MATCH_PKTTYPE is not set -# CONFIG_NETFILTER_XT_MATCH_REALM is not set -# CONFIG_NETFILTER_XT_MATCH_SCTP is not set -CONFIG_NETFILTER_XT_MATCH_STATE=y -# CONFIG_NETFILTER_XT_MATCH_STRING is not set -# CONFIG_NETFILTER_XT_MATCH_TCPMSS is not set - -# -# IP: Netfilter Configuration -# -CONFIG_IP_NF_CONNTRACK=y -# CONFIG_IP_NF_CT_ACCT is not set -# CONFIG_IP_NF_CONNTRACK_MARK is not set -# CONFIG_IP_NF_CONNTRACK_EVENTS is not set -# CONFIG_IP_NF_CT_PROTO_SCTP is not set -CONFIG_IP_NF_FTP=y -# CONFIG_IP_NF_IRC is not set -# CONFIG_IP_NF_NETBIOS_NS is not set -# CONFIG_IP_NF_TFTP is not set -# CONFIG_IP_NF_AMANDA is not set -# CONFIG_IP_NF_PPTP is not set -# CONFIG_IP_NF_QUEUE is not set -CONFIG_IP_NF_IPTABLES=y -# CONFIG_IP_NF_MATCH_IPRANGE is not set -# CONFIG_IP_NF_MATCH_MULTIPORT is not set -# CONFIG_IP_NF_MATCH_TOS is not set -# CONFIG_IP_NF_MATCH_RECENT is not set -# CONFIG_IP_NF_MATCH_ECN is not set -# CONFIG_IP_NF_MATCH_DSCP is not set -# CONFIG_IP_NF_MATCH_AH_ESP is not set -# CONFIG_IP_NF_MATCH_TTL is not set -# CONFIG_IP_NF_MATCH_OWNER is not set -# CONFIG_IP_NF_MATCH_ADDRTYPE is not set -# CONFIG_IP_NF_MATCH_HASHLIMIT is not set -CONFIG_IP_NF_FILTER=y -# CONFIG_IP_NF_TARGET_REJECT is not set -CONFIG_IP_NF_TARGET_LOG=y -# CONFIG_IP_NF_TARGET_ULOG is not set -# CONFIG_IP_NF_TARGET_TCPMSS is not set -# CONFIG_IP_NF_NAT is not set -# CONFIG_IP_NF_MANGLE is not set -# CONFIG_IP_NF_RAW is not set -# CONFIG_IP_NF_ARPTABLES is not set +CONFIG_TCP_CONG_CUBIC=y +CONFIG_DEFAULT_TCP_CONG="cubic" +CONFIG_IPV6=y +# CONFIG_IPV6_PRIVACY is not set +# CONFIG_IPV6_ROUTER_PREF is not set +# CONFIG_INET6_AH is not set +# CONFIG_INET6_ESP is not set +# CONFIG_INET6_IPCOMP is not set +# CONFIG_IPV6_MIP6 is not set +# CONFIG_INET6_XFRM_TUNNEL is not set +# CONFIG_INET6_TUNNEL is not set +CONFIG_INET6_XFRM_MODE_TRANSPORT=y +CONFIG_INET6_XFRM_MODE_TUNNEL=y +# CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set +# CONFIG_IPV6_TUNNEL is not set +# CONFIG_IPV6_SUBTREES is not set +# CONFIG_IPV6_MULTIPLE_TABLES is not set +# CONFIG_NETWORK_SECMARK is not set +# CONFIG_NETFILTER is not set # # DCCP Configuration (EXPERIMENTAL) @@ -389,7 +398,6 @@ CONFIG_IP_NF_TARGET_LOG=y # CONFIG_ATALK is not set # CONFIG_X25 is not set # CONFIG_LAPB is not set -# CONFIG_NET_DIVERT is not set # CONFIG_ECONET is not set # CONFIG_WAN_ROUTER is not set @@ -402,6 +410,7 @@ CONFIG_IP_NF_TARGET_LOG=y # Network testing # # CONFIG_NET_PKTGEN is not set +# CONFIG_NET_TCPPROBE is not set # CONFIG_HAMRADIO is not set # CONFIG_IRDA is not set # CONFIG_BT is not set @@ -416,7 +425,9 @@ CONFIG_IP_NF_TARGET_LOG=y # CONFIG_STANDALONE=y CONFIG_PREVENT_FIRMWARE_BUILD=y -# CONFIG_FW_LOADER is not set +CONFIG_FW_LOADER=y +# CONFIG_DEBUG_DRIVER is not set +# CONFIG_SYS_HYPERVISOR is not set # # Connector - unified userspace <-> kernelspace linker @@ -431,13 +442,7 @@ CONFIG_PREVENT_FIRMWARE_BUILD=y # # Parallel port support # -CONFIG_PARPORT=y -CONFIG_PARPORT_PC=y -# CONFIG_PARPORT_SERIAL is not set -# CONFIG_PARPORT_PC_FIFO is not set -# CONFIG_PARPORT_PC_SUPERIO is not set -# CONFIG_PARPORT_GSC is not set -CONFIG_PARPORT_1284=y +# CONFIG_PARPORT is not set # # Plug and Play support @@ -447,8 +452,7 @@ CONFIG_PARPORT_1284=y # # Block devices # -# CONFIG_BLK_DEV_FD is not set -# CONFIG_PARIDE is not set +CONFIG_BLK_DEV_FD=y # CONFIG_BLK_CPQ_DA is not set # CONFIG_BLK_CPQ_CISS_DA is not set # CONFIG_BLK_DEV_DAC960 is not set @@ -459,8 +463,11 @@ CONFIG_BLK_DEV_LOOP=y # CONFIG_BLK_DEV_NBD is not set # CONFIG_BLK_DEV_SX8 is not set # CONFIG_BLK_DEV_UB is not set -# CONFIG_BLK_DEV_RAM is not set +CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_COUNT=16 +CONFIG_BLK_DEV_RAM_SIZE=4096 +CONFIG_BLK_DEV_RAM_BLOCKSIZE=1024 +CONFIG_BLK_DEV_INITRD=y # CONFIG_CDROM_PKTCDVD is not set # CONFIG_ATA_OVER_ETH is not set @@ -476,7 +483,7 @@ CONFIG_BLK_DEV_IDE=y # CONFIG_BLK_DEV_IDE_SATA is not set # CONFIG_BLK_DEV_HD_IDE is not set CONFIG_BLK_DEV_IDEDISK=y -# CONFIG_IDEDISK_MULTI_MODE is not set +CONFIG_IDEDISK_MULTI_MODE=y CONFIG_BLK_DEV_IDECD=y # CONFIG_BLK_DEV_IDETAPE is not set # CONFIG_BLK_DEV_IDEFLOPPY is not set @@ -486,10 +493,10 @@ CONFIG_BLK_DEV_IDECD=y # # IDE chipset support/bugfixes # -# CONFIG_IDE_GENERIC is not set +CONFIG_IDE_GENERIC=y # CONFIG_BLK_DEV_CMD640 is not set CONFIG_BLK_DEV_IDEPCI=y -CONFIG_IDEPCI_SHARE_IRQ=y +# CONFIG_IDEPCI_SHARE_IRQ is not set # CONFIG_BLK_DEV_OFFBOARD is not set # CONFIG_BLK_DEV_GENERIC is not set # CONFIG_BLK_DEV_OPTI621 is not set @@ -500,7 +507,7 @@ CONFIG_IDEDMA_PCI_AUTO=y # CONFIG_IDEDMA_ONLYDISK is not set # CONFIG_BLK_DEV_AEC62XX is not set # CONFIG_BLK_DEV_ALI15X3 is not set -# CONFIG_BLK_DEV_AMD74XX is not set +CONFIG_BLK_DEV_AMD74XX=y # CONFIG_BLK_DEV_ATIIXP is not set # CONFIG_BLK_DEV_CMD64X is not set # CONFIG_BLK_DEV_TRIFLEX is not set @@ -511,7 +518,7 @@ CONFIG_IDEDMA_PCI_AUTO=y # CONFIG_BLK_DEV_HPT34X is not set # CONFIG_BLK_DEV_HPT366 is not set # CONFIG_BLK_DEV_SC1200 is not set -# CONFIG_BLK_DEV_PIIX is not set +CONFIG_BLK_DEV_PIIX=y # CONFIG_BLK_DEV_IT821X is not set # CONFIG_BLK_DEV_NS87415 is not set # CONFIG_BLK_DEV_PDC202XX_OLD is not set @@ -521,7 +528,7 @@ CONFIG_IDEDMA_PCI_AUTO=y # CONFIG_BLK_DEV_SIS5513 is not set # CONFIG_BLK_DEV_SLC90E66 is not set # CONFIG_BLK_DEV_TRM290 is not set -CONFIG_BLK_DEV_VIA82CXXX=y +# CONFIG_BLK_DEV_VIA82CXXX is not set # CONFIG_IDE_ARM is not set CONFIG_BLK_DEV_IDEDMA=y # CONFIG_IDEDMA_IVB is not set @@ -533,6 +540,7 @@ CONFIG_IDEDMA_AUTO=y # # CONFIG_RAID_ATTRS is not set CONFIG_SCSI=y +CONFIG_SCSI_NETLINK=y # CONFIG_SCSI_PROC_FS is not set # @@ -541,8 +549,9 @@ CONFIG_SCSI=y CONFIG_BLK_DEV_SD=y # CONFIG_CHR_DEV_ST is not set # CONFIG_CHR_DEV_OSST is not set -# CONFIG_BLK_DEV_SR is not set -# CONFIG_CHR_DEV_SG is not set +CONFIG_BLK_DEV_SR=y +# CONFIG_BLK_DEV_SR_VENDOR is not set +CONFIG_CHR_DEV_SG=y # CONFIG_CHR_DEV_SCH is not set # @@ -553,29 +562,44 @@ CONFIG_BLK_DEV_SD=y # CONFIG_SCSI_LOGGING is not set # -# SCSI Transport Attributes +# SCSI Transports # -# CONFIG_SCSI_SPI_ATTRS is not set -# CONFIG_SCSI_FC_ATTRS is not set +CONFIG_SCSI_SPI_ATTRS=y +CONFIG_SCSI_FC_ATTRS=y # CONFIG_SCSI_ISCSI_ATTRS is not set # CONFIG_SCSI_SAS_ATTRS is not set +# CONFIG_SCSI_SAS_LIBSAS is not set # # SCSI low-level drivers # # CONFIG_ISCSI_TCP is not set -# CONFIG_BLK_DEV_3W_XXXX_RAID is not set +CONFIG_BLK_DEV_3W_XXXX_RAID=y # CONFIG_SCSI_3W_9XXX is not set # CONFIG_SCSI_ACARD is not set # CONFIG_SCSI_AACRAID is not set -# CONFIG_SCSI_AIC7XXX is not set +CONFIG_SCSI_AIC7XXX=y +CONFIG_AIC7XXX_CMDS_PER_DEVICE=32 +CONFIG_AIC7XXX_RESET_DELAY_MS=5000 +CONFIG_AIC7XXX_DEBUG_ENABLE=y +CONFIG_AIC7XXX_DEBUG_MASK=0 +CONFIG_AIC7XXX_REG_PRETTY_PRINT=y # CONFIG_SCSI_AIC7XXX_OLD is not set -# CONFIG_SCSI_AIC79XX is not set +CONFIG_SCSI_AIC79XX=y +CONFIG_AIC79XX_CMDS_PER_DEVICE=32 +CONFIG_AIC79XX_RESET_DELAY_MS=4000 +# CONFIG_AIC79XX_ENABLE_RD_STRM is not set +# CONFIG_AIC79XX_DEBUG_ENABLE is not set +CONFIG_AIC79XX_DEBUG_MASK=0 +# CONFIG_AIC79XX_REG_PRETTY_PRINT is not set +# CONFIG_SCSI_AIC94XX is not set # CONFIG_SCSI_DPT_I2O is not set +# CONFIG_SCSI_ADVANSYS is not set +# CONFIG_SCSI_ARCMSR is not set # CONFIG_MEGARAID_NEWGEN is not set # CONFIG_MEGARAID_LEGACY is not set # CONFIG_MEGARAID_SAS is not set -# CONFIG_SCSI_SATA is not set +# CONFIG_SCSI_HPTIOP is not set # CONFIG_SCSI_BUSLOGIC is not set # CONFIG_SCSI_DMX3191D is not set # CONFIG_SCSI_EATA is not set @@ -584,11 +608,9 @@ CONFIG_BLK_DEV_SD=y # CONFIG_SCSI_IPS is not set # CONFIG_SCSI_INITIO is not set # CONFIG_SCSI_INIA100 is not set -# CONFIG_SCSI_PPA is not set -# CONFIG_SCSI_IMM is not set +# CONFIG_SCSI_STEX is not set # CONFIG_SCSI_SYM53C8XX_2 is not set # CONFIG_SCSI_IPR is not set -# CONFIG_SCSI_QLOGIC_FC is not set # CONFIG_SCSI_QLOGIC_1280 is not set # CONFIG_SCSI_QLA_FC is not set # CONFIG_SCSI_LPFC is not set @@ -597,23 +619,115 @@ CONFIG_BLK_DEV_SD=y # CONFIG_SCSI_NSP32 is not set # CONFIG_SCSI_DEBUG is not set +# +# Serial ATA (prod) and Parallel ATA (experimental) drivers +# +CONFIG_ATA=y +CONFIG_SATA_AHCI=y +CONFIG_SATA_SVW=y +CONFIG_ATA_PIIX=y +# CONFIG_SATA_MV is not set +CONFIG_SATA_NV=y +# CONFIG_PDC_ADMA is not set +# CONFIG_SATA_QSTOR is not set +# CONFIG_SATA_PROMISE is not set +# CONFIG_SATA_SX4 is not set +CONFIG_SATA_SIL=y +# CONFIG_SATA_SIL24 is not set +# CONFIG_SATA_SIS is not set +# CONFIG_SATA_ULI is not set +CONFIG_SATA_VIA=y +# CONFIG_SATA_VITESSE is not set +CONFIG_SATA_INTEL_COMBINED=y +# CONFIG_PATA_ALI is not set +# CONFIG_PATA_AMD is not set +# CONFIG_PATA_ARTOP is not set +# CONFIG_PATA_ATIIXP is not set +# CONFIG_PATA_CMD64X is not set +# CONFIG_PATA_CS5520 is not set +# CONFIG_PATA_CS5530 is not set +# CONFIG_PATA_CS5535 is not set +# CONFIG_PATA_CYPRESS is not set +# CONFIG_PATA_EFAR is not set +# CONFIG_ATA_GENERIC is not set +# CONFIG_PATA_HPT366 is not set +# CONFIG_PATA_HPT37X is not set +# CONFIG_PATA_HPT3X2N is not set +# CONFIG_PATA_HPT3X3 is not set +# CONFIG_PATA_IT821X is not set +# CONFIG_PATA_JMICRON is not set +# CONFIG_PATA_LEGACY is not set +# CONFIG_PATA_TRIFLEX is not set +# CONFIG_PATA_MPIIX is not set +# CONFIG_PATA_OLDPIIX is not set +# CONFIG_PATA_NETCELL is not set +# CONFIG_PATA_NS87410 is not set +# CONFIG_PATA_OPTI is not set +# CONFIG_PATA_OPTIDMA is not set +# CONFIG_PATA_PDC_OLD is not set +# CONFIG_PATA_QDI is not set +# CONFIG_PATA_RADISYS is not set +# CONFIG_PATA_RZ1000 is not set +# CONFIG_PATA_SC1200 is not set +# CONFIG_PATA_SERVERWORKS is not set +# CONFIG_PATA_PDC2027X is not set +# CONFIG_PATA_SIL680 is not set +# CONFIG_PATA_SIS is not set +# CONFIG_PATA_VIA is not set +# CONFIG_PATA_WINBOND is not set + # # Multi-device support (RAID and LVM) # -# CONFIG_MD is not set +CONFIG_MD=y +# CONFIG_BLK_DEV_MD is not set +CONFIG_BLK_DEV_DM=y +# CONFIG_DM_CRYPT is not set +# CONFIG_DM_SNAPSHOT is not set +# CONFIG_DM_MIRROR is not set +# CONFIG_DM_ZERO is not set +# CONFIG_DM_MULTIPATH is not set # # Fusion MPT device support # -# CONFIG_FUSION is not set -# CONFIG_FUSION_SPI is not set +CONFIG_FUSION=y +CONFIG_FUSION_SPI=y # CONFIG_FUSION_FC is not set # CONFIG_FUSION_SAS is not set +CONFIG_FUSION_MAX_SGE=128 +# CONFIG_FUSION_CTL is not set # # IEEE 1394 (FireWire) support # -# CONFIG_IEEE1394 is not set +CONFIG_IEEE1394=y + +# +# Subsystem Options +# +# CONFIG_IEEE1394_VERBOSEDEBUG is not set +# CONFIG_IEEE1394_OUI_DB is not set +# CONFIG_IEEE1394_EXTRA_CONFIG_ROMS is not set +# CONFIG_IEEE1394_EXPORT_FULL_API is not set + +# +# Device Drivers +# + +# +# Texas Instruments PCILynx requires I2C +# +CONFIG_IEEE1394_OHCI1394=y + +# +# Protocol Drivers +# +# CONFIG_IEEE1394_VIDEO1394 is not set +# CONFIG_IEEE1394_SBP2 is not set +# CONFIG_IEEE1394_ETH1394 is not set +# CONFIG_IEEE1394_DV1394 is not set +CONFIG_IEEE1394_RAWIO=y # # I2O device support @@ -652,46 +766,63 @@ CONFIG_MII=y # # Tulip family network device support # -# CONFIG_NET_TULIP is not set +CONFIG_NET_TULIP=y +# CONFIG_DE2104X is not set +CONFIG_TULIP=y +# CONFIG_TULIP_MWI is not set +# CONFIG_TULIP_MMIO is not set +# CONFIG_TULIP_NAPI is not set +# CONFIG_DE4X5 is not set +# CONFIG_WINBOND_840 is not set +# CONFIG_DM9102 is not set +# CONFIG_ULI526X is not set # CONFIG_HP100 is not set CONFIG_NET_PCI=y # CONFIG_PCNET32 is not set # CONFIG_AMD8111_ETH is not set # CONFIG_ADAPTEC_STARFIRE is not set -# CONFIG_B44 is not set -# CONFIG_FORCEDETH is not set +CONFIG_B44=y +CONFIG_FORCEDETH=y +# CONFIG_FORCEDETH_NAPI is not set # CONFIG_DGRS is not set # CONFIG_EEPRO100 is not set CONFIG_E100=y # CONFIG_FEALNX is not set # CONFIG_NATSEMI is not set # CONFIG_NE2K_PCI is not set -# CONFIG_8139CP is not set -# CONFIG_8139TOO is not set +CONFIG_8139CP=y +CONFIG_8139TOO=y +# CONFIG_8139TOO_PIO is not set +# CONFIG_8139TOO_TUNE_TWISTER is not set +# CONFIG_8139TOO_8129 is not set +# CONFIG_8139_OLD_RX_RESET is not set # CONFIG_SIS900 is not set # CONFIG_EPIC100 is not set # CONFIG_SUNDANCE is not set # CONFIG_TLAN is not set # CONFIG_VIA_RHINE is not set -# CONFIG_NET_POCKET is not set # # Ethernet (1000 Mbit) # # CONFIG_ACENIC is not set # CONFIG_DL2K is not set -# CONFIG_E1000 is not set +CONFIG_E1000=y +# CONFIG_E1000_NAPI is not set +# CONFIG_E1000_DISABLE_PACKET_SPLIT is not set # CONFIG_NS83820 is not set # CONFIG_HAMACHI is not set # CONFIG_YELLOWFIN is not set -# CONFIG_R8169 is not set +CONFIG_R8169=y +# CONFIG_R8169_NAPI is not set # CONFIG_SIS190 is not set # CONFIG_SKGE is not set -# CONFIG_SKY2 is not set +CONFIG_SKY2=y # CONFIG_SK98LIN is not set # CONFIG_VIA_VELOCITY is not set -# CONFIG_TIGON3 is not set -# CONFIG_BNX2 is not set +CONFIG_TIGON3=y +CONFIG_BNX2=y +# CONFIG_QLA3XXX is not set # # Ethernet (10000 Mbit) @@ -699,6 +830,7 @@ CONFIG_E100=y # CONFIG_CHELSIO_T1 is not set # CONFIG_IXGB is not set # CONFIG_S2IO is not set +# CONFIG_MYRI10GE is not set # # Token Ring devices @@ -716,14 +848,15 @@ CONFIG_E100=y # CONFIG_WAN is not set # CONFIG_FDDI is not set # CONFIG_HIPPI is not set -# CONFIG_PLIP is not set # CONFIG_PPP is not set # CONFIG_SLIP is not set # CONFIG_NET_FC is not set # CONFIG_SHAPER is not set -# CONFIG_NETCONSOLE is not set -# CONFIG_NETPOLL is not set -# CONFIG_NET_POLL_CONTROLLER is not set +CONFIG_NETCONSOLE=y +CONFIG_NETPOLL=y +# CONFIG_NETPOLL_RX is not set +# CONFIG_NETPOLL_TRAP is not set +CONFIG_NET_POLL_CONTROLLER=y # # ISDN subsystem @@ -745,8 +878,8 @@ CONFIG_INPUT=y # CONFIG_INPUT_MOUSEDEV=y CONFIG_INPUT_MOUSEDEV_PSAUX=y -CONFIG_INPUT_MOUSEDEV_SCREEN_X=1280 -CONFIG_INPUT_MOUSEDEV_SCREEN_Y=1024 +CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 +CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 # CONFIG_INPUT_JOYDEV is not set # CONFIG_INPUT_TSDEV is not set CONFIG_INPUT_EVDEV=y @@ -776,7 +909,6 @@ CONFIG_SERIO=y CONFIG_SERIO_I8042=y # CONFIG_SERIO_SERPORT is not set # CONFIG_SERIO_CT82C710 is not set -# CONFIG_SERIO_PARKBD is not set # CONFIG_SERIO_PCIPS2 is not set CONFIG_SERIO_LIBPS2=y # CONFIG_SERIO_RAW is not set @@ -788,14 +920,15 @@ CONFIG_SERIO_LIBPS2=y CONFIG_VT=y CONFIG_VT_CONSOLE=y CONFIG_HW_CONSOLE=y +# CONFIG_VT_HW_CONSOLE_BINDING is not set # CONFIG_SERIAL_NONSTANDARD is not set # # Serial drivers # CONFIG_SERIAL_8250=y -# CONFIG_SERIAL_8250_CONSOLE is not set -# CONFIG_SERIAL_8250_ACPI is not set +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_8250_PCI=y CONFIG_SERIAL_8250_NR_UARTS=4 CONFIG_SERIAL_8250_RUNTIME_UARTS=4 # CONFIG_SERIAL_8250_EXTENDED is not set @@ -804,14 +937,11 @@ CONFIG_SERIAL_8250_RUNTIME_UARTS=4 # Non-8250 serial port support # CONFIG_SERIAL_CORE=y +CONFIG_SERIAL_CORE_CONSOLE=y # CONFIG_SERIAL_JSM is not set CONFIG_UNIX98_PTYS=y CONFIG_LEGACY_PTYS=y CONFIG_LEGACY_PTY_COUNT=256 -CONFIG_PRINTER=y -# CONFIG_LP_CONSOLE is not set -# CONFIG_PPDEV is not set -# CONFIG_TIPAR is not set # # IPMI @@ -822,8 +952,12 @@ CONFIG_PRINTER=y # Watchdog Cards # # CONFIG_WATCHDOG is not set -# CONFIG_HW_RANDOM is not set -CONFIG_NVRAM=y +CONFIG_HW_RANDOM=y +CONFIG_HW_RANDOM_INTEL=y +CONFIG_HW_RANDOM_AMD=y +CONFIG_HW_RANDOM_GEODE=y +CONFIG_HW_RANDOM_VIA=y +# CONFIG_NVRAM is not set CONFIG_RTC=y # CONFIG_DTLK is not set # CONFIG_R3964 is not set @@ -833,31 +967,28 @@ CONFIG_RTC=y # # Ftape, the floppy tape device driver # -# CONFIG_FTAPE is not set CONFIG_AGP=y # CONFIG_AGP_ALI is not set # CONFIG_AGP_ATI is not set # CONFIG_AGP_AMD is not set -# CONFIG_AGP_AMD64 is not set -# CONFIG_AGP_INTEL is not set +CONFIG_AGP_AMD64=y +CONFIG_AGP_INTEL=y # CONFIG_AGP_NVIDIA is not set # CONFIG_AGP_SIS is not set # CONFIG_AGP_SWORKS is not set -CONFIG_AGP_VIA=y +# CONFIG_AGP_VIA is not set # CONFIG_AGP_EFFICEON is not set -CONFIG_DRM=y -# CONFIG_DRM_TDFX is not set -# CONFIG_DRM_R128 is not set -CONFIG_DRM_RADEON=y -# CONFIG_DRM_MGA is not set -# CONFIG_DRM_SIS is not set -# CONFIG_DRM_VIA is not set -# CONFIG_DRM_SAVAGE is not set +# CONFIG_DRM is not set # CONFIG_MWAVE is not set +# CONFIG_PC8736x_GPIO is not set +# CONFIG_NSC_GPIO is not set # CONFIG_CS5535_GPIO is not set -# CONFIG_RAW_DRIVER is not set -# CONFIG_HPET is not set -# CONFIG_HANGCHECK_TIMER is not set +CONFIG_RAW_DRIVER=y +CONFIG_MAX_RAW_DEVS=256 +CONFIG_HPET=y +# CONFIG_HPET_RTC_IRQ is not set +CONFIG_HPET_MMAP=y +CONFIG_HANGCHECK_TIMER=y # # TPM devices @@ -868,59 +999,7 @@ CONFIG_DRM_RADEON=y # # I2C support # -CONFIG_I2C=y -CONFIG_I2C_CHARDEV=y - -# -# I2C Algorithms -# -CONFIG_I2C_ALGOBIT=y -# CONFIG_I2C_ALGOPCF is not set -# CONFIG_I2C_ALGOPCA is not set - -# -# I2C Hardware Bus support -# -# CONFIG_I2C_ALI1535 is not set -# CONFIG_I2C_ALI1563 is not set -# CONFIG_I2C_ALI15X3 is not set -# CONFIG_I2C_AMD756 is not set -# CONFIG_I2C_AMD8111 is not set -# CONFIG_I2C_I801 is not set -# CONFIG_I2C_I810 is not set -# CONFIG_I2C_PIIX4 is not set -CONFIG_I2C_ISA=y -# CONFIG_I2C_NFORCE2 is not set -# CONFIG_I2C_PARPORT is not set -# CONFIG_I2C_PARPORT_LIGHT is not set -# CONFIG_I2C_PROSAVAGE is not set -# CONFIG_I2C_SAVAGE4 is not set -# CONFIG_SCx200_ACB is not set -# CONFIG_I2C_SIS5595 is not set -# CONFIG_I2C_SIS630 is not set -# CONFIG_I2C_SIS96X is not set -# CONFIG_I2C_STUB is not set -# CONFIG_I2C_VIA is not set -CONFIG_I2C_VIAPRO=y -# CONFIG_I2C_VOODOO3 is not set -# CONFIG_I2C_PCA_ISA is not set - -# -# Miscellaneous I2C Chip support -# -# CONFIG_SENSORS_DS1337 is not set -# CONFIG_SENSORS_DS1374 is not set -# CONFIG_SENSORS_EEPROM is not set -# CONFIG_SENSORS_PCF8574 is not set -# CONFIG_SENSORS_PCA9539 is not set -# CONFIG_SENSORS_PCF8591 is not set -# CONFIG_SENSORS_RTC8564 is not set -# CONFIG_SENSORS_MAX6875 is not set -# CONFIG_RTC_X1205_I2C is not set -# CONFIG_I2C_DEBUG_CORE is not set -# CONFIG_I2C_DEBUG_ALGO is not set -# CONFIG_I2C_DEBUG_BUS is not set -# CONFIG_I2C_DEBUG_CHIP is not set +# CONFIG_I2C is not set # # SPI support @@ -931,169 +1010,44 @@ CONFIG_I2C_VIAPRO=y # # Dallas's 1-wire bus # -# CONFIG_W1 is not set # # Hardware Monitoring support # -CONFIG_HWMON=y -CONFIG_HWMON_VID=y -# CONFIG_SENSORS_ADM1021 is not set -# CONFIG_SENSORS_ADM1025 is not set -# CONFIG_SENSORS_ADM1026 is not set -# CONFIG_SENSORS_ADM1031 is not set -# CONFIG_SENSORS_ADM9240 is not set -# CONFIG_SENSORS_ASB100 is not set -# CONFIG_SENSORS_ATXP1 is not set -# CONFIG_SENSORS_DS1621 is not set -# CONFIG_SENSORS_F71805F is not set -# CONFIG_SENSORS_FSCHER is not set -# CONFIG_SENSORS_FSCPOS is not set -# CONFIG_SENSORS_GL518SM is not set -# CONFIG_SENSORS_GL520SM is not set -CONFIG_SENSORS_IT87=y -# CONFIG_SENSORS_LM63 is not set -# CONFIG_SENSORS_LM75 is not set -# CONFIG_SENSORS_LM77 is not set -# CONFIG_SENSORS_LM78 is not set -# CONFIG_SENSORS_LM80 is not set -# CONFIG_SENSORS_LM83 is not set -# CONFIG_SENSORS_LM85 is not set -# CONFIG_SENSORS_LM87 is not set -# CONFIG_SENSORS_LM90 is not set -# CONFIG_SENSORS_LM92 is not set -# CONFIG_SENSORS_MAX1619 is not set -# CONFIG_SENSORS_PC87360 is not set -# CONFIG_SENSORS_SIS5595 is not set -# CONFIG_SENSORS_SMSC47M1 is not set -# CONFIG_SENSORS_SMSC47B397 is not set -# CONFIG_SENSORS_VIA686A is not set -# CONFIG_SENSORS_VT8231 is not set -# CONFIG_SENSORS_W83781D is not set -# CONFIG_SENSORS_W83792D is not set -# CONFIG_SENSORS_W83L785TS is not set -# CONFIG_SENSORS_W83627HF is not set -# CONFIG_SENSORS_W83627EHF is not set -# CONFIG_SENSORS_HDAPS is not set -# CONFIG_HWMON_DEBUG_CHIP is not set +# CONFIG_HWMON is not set +# CONFIG_HWMON_VID is not set # # Misc devices # # CONFIG_IBM_ASM is not set -# -# Multimedia Capabilities Port drivers -# - # # Multimedia devices # -CONFIG_VIDEO_DEV=y - -# -# Video For Linux -# - -# -# Video Adapters -# -# CONFIG_VIDEO_ADV_DEBUG is not set -# CONFIG_VIDEO_BT848 is not set -# CONFIG_VIDEO_BWQCAM is not set -# CONFIG_VIDEO_CQCAM is not set -# CONFIG_VIDEO_W9966 is not set -# CONFIG_VIDEO_CPIA is not set -# CONFIG_VIDEO_SAA5246A is not set -# CONFIG_VIDEO_SAA5249 is not set -# CONFIG_TUNER_3036 is not set -# CONFIG_VIDEO_STRADIS is not set -# CONFIG_VIDEO_ZORAN is not set -CONFIG_VIDEO_SAA7134=y -# CONFIG_VIDEO_SAA7134_ALSA is not set -# CONFIG_VIDEO_MXB is not set -# CONFIG_VIDEO_DPC is not set -# CONFIG_VIDEO_HEXIUM_ORION is not set -# CONFIG_VIDEO_HEXIUM_GEMINI is not set -# CONFIG_VIDEO_CX88 is not set -# CONFIG_VIDEO_EM28XX is not set -# CONFIG_VIDEO_OVCAMCHIP is not set -# CONFIG_VIDEO_AUDIO_DECODER is not set -# CONFIG_VIDEO_DECODER is not set - -# -# Radio Adapters -# -# CONFIG_RADIO_GEMTEK_PCI is not set -# CONFIG_RADIO_MAXIRADIO is not set -# CONFIG_RADIO_MAESTRO is not set +# CONFIG_VIDEO_DEV is not set +CONFIG_VIDEO_V4L2=y # # Digital Video Broadcasting Devices # # CONFIG_DVB is not set -CONFIG_VIDEO_TUNER=y -CONFIG_VIDEO_BUF=y -CONFIG_VIDEO_IR=y +# CONFIG_USB_DABUSB is not set # # Graphics support # -CONFIG_FB=y -CONFIG_FB_CFB_FILLRECT=y -CONFIG_FB_CFB_COPYAREA=y -CONFIG_FB_CFB_IMAGEBLIT=y -# CONFIG_FB_MACMODES is not set -CONFIG_FB_MODE_HELPERS=y -# CONFIG_FB_TILEBLITTING is not set -# CONFIG_FB_CIRRUS is not set -# CONFIG_FB_PM2 is not set -# CONFIG_FB_CYBER2000 is not set -# CONFIG_FB_ARC is not set -# CONFIG_FB_ASILIANT is not set -# CONFIG_FB_IMSTT is not set -# CONFIG_FB_VGA16 is not set -# CONFIG_FB_VESA is not set -CONFIG_VIDEO_SELECT=y -# CONFIG_FB_HGA is not set -# CONFIG_FB_S1D13XXX is not set -# CONFIG_FB_NVIDIA is not set -# CONFIG_FB_RIVA is not set -# CONFIG_FB_I810 is not set -# CONFIG_FB_INTEL is not set -# CONFIG_FB_MATROX is not set -# CONFIG_FB_RADEON_OLD is not set -CONFIG_FB_RADEON=y -CONFIG_FB_RADEON_I2C=y -# CONFIG_FB_RADEON_DEBUG is not set -# CONFIG_FB_ATY128 is not set -# CONFIG_FB_ATY is not set -# CONFIG_FB_SAVAGE is not set -# CONFIG_FB_SIS is not set -# CONFIG_FB_NEOMAGIC is not set -# CONFIG_FB_KYRO is not set -# CONFIG_FB_3DFX is not set -# CONFIG_FB_VOODOO1 is not set -# CONFIG_FB_CYBLA is not set -# CONFIG_FB_TRIDENT is not set -# CONFIG_FB_GEODE is not set -# CONFIG_FB_VIRTUAL is not set +CONFIG_FIRMWARE_EDID=y +# CONFIG_FB is not set # # Console display driver support # CONFIG_VGA_CONSOLE=y +CONFIG_VGACON_SOFT_SCROLLBACK=y +CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=128 +CONFIG_VIDEO_SELECT=y CONFIG_DUMMY_CONSOLE=y -CONFIG_FRAMEBUFFER_CONSOLE=y -# CONFIG_FRAMEBUFFER_CONSOLE_ROTATION is not set -# CONFIG_FONTS is not set -CONFIG_FONT_8x8=y -CONFIG_FONT_8x16=y - -# -# Logo configuration -# -# CONFIG_LOGO is not set # CONFIG_BACKLIGHT_LCD_SUPPORT is not set # @@ -1104,97 +1058,30 @@ CONFIG_SOUND=y # # Advanced Linux Sound Architecture # -CONFIG_SND=y -CONFIG_SND_TIMER=y -CONFIG_SND_PCM=y -CONFIG_SND_RAWMIDI=y -CONFIG_SND_SEQUENCER=y -# CONFIG_SND_SEQ_DUMMY is not set -# CONFIG_SND_MIXER_OSS is not set -# CONFIG_SND_PCM_OSS is not set -# CONFIG_SND_SEQUENCER_OSS is not set -CONFIG_SND_RTCTIMER=y -CONFIG_SND_SEQ_RTCTIMER_DEFAULT=y -# CONFIG_SND_DYNAMIC_MINORS is not set -# CONFIG_SND_SUPPORT_OLD_API is not set -# CONFIG_SND_VERBOSE_PRINTK is not set -# CONFIG_SND_DEBUG is not set - -# -# Generic devices -# -CONFIG_SND_MPU401_UART=y -CONFIG_SND_AC97_CODEC=y -CONFIG_SND_AC97_BUS=y -# CONFIG_SND_DUMMY is not set -# CONFIG_SND_VIRMIDI is not set -# CONFIG_SND_MTPAV is not set -# CONFIG_SND_SERIAL_U16550 is not set -# CONFIG_SND_MPU401 is not set - -# -# PCI devices -# -# CONFIG_SND_AD1889 is not set -# CONFIG_SND_ALS4000 is not set -# CONFIG_SND_ALI5451 is not set -# CONFIG_SND_ATIIXP is not set -# CONFIG_SND_ATIIXP_MODEM is not set -# CONFIG_SND_AU8810 is not set -# CONFIG_SND_AU8820 is not set -# CONFIG_SND_AU8830 is not set -# CONFIG_SND_AZT3328 is not set -# CONFIG_SND_BT87X is not set -# CONFIG_SND_CA0106 is not set -# CONFIG_SND_CMIPCI is not set -# CONFIG_SND_CS4281 is not set -# CONFIG_SND_CS46XX is not set -# CONFIG_SND_CS5535AUDIO is not set -# CONFIG_SND_EMU10K1 is not set -# CONFIG_SND_EMU10K1X is not set -# CONFIG_SND_ENS1370 is not set -# CONFIG_SND_ENS1371 is not set -# CONFIG_SND_ES1938 is not set -# CONFIG_SND_ES1968 is not set -# CONFIG_SND_FM801 is not set -# CONFIG_SND_HDA_INTEL is not set -# CONFIG_SND_HDSP is not set -# CONFIG_SND_HDSPM is not set -# CONFIG_SND_ICE1712 is not set -# CONFIG_SND_ICE1724 is not set -# CONFIG_SND_INTEL8X0 is not set -# CONFIG_SND_INTEL8X0M is not set -# CONFIG_SND_KORG1212 is not set -# CONFIG_SND_MAESTRO3 is not set -# CONFIG_SND_MIXART is not set -# CONFIG_SND_NM256 is not set -# CONFIG_SND_PCXHR is not set -# CONFIG_SND_RME32 is not set -# CONFIG_SND_RME96 is not set -# CONFIG_SND_RME9652 is not set -# CONFIG_SND_SONICVIBES is not set -# CONFIG_SND_TRIDENT is not set -CONFIG_SND_VIA82XX=y -# CONFIG_SND_VIA82XX_MODEM is not set -# CONFIG_SND_VX222 is not set -# CONFIG_SND_YMFPCI is not set - -# -# USB devices -# -# CONFIG_SND_USB_AUDIO is not set -# CONFIG_SND_USB_USX2Y is not set +# CONFIG_SND is not set # # Open Sound System # -# CONFIG_SOUND_PRIME is not set +CONFIG_SOUND_PRIME=y +CONFIG_OSS_OBSOLETE_DRIVER=y +# CONFIG_SOUND_BT878 is not set +# CONFIG_SOUND_EMU10K1 is not set +# CONFIG_SOUND_FUSION is not set +# CONFIG_SOUND_ES1371 is not set +CONFIG_SOUND_ICH=y +# CONFIG_SOUND_TRIDENT is not set +# CONFIG_SOUND_MSNDCLAS is not set +# CONFIG_SOUND_MSNDPIN is not set +# CONFIG_SOUND_VIA82CXXX is not set +# CONFIG_SOUND_OSS is not set # # USB support # CONFIG_USB_ARCH_HAS_HCD=y CONFIG_USB_ARCH_HAS_OHCI=y +CONFIG_USB_ARCH_HAS_EHCI=y CONFIG_USB=y # CONFIG_USB_DEBUG is not set @@ -1213,17 +1100,19 @@ CONFIG_USB_DEVICEFS=y CONFIG_USB_EHCI_HCD=y # CONFIG_USB_EHCI_SPLIT_ISO is not set # CONFIG_USB_EHCI_ROOT_HUB_TT is not set +# CONFIG_USB_EHCI_TT_NEWSCHED is not set # CONFIG_USB_ISP116X_HCD is not set -# CONFIG_USB_OHCI_HCD is not set +CONFIG_USB_OHCI_HCD=y +# CONFIG_USB_OHCI_BIG_ENDIAN is not set +CONFIG_USB_OHCI_LITTLE_ENDIAN=y CONFIG_USB_UHCI_HCD=y # CONFIG_USB_SL811_HCD is not set # # USB Device Class drivers # -# CONFIG_OBSOLETE_OSS_USB_DRIVER is not set # CONFIG_USB_ACM is not set -# CONFIG_USB_PRINTER is not set +CONFIG_USB_PRINTER=y # # NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' @@ -1248,21 +1137,17 @@ CONFIG_USB_STORAGE=y # # USB Input Devices # -# CONFIG_USB_HID is not set - -# -# USB HID Boot Protocol drivers -# -# CONFIG_USB_KBD is not set -# CONFIG_USB_MOUSE is not set +CONFIG_USB_HID=y +CONFIG_USB_HIDINPUT=y +# CONFIG_USB_HIDINPUT_POWERBOOK is not set +# CONFIG_HID_FF is not set +# CONFIG_USB_HIDDEV is not set # CONFIG_USB_AIPTEK is not set # CONFIG_USB_WACOM is not set # CONFIG_USB_ACECAD is not set # CONFIG_USB_KBTAB is not set # CONFIG_USB_POWERMATE is not set -# CONFIG_USB_MTOUCH is not set -# CONFIG_USB_ITMTOUCH is not set -# CONFIG_USB_EGALAX is not set +# CONFIG_USB_TOUCHSCREEN is not set # CONFIG_USB_YEALINK is not set # CONFIG_USB_XPAD is not set # CONFIG_USB_ATI_REMOTE is not set @@ -1276,21 +1161,6 @@ CONFIG_USB_STORAGE=y # CONFIG_USB_MDC800 is not set # CONFIG_USB_MICROTEK is not set -# -# USB Multimedia devices -# -# CONFIG_USB_DABUSB is not set -# CONFIG_USB_VICAM is not set -# CONFIG_USB_DSBR is not set -# CONFIG_USB_ET61X251 is not set -# CONFIG_USB_IBMCAM is not set -# CONFIG_USB_KONICAWC is not set -# CONFIG_USB_OV511 is not set -# CONFIG_USB_SE401 is not set -# CONFIG_USB_SN9C102 is not set -# CONFIG_USB_STV680 is not set -# CONFIG_USB_PWC is not set - # # USB Network Adapters # @@ -1299,12 +1169,11 @@ CONFIG_USB_STORAGE=y # CONFIG_USB_PEGASUS is not set # CONFIG_USB_RTL8150 is not set # CONFIG_USB_USBNET is not set -# CONFIG_USB_MON is not set +CONFIG_USB_MON=y # # USB port drivers # -# CONFIG_USB_USS720 is not set # # USB Serial Converter support @@ -1321,10 +1190,12 @@ CONFIG_USB_STORAGE=y # CONFIG_USB_LEGOTOWER is not set # CONFIG_USB_LCD is not set # CONFIG_USB_LED is not set +# CONFIG_USB_CYPRESS_CY7C63 is not set # CONFIG_USB_CYTHERM is not set # CONFIG_USB_PHIDGETKIT is not set # CONFIG_USB_PHIDGETSERVO is not set # CONFIG_USB_IDMOUSE is not set +# CONFIG_USB_APPLEDISPLAY is not set # CONFIG_USB_SISUSBVGA is not set # CONFIG_USB_LD is not set # CONFIG_USB_TEST is not set @@ -1343,57 +1214,97 @@ CONFIG_USB_STORAGE=y # # CONFIG_MMC is not set +# +# LED devices +# +# CONFIG_NEW_LEDS is not set + +# +# LED drivers +# + +# +# LED Triggers +# + # # InfiniBand support # # CONFIG_INFINIBAND is not set # -# SN Devices +# EDAC - error detection and reporting (RAS) (EXPERIMENTAL) +# +# CONFIG_EDAC is not set + +# +# Real Time Clock # +# CONFIG_RTC_CLASS is not set # -# EDAC - error detection and reporting (RAS) +# DMA Engine support +# +# CONFIG_DMA_ENGINE is not set + +# +# DMA Clients +# + +# +# DMA Devices # -# CONFIG_EDAC is not set # # File systems # CONFIG_EXT2_FS=y -# CONFIG_EXT2_FS_XATTR is not set +CONFIG_EXT2_FS_XATTR=y +CONFIG_EXT2_FS_POSIX_ACL=y +# CONFIG_EXT2_FS_SECURITY is not set # CONFIG_EXT2_FS_XIP is not set -# CONFIG_EXT3_FS is not set -# CONFIG_REISERFS_FS is not set +CONFIG_EXT3_FS=y +CONFIG_EXT3_FS_XATTR=y +CONFIG_EXT3_FS_POSIX_ACL=y +# CONFIG_EXT3_FS_SECURITY is not set +CONFIG_JBD=y +# CONFIG_JBD_DEBUG is not set +CONFIG_FS_MBCACHE=y +CONFIG_REISERFS_FS=y +# CONFIG_REISERFS_CHECK is not set +# CONFIG_REISERFS_PROC_INFO is not set +CONFIG_REISERFS_FS_XATTR=y +CONFIG_REISERFS_FS_POSIX_ACL=y +# CONFIG_REISERFS_FS_SECURITY is not set # CONFIG_JFS_FS is not set -# CONFIG_FS_POSIX_ACL is not set +CONFIG_FS_POSIX_ACL=y # CONFIG_XFS_FS is not set # CONFIG_OCFS2_FS is not set # CONFIG_MINIX_FS is not set # CONFIG_ROMFS_FS is not set -# CONFIG_INOTIFY is not set +CONFIG_INOTIFY=y +CONFIG_INOTIFY_USER=y # CONFIG_QUOTA is not set CONFIG_DNOTIFY=y # CONFIG_AUTOFS_FS is not set -# CONFIG_AUTOFS4_FS is not set +CONFIG_AUTOFS4_FS=y # CONFIG_FUSE_FS is not set # # CD-ROM/DVD Filesystems # CONFIG_ISO9660_FS=y -CONFIG_JOLIET=y -CONFIG_ZISOFS=y -CONFIG_ZISOFS_FS=y +# CONFIG_JOLIET is not set +# CONFIG_ZISOFS is not set # CONFIG_UDF_FS is not set # # DOS/FAT/NT Filesystems # CONFIG_FAT_FS=y -# CONFIG_MSDOS_FS is not set +CONFIG_MSDOS_FS=y CONFIG_VFAT_FS=y -CONFIG_FAT_DEFAULT_CODEPAGE=850 +CONFIG_FAT_DEFAULT_CODEPAGE=437 CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1" # CONFIG_NTFS_FS is not set @@ -1404,10 +1315,9 @@ CONFIG_PROC_FS=y CONFIG_PROC_KCORE=y CONFIG_SYSFS=y CONFIG_TMPFS=y -# CONFIG_HUGETLBFS is not set -# CONFIG_HUGETLB_PAGE is not set +CONFIG_HUGETLBFS=y +CONFIG_HUGETLB_PAGE=y CONFIG_RAMFS=y -# CONFIG_RELAYFS_FS is not set # CONFIG_CONFIGFS_FS is not set # @@ -1430,13 +1340,26 @@ CONFIG_RAMFS=y # # Network File Systems # -# CONFIG_NFS_FS is not set -# CONFIG_NFSD is not set +CONFIG_NFS_FS=y +CONFIG_NFS_V3=y +# CONFIG_NFS_V3_ACL is not set +# CONFIG_NFS_V4 is not set +# CONFIG_NFS_DIRECTIO is not set +CONFIG_NFSD=y +CONFIG_NFSD_V3=y +# CONFIG_NFSD_V3_ACL is not set +# CONFIG_NFSD_V4 is not set +CONFIG_NFSD_TCP=y +CONFIG_ROOT_NFS=y +CONFIG_LOCKD=y +CONFIG_LOCKD_V4=y +CONFIG_EXPORTFS=y +CONFIG_NFS_COMMON=y +CONFIG_SUNRPC=y +# CONFIG_RPCSEC_GSS_KRB5 is not set +# CONFIG_RPCSEC_GSS_SPKM3 is not set # CONFIG_SMB_FS is not set -CONFIG_CIFS=y -# CONFIG_CIFS_STATS is not set -# CONFIG_CIFS_XATTR is not set -# CONFIG_CIFS_EXPERIMENTAL is not set +# CONFIG_CIFS is not set # CONFIG_NCP_FS is not set # CONFIG_CODA_FS is not set # CONFIG_AFS_FS is not set @@ -1445,33 +1368,18 @@ CONFIG_CIFS=y # # Partition Types # -CONFIG_PARTITION_ADVANCED=y -# CONFIG_ACORN_PARTITION is not set -# CONFIG_OSF_PARTITION is not set -# CONFIG_AMIGA_PARTITION is not set -# CONFIG_ATARI_PARTITION is not set -# CONFIG_MAC_PARTITION is not set +# CONFIG_PARTITION_ADVANCED is not set CONFIG_MSDOS_PARTITION=y -# CONFIG_BSD_DISKLABEL is not set -# CONFIG_MINIX_SUBPARTITION is not set -# CONFIG_SOLARIS_X86_PARTITION is not set -# CONFIG_UNIXWARE_DISKLABEL is not set -# CONFIG_LDM_PARTITION is not set -# CONFIG_SGI_PARTITION is not set -# CONFIG_ULTRIX_PARTITION is not set -# CONFIG_SUN_PARTITION is not set -# CONFIG_KARMA_PARTITION is not set -# CONFIG_EFI_PARTITION is not set # # Native Language Support # CONFIG_NLS=y -CONFIG_NLS_DEFAULT="iso8859-15" -# CONFIG_NLS_CODEPAGE_437 is not set +CONFIG_NLS_DEFAULT="iso8859-1" +CONFIG_NLS_CODEPAGE_437=y # CONFIG_NLS_CODEPAGE_737 is not set # CONFIG_NLS_CODEPAGE_775 is not set -CONFIG_NLS_CODEPAGE_850=y +# CONFIG_NLS_CODEPAGE_850 is not set # CONFIG_NLS_CODEPAGE_852 is not set # CONFIG_NLS_CODEPAGE_855 is not set # CONFIG_NLS_CODEPAGE_857 is not set @@ -1491,7 +1399,7 @@ CONFIG_NLS_CODEPAGE_850=y # CONFIG_NLS_ISO8859_8 is not set # CONFIG_NLS_CODEPAGE_1250 is not set # CONFIG_NLS_CODEPAGE_1251 is not set -# CONFIG_NLS_ASCII is not set +CONFIG_NLS_ASCII=y CONFIG_NLS_ISO8859_1=y # CONFIG_NLS_ISO8859_2 is not set # CONFIG_NLS_ISO8859_3 is not set @@ -1510,20 +1418,50 @@ CONFIG_NLS_UTF8=y # # Instrumentation Support # -# CONFIG_PROFILING is not set -# CONFIG_KPROBES is not set +CONFIG_PROFILING=y +CONFIG_OPROFILE=y +CONFIG_KPROBES=y # # Kernel hacking # +CONFIG_TRACE_IRQFLAGS_SUPPORT=y # CONFIG_PRINTK_TIME is not set CONFIG_MAGIC_SYSRQ=y -# CONFIG_DEBUG_KERNEL is not set -CONFIG_LOG_BUF_SHIFT=14 +CONFIG_UNUSED_SYMBOLS=y +CONFIG_DEBUG_KERNEL=y +CONFIG_LOG_BUF_SHIFT=18 +CONFIG_DETECT_SOFTLOCKUP=y +# CONFIG_SCHEDSTATS is not set +# CONFIG_DEBUG_SLAB is not set +# CONFIG_DEBUG_RT_MUTEXES is not set +# CONFIG_RT_MUTEX_TESTER is not set +# CONFIG_DEBUG_SPINLOCK is not set +# CONFIG_DEBUG_MUTEXES is not set +# CONFIG_DEBUG_RWSEMS is not set +# CONFIG_DEBUG_LOCK_ALLOC is not set +# CONFIG_PROVE_LOCKING is not set +# CONFIG_DEBUG_SPINLOCK_SLEEP is not set +# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set +# CONFIG_DEBUG_KOBJECT is not set +# CONFIG_DEBUG_HIGHMEM is not set CONFIG_DEBUG_BUGVERBOSE=y +# CONFIG_DEBUG_INFO is not set +# CONFIG_DEBUG_FS is not set +# CONFIG_DEBUG_VM is not set +# CONFIG_FRAME_POINTER is not set +CONFIG_UNWIND_INFO=y +CONFIG_STACK_UNWIND=y +# CONFIG_FORCED_INLINING is not set +# CONFIG_RCU_TORTURE_TEST is not set CONFIG_EARLY_PRINTK=y +CONFIG_DEBUG_STACKOVERFLOW=y +# CONFIG_DEBUG_STACK_USAGE is not set +# CONFIG_DEBUG_RODATA is not set +# CONFIG_4KSTACKS is not set CONFIG_X86_FIND_SMP_CONFIG=y CONFIG_X86_MPPARSE=y +CONFIG_DOUBLEFAULT=y # # Security options @@ -1536,10 +1474,6 @@ CONFIG_X86_MPPARSE=y # # CONFIG_CRYPTO is not set -# -# Hardware crypto devices -# - # # Library routines # @@ -1548,7 +1482,12 @@ CONFIG_X86_MPPARSE=y CONFIG_CRC32=y # CONFIG_LIBCRC32C is not set CONFIG_ZLIB_INFLATE=y +CONFIG_PLIST=y CONFIG_GENERIC_HARDIRQS=y CONFIG_GENERIC_IRQ_PROBE=y +CONFIG_GENERIC_PENDING_IRQ=y +CONFIG_X86_SMP=y +CONFIG_X86_HT=y CONFIG_X86_BIOS_REBOOT=y +CONFIG_X86_TRAMPOLINE=y CONFIG_KTIME_SCALAR=y -- cgit v1.2.1 From 874c4fe389d1358f82c96dc9b5092fc5c7690604 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:26 +0200 Subject: [PATCH] i386: Allow to use GENERICARCH for UP kernels There are some machines around (large xSeries or Unisys ES7000) that need physical IO-APIC destination mode to access all of their IO devices. This currently doesn't work in UP kernels as used in distribution installers. This patch allows to compile even UP kernels as GENERICARCH which allows to use physical or clustered APIC mode. Signed-off-by: Andi Kleen --- arch/i386/Kconfig | 7 ++-- arch/i386/kernel/io_apic.c | 1 + arch/i386/kernel/mpparse.c | 1 + arch/i386/mach-generic/bigsmp.c | 1 + arch/i386/mach-generic/es7000.c | 1 + arch/i386/mach-generic/probe.c | 2 + arch/i386/mach-generic/summit.c | 1 + include/asm-i386/genapic.h | 69 +++++++++++++++++++------------- include/asm-i386/mach-es7000/mach_apic.h | 4 ++ include/asm-i386/mach-summit/mach_apic.h | 11 ++++- include/asm-i386/smp.h | 19 +++++---- 11 files changed, 76 insertions(+), 41 deletions(-) diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index b2751eadbc56..c134a545b37e 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -166,7 +166,6 @@ config X86_VISWS config X86_GENERICARCH bool "Generic architecture (Summit, bigsmp, ES7000, default)" - depends on SMP help This option compiles in the Summit, bigsmp, ES7000, default subarchitectures. It is intended for a generic binary kernel. @@ -263,7 +262,7 @@ source "kernel/Kconfig.preempt" config X86_UP_APIC bool "Local APIC support on uniprocessors" - depends on !SMP && !(X86_VISWS || X86_VOYAGER) + depends on !SMP && !(X86_VISWS || X86_VOYAGER || X86_GENERICARCH) help A local APIC (Advanced Programmable Interrupt Controller) is an integrated interrupt controller in the CPU. If you have a single-CPU @@ -288,12 +287,12 @@ config X86_UP_IOAPIC config X86_LOCAL_APIC bool - depends on X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER) + depends on X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER) || X86_GENERICARCH default y config X86_IO_APIC bool - depends on X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER)) + depends on X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER)) || X86_GENERICARCH default y config X86_VISWS_APIC diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 4fb32c551fe0..4eacd52eeb74 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -40,6 +40,7 @@ #include #include +#include #include "io_ports.h" diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c index a70b5fa0ef06..827569688562 100644 --- a/arch/i386/kernel/mpparse.c +++ b/arch/i386/kernel/mpparse.c @@ -30,6 +30,7 @@ #include #include +#include #include #include diff --git a/arch/i386/mach-generic/bigsmp.c b/arch/i386/mach-generic/bigsmp.c index ef7a6e6fcb9f..33d9f93557ba 100644 --- a/arch/i386/mach-generic/bigsmp.c +++ b/arch/i386/mach-generic/bigsmp.c @@ -5,6 +5,7 @@ #define APIC_DEFINITION 1 #include #include +#include #include #include #include diff --git a/arch/i386/mach-generic/es7000.c b/arch/i386/mach-generic/es7000.c index 845cdd0b3593..aa144d82334d 100644 --- a/arch/i386/mach-generic/es7000.c +++ b/arch/i386/mach-generic/es7000.c @@ -4,6 +4,7 @@ #define APIC_DEFINITION 1 #include #include +#include #include #include #include diff --git a/arch/i386/mach-generic/probe.c b/arch/i386/mach-generic/probe.c index bcd1bcfaa723..793d1b473251 100644 --- a/arch/i386/mach-generic/probe.c +++ b/arch/i386/mach-generic/probe.c @@ -119,7 +119,9 @@ int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } +#ifdef CONFIG_SMP int hard_smp_processor_id(void) { return genapic->get_apic_id(*(unsigned long *)(APIC_BASE+APIC_ID)); } +#endif diff --git a/arch/i386/mach-generic/summit.c b/arch/i386/mach-generic/summit.c index b73501ddd653..f7e5d66648dc 100644 --- a/arch/i386/mach-generic/summit.c +++ b/arch/i386/mach-generic/summit.c @@ -4,6 +4,7 @@ #define APIC_DEFINITION 1 #include #include +#include #include #include #include diff --git a/include/asm-i386/genapic.h b/include/asm-i386/genapic.h index b3783a32abee..8ffbb0f07457 100644 --- a/include/asm-i386/genapic.h +++ b/include/asm-i386/genapic.h @@ -1,6 +1,8 @@ #ifndef _ASM_GENAPIC_H #define _ASM_GENAPIC_H 1 +#include + /* * Generic APIC driver interface. * @@ -63,14 +65,25 @@ struct genapic { unsigned (*get_apic_id)(unsigned long x); unsigned long apic_id_mask; unsigned int (*cpu_mask_to_apicid)(cpumask_t cpumask); - + +#ifdef CONFIG_SMP /* ipi */ void (*send_IPI_mask)(cpumask_t mask, int vector); void (*send_IPI_allbutself)(int vector); void (*send_IPI_all)(int vector); +#endif }; -#define APICFUNC(x) .x = x +#define APICFUNC(x) .x = x, + +/* More functions could be probably marked IPIFUNC and save some space + in UP GENERICARCH kernels, but I don't have the nerve right now + to untangle this mess. -AK */ +#ifdef CONFIG_SMP +#define IPIFUNC(x) APICFUNC(x) +#else +#define IPIFUNC(x) +#endif #define APIC_INIT(aname, aprobe) { \ .name = aname, \ @@ -80,33 +93,33 @@ struct genapic { .no_balance_irq = NO_BALANCE_IRQ, \ .ESR_DISABLE = esr_disable, \ .apic_destination_logical = APIC_DEST_LOGICAL, \ - APICFUNC(apic_id_registered), \ - APICFUNC(target_cpus), \ - APICFUNC(check_apicid_used), \ - APICFUNC(check_apicid_present), \ - APICFUNC(init_apic_ldr), \ - APICFUNC(ioapic_phys_id_map), \ - APICFUNC(clustered_apic_check), \ - APICFUNC(multi_timer_check), \ - APICFUNC(apicid_to_node), \ - APICFUNC(cpu_to_logical_apicid), \ - APICFUNC(cpu_present_to_apicid), \ - APICFUNC(apicid_to_cpu_present), \ - APICFUNC(mpc_apic_id), \ - APICFUNC(setup_portio_remap), \ - APICFUNC(check_phys_apicid_present), \ - APICFUNC(mpc_oem_bus_info), \ - APICFUNC(mpc_oem_pci_bus), \ - APICFUNC(mps_oem_check), \ - APICFUNC(get_apic_id), \ + APICFUNC(apic_id_registered) \ + APICFUNC(target_cpus) \ + APICFUNC(check_apicid_used) \ + APICFUNC(check_apicid_present) \ + APICFUNC(init_apic_ldr) \ + APICFUNC(ioapic_phys_id_map) \ + APICFUNC(clustered_apic_check) \ + APICFUNC(multi_timer_check) \ + APICFUNC(apicid_to_node) \ + APICFUNC(cpu_to_logical_apicid) \ + APICFUNC(cpu_present_to_apicid) \ + APICFUNC(apicid_to_cpu_present) \ + APICFUNC(mpc_apic_id) \ + APICFUNC(setup_portio_remap) \ + APICFUNC(check_phys_apicid_present) \ + APICFUNC(mpc_oem_bus_info) \ + APICFUNC(mpc_oem_pci_bus) \ + APICFUNC(mps_oem_check) \ + APICFUNC(get_apic_id) \ .apic_id_mask = APIC_ID_MASK, \ - APICFUNC(cpu_mask_to_apicid), \ - APICFUNC(acpi_madt_oem_check), \ - APICFUNC(send_IPI_mask), \ - APICFUNC(send_IPI_allbutself), \ - APICFUNC(send_IPI_all), \ - APICFUNC(enable_apic_mode), \ - APICFUNC(phys_pkg_id), \ + APICFUNC(cpu_mask_to_apicid) \ + APICFUNC(acpi_madt_oem_check) \ + IPIFUNC(send_IPI_mask) \ + IPIFUNC(send_IPI_allbutself) \ + IPIFUNC(send_IPI_all) \ + APICFUNC(enable_apic_mode) \ + APICFUNC(phys_pkg_id) \ } extern struct genapic *genapic; diff --git a/include/asm-i386/mach-es7000/mach_apic.h b/include/asm-i386/mach-es7000/mach_apic.h index b5f3f0d0b2bc..26333685a7fb 100644 --- a/include/asm-i386/mach-es7000/mach_apic.h +++ b/include/asm-i386/mach-es7000/mach_apic.h @@ -123,9 +123,13 @@ extern u8 cpu_2_logical_apicid[]; /* Mapping from cpu number to logical apicid */ static inline int cpu_to_logical_apicid(int cpu) { +#ifdef CONFIG_SMP if (cpu >= NR_CPUS) return BAD_APICID; return (int)cpu_2_logical_apicid[cpu]; +#else + return logical_smp_processor_id(); +#endif } static inline int mpc_apic_id(struct mpc_config_processor *m, struct mpc_config_translation *unused) diff --git a/include/asm-i386/mach-summit/mach_apic.h b/include/asm-i386/mach-summit/mach_apic.h index 9fd073286289..a81b05961595 100644 --- a/include/asm-i386/mach-summit/mach_apic.h +++ b/include/asm-i386/mach-summit/mach_apic.h @@ -46,10 +46,12 @@ extern u8 cpu_2_logical_apicid[]; static inline void init_apic_ldr(void) { unsigned long val, id; - int i, count; - u8 lid; + int count = 0; u8 my_id = (u8)hard_smp_processor_id(); u8 my_cluster = (u8)apicid_cluster(my_id); +#ifdef CONFIG_SMP + u8 lid; + int i; /* Create logical APIC IDs by counting CPUs already in cluster. */ for (count = 0, i = NR_CPUS; --i >= 0; ) { @@ -57,6 +59,7 @@ static inline void init_apic_ldr(void) if (lid != BAD_APICID && apicid_cluster(lid) == my_cluster) ++count; } +#endif /* We only have a 4 wide bitmap in cluster mode. If a deranged * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */ BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT); @@ -91,9 +94,13 @@ static inline int apicid_to_node(int logical_apicid) /* Mapping from cpu number to logical apicid */ static inline int cpu_to_logical_apicid(int cpu) { +#ifdef CONFIG_SMP if (cpu >= NR_CPUS) return BAD_APICID; return (int)cpu_2_logical_apicid[cpu]; +#else + return logical_smp_processor_id(); +#endif } static inline int cpu_present_to_apicid(int mps_cpu) diff --git a/include/asm-i386/smp.h b/include/asm-i386/smp.h index 142d10e34ade..f87826039a51 100644 --- a/include/asm-i386/smp.h +++ b/include/asm-i386/smp.h @@ -80,17 +80,11 @@ static inline int hard_smp_processor_id(void) return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID)); } #endif - -static __inline int logical_smp_processor_id(void) -{ - /* we don't want to mark this access volatile - bad code generation */ - return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR)); -} - #endif extern int __cpu_disable(void); extern void __cpu_die(unsigned int cpu); + #endif /* !__ASSEMBLY__ */ #else /* CONFIG_SMP */ @@ -100,4 +94,15 @@ extern void __cpu_die(unsigned int cpu); #define NO_PROC_ID 0xFF /* No processor magic marker */ #endif + +#ifndef __ASSEMBLY__ +#ifdef CONFIG_X86_LOCAL_APIC +static __inline int logical_smp_processor_id(void) +{ + /* we don't want to mark this access volatile - bad code generation */ + return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR)); +} +#endif +#endif + #endif -- cgit v1.2.1 From b07f8915cda3fcd73b8b68075ba1e6cd0673365d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:26 +0200 Subject: [PATCH] x86: Temporarily revert parts of the Core 2 nmi nmi watchdog support This makes merging easier. They are readded a few patches later. Signed-off-by: Andi Kleen --- arch/i386/kernel/nmi.c | 65 +------------------------- arch/x86_64/kernel/nmi.c | 81 ++------------------------------- include/asm-i386/intel_arch_perfmon.h | 19 -------- include/asm-x86_64/intel_arch_perfmon.h | 19 -------- 4 files changed, 6 insertions(+), 178 deletions(-) delete mode 100644 include/asm-i386/intel_arch_perfmon.h delete mode 100644 include/asm-x86_64/intel_arch_perfmon.h diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index acb351478e42..1282d70ff971 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -24,7 +24,6 @@ #include #include -#include #include "mach_traps.h" @@ -96,9 +95,6 @@ int nmi_active; (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \ P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE) -#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL -#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK - #ifdef CONFIG_SMP /* The performance counters used by NMI_LOCAL_APIC don't trigger when * the CPU is idle. To make sure the NMI watchdog really ticks on all @@ -211,8 +207,6 @@ static int __init setup_nmi_watchdog(char *str) __setup("nmi_watchdog=", setup_nmi_watchdog); -static void disable_intel_arch_watchdog(void); - static void disable_lapic_nmi_watchdog(void) { if (nmi_active <= 0) @@ -222,10 +216,6 @@ static void disable_lapic_nmi_watchdog(void) wrmsr(MSR_K7_EVNTSEL0, 0, 0); break; case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - disable_intel_arch_watchdog(); - break; - } switch (boot_cpu_data.x86) { case 6: if (boot_cpu_data.x86_model > 0xd) @@ -454,53 +444,6 @@ static int setup_p4_watchdog(void) return 1; } -static void disable_intel_arch_watchdog(void) -{ - unsigned ebx; - - /* - * Check whether the Architectural PerfMon supports - * Unhalted Core Cycles Event or not. - * NOTE: Corresponding bit = 0 in ebp indicates event present. - */ - ebx = cpuid_ebx(10); - if (!(ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) - wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, 0, 0); -} - -static int setup_intel_arch_watchdog(void) -{ - unsigned int evntsel; - unsigned ebx; - - /* - * Check whether the Architectural PerfMon supports - * Unhalted Core Cycles Event or not. - * NOTE: Corresponding bit = 0 in ebp indicates event present. - */ - ebx = cpuid_ebx(10); - if ((ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) - return 0; - - nmi_perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0; - - clear_msr_range(MSR_ARCH_PERFMON_EVENTSEL0, 2); - clear_msr_range(MSR_ARCH_PERFMON_PERFCTR0, 2); - - evntsel = ARCH_PERFMON_EVENTSEL_INT - | ARCH_PERFMON_EVENTSEL_OS - | ARCH_PERFMON_EVENTSEL_USR - | ARCH_PERFMON_NMI_EVENT_SEL - | ARCH_PERFMON_NMI_EVENT_UMASK; - - wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0); - write_watchdog_counter("INTEL_ARCH_PERFCTR0"); - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0); - return 1; -} - void setup_apic_nmi_watchdog (void) { switch (boot_cpu_data.x86_vendor) { @@ -510,11 +453,6 @@ void setup_apic_nmi_watchdog (void) setup_k7_watchdog(); break; case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - if (!setup_intel_arch_watchdog()) - return; - break; - } switch (boot_cpu_data.x86) { case 6: if (boot_cpu_data.x86_model > 0xd) @@ -619,8 +557,7 @@ void nmi_watchdog_tick (struct pt_regs * regs) wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); apic_write(APIC_LVTPC, APIC_DM_NMI); } - else if (nmi_perfctr_msr == MSR_P6_PERFCTR0 || - nmi_perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { + else if (nmi_perfctr_msr == MSR_P6_PERFCTR0) { /* Only P6 based Pentium M need to re-unmask * the apic vector but it doesn't hurt * other P6 variant */ diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 5baa0c726e97..42c05d6907b9 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -26,7 +26,6 @@ #include #include #include -#include /* * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: @@ -66,9 +65,6 @@ static unsigned int nmi_p4_cccr_val; #define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 #define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING -#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL -#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK - #define MSR_P4_MISC_ENABLE 0x1A0 #define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) #define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL (1<<12) @@ -100,10 +96,7 @@ static __cpuinit inline int nmi_known_cpu(void) case X86_VENDOR_AMD: return boot_cpu_data.x86 == 15; case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return 1; - else - return (boot_cpu_data.x86 == 15); + return boot_cpu_data.x86 == 15; } return 0; } @@ -209,8 +202,6 @@ int __init setup_nmi_watchdog(char *str) __setup("nmi_watchdog=", setup_nmi_watchdog); -static void disable_intel_arch_watchdog(void); - static void disable_lapic_nmi_watchdog(void) { if (nmi_active <= 0) @@ -223,8 +214,6 @@ static void disable_lapic_nmi_watchdog(void) if (boot_cpu_data.x86 == 15) { wrmsr(MSR_P4_IQ_CCCR0, 0, 0); wrmsr(MSR_P4_CRU_ESCR0, 0, 0); - } else if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - disable_intel_arch_watchdog(); } break; } @@ -377,53 +366,6 @@ static void setup_k7_watchdog(void) wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); } -static void disable_intel_arch_watchdog(void) -{ - unsigned ebx; - - /* - * Check whether the Architectural PerfMon supports - * Unhalted Core Cycles Event or not. - * NOTE: Corresponding bit = 0 in ebp indicates event present. - */ - ebx = cpuid_ebx(10); - if (!(ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) - wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, 0, 0); -} - -static int setup_intel_arch_watchdog(void) -{ - unsigned int evntsel; - unsigned ebx; - - /* - * Check whether the Architectural PerfMon supports - * Unhalted Core Cycles Event or not. - * NOTE: Corresponding bit = 0 in ebp indicates event present. - */ - ebx = cpuid_ebx(10); - if ((ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) - return 0; - - nmi_perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0; - - clear_msr_range(MSR_ARCH_PERFMON_EVENTSEL0, 2); - clear_msr_range(MSR_ARCH_PERFMON_PERFCTR0, 2); - - evntsel = ARCH_PERFMON_EVENTSEL_INT - | ARCH_PERFMON_EVENTSEL_OS - | ARCH_PERFMON_EVENTSEL_USR - | ARCH_PERFMON_NMI_EVENT_SEL - | ARCH_PERFMON_NMI_EVENT_UMASK; - - wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0); - wrmsrl(MSR_ARCH_PERFMON_PERFCTR0, -((u64)cpu_khz * 1000 / nmi_hz)); - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0); - return 1; -} - static int setup_p4_watchdog(void) { @@ -477,16 +419,10 @@ void setup_apic_nmi_watchdog(void) setup_k7_watchdog(); break; case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - if (!setup_intel_arch_watchdog()) - return; - } else if (boot_cpu_data.x86 == 15) { - if (!setup_p4_watchdog()) - return; - } else { + if (boot_cpu_data.x86 != 15) + return; + if (!setup_p4_watchdog()) return; - } - break; default: @@ -571,14 +507,7 @@ void __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) */ wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); apic_write(APIC_LVTPC, APIC_DM_NMI); - } else if (nmi_perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { - /* - * For Intel based architectural perfmon - * - LVTPC is masked on interrupt and must be - * unmasked by the LVTPC handler. - */ - apic_write(APIC_LVTPC, APIC_DM_NMI); - } + } wrmsrl(nmi_perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); } } diff --git a/include/asm-i386/intel_arch_perfmon.h b/include/asm-i386/intel_arch_perfmon.h deleted file mode 100644 index 134ea9cc5283..000000000000 --- a/include/asm-i386/intel_arch_perfmon.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef X86_INTEL_ARCH_PERFMON_H -#define X86_INTEL_ARCH_PERFMON_H 1 - -#define MSR_ARCH_PERFMON_PERFCTR0 0xc1 -#define MSR_ARCH_PERFMON_PERFCTR1 0xc2 - -#define MSR_ARCH_PERFMON_EVENTSEL0 0x186 -#define MSR_ARCH_PERFMON_EVENTSEL1 0x187 - -#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) -#define ARCH_PERFMON_EVENTSEL_INT (1 << 20) -#define ARCH_PERFMON_EVENTSEL_OS (1 << 17) -#define ARCH_PERFMON_EVENTSEL_USR (1 << 16) - -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL (0x3c) -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT (1 << 0) - -#endif /* X86_INTEL_ARCH_PERFMON_H */ diff --git a/include/asm-x86_64/intel_arch_perfmon.h b/include/asm-x86_64/intel_arch_perfmon.h deleted file mode 100644 index 59c396431569..000000000000 --- a/include/asm-x86_64/intel_arch_perfmon.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef X86_64_INTEL_ARCH_PERFMON_H -#define X86_64_INTEL_ARCH_PERFMON_H 1 - -#define MSR_ARCH_PERFMON_PERFCTR0 0xc1 -#define MSR_ARCH_PERFMON_PERFCTR1 0xc2 - -#define MSR_ARCH_PERFMON_EVENTSEL0 0x186 -#define MSR_ARCH_PERFMON_EVENTSEL1 0x187 - -#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) -#define ARCH_PERFMON_EVENTSEL_INT (1 << 20) -#define ARCH_PERFMON_EVENTSEL_OS (1 << 17) -#define ARCH_PERFMON_EVENTSEL_USR (1 << 16) - -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL (0x3c) -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT (1 << 0) - -#endif /* X86_64_INTEL_ARCH_PERFMON_H */ -- cgit v1.2.1 From 828f0afda123a96ff4e8078f057a302f4b4232ae Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Tue, 26 Sep 2006 10:52:26 +0200 Subject: [PATCH] x86: Add performance counter reservation framework for UP kernels Adds basic infrastructure to allow subsystems to reserve performance counters on the x86 chips. Only UP kernels are supported in this patch to make reviewing easier. The SMP portion makes a lot more changes. Think of this as a locking mechanism where each bit represents a different counter. In addition, each subsystem should also reserve an appropriate event selection register that will correspond to the performance counter it will be using (this is mainly neccessary for the Pentium 4 chips as they break the 1:1 relationship to performance counters). This will help prevent subsystems like oprofile from interfering with the nmi watchdog. Signed-off-by: Don Zickus Signed-off-by: Andi Kleen --- arch/i386/kernel/nmi.c | 188 +++++++++++++++++++++++++++++++++++++++-------- arch/x86_64/kernel/nmi.c | 178 ++++++++++++++++++++++++++++++++++---------- include/asm-i386/nmi.h | 7 ++ include/asm-x86_64/nmi.h | 8 +- 4 files changed, 308 insertions(+), 73 deletions(-) diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 1282d70ff971..5d58dfeacd59 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -34,6 +34,20 @@ static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ static unsigned int nmi_p4_cccr_val; extern void show_registers(struct pt_regs *regs); +/* perfctr_nmi_owner tracks the ownership of the perfctr registers: + * evtsel_nmi_owner tracks the ownership of the event selection + * - different performance counters/ event selection may be reserved for + * different subsystems this reservation system just tries to coordinate + * things a little + */ +static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner); +static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[3]); + +/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's + * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now) + */ +#define NMI_MAX_COUNTER_BITS 66 + /* * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: * - it may be reserved by some other driver, or not @@ -95,6 +109,105 @@ int nmi_active; (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \ P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE) +/* converts an msr to an appropriate reservation bit */ +static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) +{ + /* returns the bit offset of the performance counter register */ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + return (msr - MSR_K7_PERFCTR0); + case X86_VENDOR_INTEL: + switch (boot_cpu_data.x86) { + case 6: + return (msr - MSR_P6_PERFCTR0); + case 15: + return (msr - MSR_P4_BPU_PERFCTR0); + } + } + return 0; +} + +/* converts an msr to an appropriate reservation bit */ +static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) +{ + /* returns the bit offset of the event selection register */ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + return (msr - MSR_K7_EVNTSEL0); + case X86_VENDOR_INTEL: + switch (boot_cpu_data.x86) { + case 6: + return (msr - MSR_P6_EVNTSEL0); + case 15: + return (msr - MSR_P4_BSU_ESCR0); + } + } + return 0; +} + +/* checks for a bit availability (hack for oprofile) */ +int avail_to_resrv_perfctr_nmi_bit(unsigned int counter) +{ + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner))); +} + +/* checks the an msr for availability */ +int avail_to_resrv_perfctr_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_perfctr_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner))); +} + +int reserve_perfctr_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_perfctr_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + if (!test_and_set_bit(counter, &__get_cpu_var(perfctr_nmi_owner))) + return 1; + return 0; +} + +void release_perfctr_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_perfctr_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + clear_bit(counter, &__get_cpu_var(perfctr_nmi_owner)); +} + +int reserve_evntsel_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_evntsel_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + if (!test_and_set_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0])) + return 1; + return 0; +} + +void release_evntsel_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_evntsel_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + clear_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0]); +} + #ifdef CONFIG_SMP /* The performance counters used by NMI_LOCAL_APIC don't trigger when * the CPU is idle. To make sure the NMI watchdog really ticks on all @@ -344,14 +457,6 @@ late_initcall(init_lapic_nmi_sysfs); * Original code written by Keith Owens. */ -static void clear_msr_range(unsigned int base, unsigned int n) -{ - unsigned int i; - - for(i = 0; i < n; ++i) - wrmsr(base+i, 0, 0); -} - static void write_watchdog_counter(const char *descr) { u64 count = (u64)cpu_khz * 1000; @@ -362,14 +467,19 @@ static void write_watchdog_counter(const char *descr) wrmsrl(nmi_perfctr_msr, 0 - count); } -static void setup_k7_watchdog(void) +static int setup_k7_watchdog(void) { unsigned int evntsel; nmi_perfctr_msr = MSR_K7_PERFCTR0; - clear_msr_range(MSR_K7_EVNTSEL0, 4); - clear_msr_range(MSR_K7_PERFCTR0, 4); + if (!reserve_perfctr_nmi(nmi_perfctr_msr)) + goto fail; + + if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0)) + goto fail1; + + wrmsrl(MSR_K7_PERFCTR0, 0UL); evntsel = K7_EVNTSEL_INT | K7_EVNTSEL_OS @@ -381,16 +491,24 @@ static void setup_k7_watchdog(void) apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= K7_EVNTSEL_ENABLE; wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); + return 1; +fail1: + release_perfctr_nmi(nmi_perfctr_msr); +fail: + return 0; } -static void setup_p6_watchdog(void) +static int setup_p6_watchdog(void) { unsigned int evntsel; nmi_perfctr_msr = MSR_P6_PERFCTR0; - clear_msr_range(MSR_P6_EVNTSEL0, 2); - clear_msr_range(MSR_P6_PERFCTR0, 2); + if (!reserve_perfctr_nmi(nmi_perfctr_msr)) + goto fail; + + if (!reserve_evntsel_nmi(MSR_P6_EVNTSEL0)) + goto fail1; evntsel = P6_EVNTSEL_INT | P6_EVNTSEL_OS @@ -402,6 +520,11 @@ static void setup_p6_watchdog(void) apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= P6_EVNTSEL0_ENABLE; wrmsr(MSR_P6_EVNTSEL0, evntsel, 0); + return 1; +fail1: + release_perfctr_nmi(nmi_perfctr_msr); +fail: + return 0; } static int setup_p4_watchdog(void) @@ -419,22 +542,11 @@ static int setup_p4_watchdog(void) nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1; #endif - if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL)) - clear_msr_range(0x3F1, 2); - /* MSR 0x3F0 seems to have a default value of 0xFC00, but current - docs doesn't fully define it, so leave it alone for now. */ - if (boot_cpu_data.x86_model >= 0x3) { - /* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */ - clear_msr_range(0x3A0, 26); - clear_msr_range(0x3BC, 3); - } else { - clear_msr_range(0x3A0, 31); - } - clear_msr_range(0x3C0, 6); - clear_msr_range(0x3C8, 6); - clear_msr_range(0x3E0, 2); - clear_msr_range(MSR_P4_CCCR0, 18); - clear_msr_range(MSR_P4_PERFCTR0, 18); + if (!reserve_perfctr_nmi(nmi_perfctr_msr)) + goto fail; + + if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0)) + goto fail1; wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0); wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0); @@ -442,6 +554,10 @@ static int setup_p4_watchdog(void) apic_write(APIC_LVTPC, APIC_DM_NMI); wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); return 1; +fail1: + release_perfctr_nmi(nmi_perfctr_msr); +fail: + return 0; } void setup_apic_nmi_watchdog (void) @@ -450,7 +566,8 @@ void setup_apic_nmi_watchdog (void) case X86_VENDOR_AMD: if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15) return; - setup_k7_watchdog(); + if (!setup_k7_watchdog()) + return; break; case X86_VENDOR_INTEL: switch (boot_cpu_data.x86) { @@ -458,7 +575,8 @@ void setup_apic_nmi_watchdog (void) if (boot_cpu_data.x86_model > 0xd) return; - setup_p6_watchdog(); + if(!setup_p6_watchdog()) + return; break; case 15: if (boot_cpu_data.x86_model > 0x4) @@ -612,6 +730,12 @@ int proc_unknown_nmi_panic(ctl_table *table, int write, struct file *file, EXPORT_SYMBOL(nmi_active); EXPORT_SYMBOL(nmi_watchdog); +EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi); +EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); +EXPORT_SYMBOL(reserve_perfctr_nmi); +EXPORT_SYMBOL(release_perfctr_nmi); +EXPORT_SYMBOL(reserve_evntsel_nmi); +EXPORT_SYMBOL(release_evntsel_nmi); EXPORT_SYMBOL(reserve_lapic_nmi); EXPORT_SYMBOL(release_lapic_nmi); EXPORT_SYMBOL(disable_timer_nmi_watchdog); diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 42c05d6907b9..b7a7c9973849 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -27,6 +27,20 @@ #include #include +/* perfctr_nmi_owner tracks the ownership of the perfctr registers: + * evtsel_nmi_owner tracks the ownership of the event selection + * - different performance counters/ event selection may be reserved for + * different subsystems this reservation system just tries to coordinate + * things a little + */ +static DEFINE_PER_CPU(unsigned, perfctr_nmi_owner); +static DEFINE_PER_CPU(unsigned, evntsel_nmi_owner[2]); + +/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's + * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now) + */ +#define NMI_MAX_COUNTER_BITS 66 + /* * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: * - it may be reserved by some other driver, or not @@ -90,6 +104,95 @@ static unsigned int nmi_p4_cccr_val; (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \ P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE) +/* converts an msr to an appropriate reservation bit */ +static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) +{ + /* returns the bit offset of the performance counter register */ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + return (msr - MSR_K7_PERFCTR0); + case X86_VENDOR_INTEL: + return (msr - MSR_P4_BPU_PERFCTR0); + } + return 0; +} + +/* converts an msr to an appropriate reservation bit */ +static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) +{ + /* returns the bit offset of the event selection register */ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + return (msr - MSR_K7_EVNTSEL0); + case X86_VENDOR_INTEL: + return (msr - MSR_P4_BSU_ESCR0); + } + return 0; +} + +/* checks for a bit availability (hack for oprofile) */ +int avail_to_resrv_perfctr_nmi_bit(unsigned int counter) +{ + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner))); +} + +/* checks the an msr for availability */ +int avail_to_resrv_perfctr_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_perfctr_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner))); +} + +int reserve_perfctr_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_perfctr_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + if (!test_and_set_bit(counter, &__get_cpu_var(perfctr_nmi_owner))) + return 1; + return 0; +} + +void release_perfctr_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_perfctr_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + clear_bit(counter, &__get_cpu_var(perfctr_nmi_owner)); +} + +int reserve_evntsel_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_evntsel_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + if (!test_and_set_bit(counter, &__get_cpu_var(evntsel_nmi_owner))) + return 1; + return 0; +} + +void release_evntsel_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_evntsel_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + clear_bit(counter, &__get_cpu_var(evntsel_nmi_owner)); +} + static __cpuinit inline int nmi_known_cpu(void) { switch (boot_cpu_data.x86_vendor) { @@ -325,34 +428,22 @@ late_initcall(init_lapic_nmi_sysfs); #endif /* CONFIG_PM */ -/* - * Activate the NMI watchdog via the local APIC. - * Original code written by Keith Owens. - */ - -static void clear_msr_range(unsigned int base, unsigned int n) +static int setup_k7_watchdog(void) { - unsigned int i; - - for(i = 0; i < n; ++i) - wrmsr(base+i, 0, 0); -} - -static void setup_k7_watchdog(void) -{ - int i; unsigned int evntsel; nmi_perfctr_msr = MSR_K7_PERFCTR0; - for(i = 0; i < 4; ++i) { - /* Simulator may not support it */ - if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL)) { - nmi_perfctr_msr = 0; - return; - } - wrmsrl(MSR_K7_PERFCTR0+i, 0UL); - } + if (!reserve_perfctr_nmi(nmi_perfctr_msr)) + goto fail; + + if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0)) + goto fail1; + + /* Simulator may not support it */ + if (checking_wrmsrl(MSR_K7_EVNTSEL0, 0UL)) + goto fail2; + wrmsrl(MSR_K7_PERFCTR0, 0UL); evntsel = K7_EVNTSEL_INT | K7_EVNTSEL_OS @@ -364,6 +455,13 @@ static void setup_k7_watchdog(void) apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= K7_EVNTSEL_ENABLE; wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); + return 1; +fail2: + release_evntsel_nmi(MSR_K7_EVNTSEL0); +fail1: + release_perfctr_nmi(nmi_perfctr_msr); +fail: + return 0; } @@ -382,22 +480,11 @@ static int setup_p4_watchdog(void) nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1; #endif - if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL)) - clear_msr_range(0x3F1, 2); - /* MSR 0x3F0 seems to have a default value of 0xFC00, but current - docs doesn't fully define it, so leave it alone for now. */ - if (boot_cpu_data.x86_model >= 0x3) { - /* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */ - clear_msr_range(0x3A0, 26); - clear_msr_range(0x3BC, 3); - } else { - clear_msr_range(0x3A0, 31); - } - clear_msr_range(0x3C0, 6); - clear_msr_range(0x3C8, 6); - clear_msr_range(0x3E0, 2); - clear_msr_range(MSR_P4_CCCR0, 18); - clear_msr_range(MSR_P4_PERFCTR0, 18); + if (!reserve_perfctr_nmi(nmi_perfctr_msr)) + goto fail; + + if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0)) + goto fail1; wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0); wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0); @@ -406,6 +493,10 @@ static int setup_p4_watchdog(void) apic_write(APIC_LVTPC, APIC_DM_NMI); wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); return 1; +fail1: + release_perfctr_nmi(nmi_perfctr_msr); +fail: + return 0; } void setup_apic_nmi_watchdog(void) @@ -416,7 +507,8 @@ void setup_apic_nmi_watchdog(void) return; if (strstr(boot_cpu_data.x86_model_id, "Screwdriver")) return; - setup_k7_watchdog(); + if (!setup_k7_watchdog()) + return; break; case X86_VENDOR_INTEL: if (boot_cpu_data.x86 != 15) @@ -588,6 +680,12 @@ int proc_unknown_nmi_panic(struct ctl_table *table, int write, struct file *file EXPORT_SYMBOL(nmi_active); EXPORT_SYMBOL(nmi_watchdog); +EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi); +EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); +EXPORT_SYMBOL(reserve_perfctr_nmi); +EXPORT_SYMBOL(release_perfctr_nmi); +EXPORT_SYMBOL(reserve_evntsel_nmi); +EXPORT_SYMBOL(release_evntsel_nmi); EXPORT_SYMBOL(reserve_lapic_nmi); EXPORT_SYMBOL(release_lapic_nmi); EXPORT_SYMBOL(disable_timer_nmi_watchdog); diff --git a/include/asm-i386/nmi.h b/include/asm-i386/nmi.h index 67d994799999..27fc9e6f630e 100644 --- a/include/asm-i386/nmi.h +++ b/include/asm-i386/nmi.h @@ -25,6 +25,13 @@ void set_nmi_callback(nmi_callback_t callback); */ void unset_nmi_callback(void); +extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); +extern int avail_to_resrv_perfctr_nmi(unsigned int); +extern int reserve_perfctr_nmi(unsigned int); +extern void release_perfctr_nmi(unsigned int); +extern int reserve_evntsel_nmi(unsigned int); +extern void release_evntsel_nmi(unsigned int); + extern void setup_apic_nmi_watchdog (void); extern int reserve_lapic_nmi(void); extern void release_lapic_nmi(void); diff --git a/include/asm-x86_64/nmi.h b/include/asm-x86_64/nmi.h index efb45c894d76..62a784cb8f0c 100644 --- a/include/asm-x86_64/nmi.h +++ b/include/asm-x86_64/nmi.h @@ -56,7 +56,13 @@ extern int panic_on_timeout; extern int unknown_nmi_panic; extern int check_nmi_watchdog(void); - +extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); +extern int avail_to_resrv_perfctr_nmi(unsigned int); +extern int reserve_perfctr_nmi(unsigned int); +extern void release_perfctr_nmi(unsigned int); +extern int reserve_evntsel_nmi(unsigned int); +extern void release_evntsel_nmi(unsigned int); + extern void setup_apic_nmi_watchdog (void); extern int reserve_lapic_nmi(void); extern void release_lapic_nmi(void); -- cgit v1.2.1 From cb9c448c661d40ce2efbce8e9c19cc4d420d8ccc Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Tue, 26 Sep 2006 10:52:26 +0200 Subject: [PATCH] i386: Utilize performance counter reservation framework in oprofile Incorporates the new performance counter reservation system in oprofile. Also cleans up a lot of the initialization code. The code original zero'd out every register associated with performance counters regardless if those registers were used or not. This causes issues with the nmi watchdog. Now oprofile tries to reserve registers and gives up if it can't get them. Cc: levon@movementarian.org Cc: oprofile-list@lists.sf.net Signed-off-by: Don Zickus Signed-off-by: Andi Kleen --- arch/i386/oprofile/nmi_int.c | 41 +++++++--- arch/i386/oprofile/op_model_athlon.c | 54 ++++++++++--- arch/i386/oprofile/op_model_p4.c | 152 +++++++++++++++++------------------ arch/i386/oprofile/op_model_ppro.c | 65 ++++++++++++--- arch/i386/oprofile/op_x86_model.h | 1 + 5 files changed, 199 insertions(+), 114 deletions(-) diff --git a/arch/i386/oprofile/nmi_int.c b/arch/i386/oprofile/nmi_int.c index 5f8dc8a21bd7..8710ca081b1e 100644 --- a/arch/i386/oprofile/nmi_int.c +++ b/arch/i386/oprofile/nmi_int.c @@ -98,15 +98,19 @@ static void nmi_cpu_save_registers(struct op_msrs * msrs) unsigned int i; for (i = 0; i < nr_ctrs; ++i) { - rdmsr(counters[i].addr, - counters[i].saved.low, - counters[i].saved.high); + if (counters[i].addr){ + rdmsr(counters[i].addr, + counters[i].saved.low, + counters[i].saved.high); + } } for (i = 0; i < nr_ctrls; ++i) { - rdmsr(controls[i].addr, - controls[i].saved.low, - controls[i].saved.high); + if (controls[i].addr){ + rdmsr(controls[i].addr, + controls[i].saved.low, + controls[i].saved.high); + } } } @@ -205,15 +209,19 @@ static void nmi_restore_registers(struct op_msrs * msrs) unsigned int i; for (i = 0; i < nr_ctrls; ++i) { - wrmsr(controls[i].addr, - controls[i].saved.low, - controls[i].saved.high); + if (controls[i].addr){ + wrmsr(controls[i].addr, + controls[i].saved.low, + controls[i].saved.high); + } } for (i = 0; i < nr_ctrs; ++i) { - wrmsr(counters[i].addr, - counters[i].saved.low, - counters[i].saved.high); + if (counters[i].addr){ + wrmsr(counters[i].addr, + counters[i].saved.low, + counters[i].saved.high); + } } } @@ -234,6 +242,7 @@ static void nmi_cpu_shutdown(void * dummy) apic_write(APIC_LVTPC, saved_lvtpc[cpu]); apic_write(APIC_LVTERR, v); nmi_restore_registers(msrs); + model->shutdown(msrs); } @@ -284,6 +293,14 @@ static int nmi_create_files(struct super_block * sb, struct dentry * root) struct dentry * dir; char buf[4]; + /* quick little hack to _not_ expose a counter if it is not + * available for use. This should protect userspace app. + * NOTE: assumes 1:1 mapping here (that counters are organized + * sequentially in their struct assignment). + */ + if (unlikely(!avail_to_resrv_perfctr_nmi_bit(i))) + continue; + snprintf(buf, sizeof(buf), "%d", i); dir = oprofilefs_mkdir(sb, root, buf); oprofilefs_create_ulong(sb, dir, "enabled", &counter_config[i].enabled); diff --git a/arch/i386/oprofile/op_model_athlon.c b/arch/i386/oprofile/op_model_athlon.c index 693bdea4a52b..3057a19e4641 100644 --- a/arch/i386/oprofile/op_model_athlon.c +++ b/arch/i386/oprofile/op_model_athlon.c @@ -21,10 +21,12 @@ #define NUM_COUNTERS 4 #define NUM_CONTROLS 4 +#define CTR_IS_RESERVED(msrs,c) (msrs->counters[(c)].addr ? 1 : 0) #define CTR_READ(l,h,msrs,c) do {rdmsr(msrs->counters[(c)].addr, (l), (h));} while (0) #define CTR_WRITE(l,msrs,c) do {wrmsr(msrs->counters[(c)].addr, -(unsigned int)(l), -1);} while (0) #define CTR_OVERFLOWED(n) (!((n) & (1U<<31))) +#define CTRL_IS_RESERVED(msrs,c) (msrs->controls[(c)].addr ? 1 : 0) #define CTRL_READ(l,h,msrs,c) do {rdmsr(msrs->controls[(c)].addr, (l), (h));} while (0) #define CTRL_WRITE(l,h,msrs,c) do {wrmsr(msrs->controls[(c)].addr, (l), (h));} while (0) #define CTRL_SET_ACTIVE(n) (n |= (1<<22)) @@ -40,15 +42,21 @@ static unsigned long reset_value[NUM_COUNTERS]; static void athlon_fill_in_addresses(struct op_msrs * const msrs) { - msrs->counters[0].addr = MSR_K7_PERFCTR0; - msrs->counters[1].addr = MSR_K7_PERFCTR1; - msrs->counters[2].addr = MSR_K7_PERFCTR2; - msrs->counters[3].addr = MSR_K7_PERFCTR3; - - msrs->controls[0].addr = MSR_K7_EVNTSEL0; - msrs->controls[1].addr = MSR_K7_EVNTSEL1; - msrs->controls[2].addr = MSR_K7_EVNTSEL2; - msrs->controls[3].addr = MSR_K7_EVNTSEL3; + int i; + + for (i=0; i < NUM_COUNTERS; i++) { + if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) + msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; + else + msrs->counters[i].addr = 0; + } + + for (i=0; i < NUM_CONTROLS; i++) { + if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) + msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; + else + msrs->controls[i].addr = 0; + } } @@ -59,19 +67,23 @@ static void athlon_setup_ctrs(struct op_msrs const * const msrs) /* clear all counters */ for (i = 0 ; i < NUM_CONTROLS; ++i) { + if (unlikely(!CTRL_IS_RESERVED(msrs,i))) + continue; CTRL_READ(low, high, msrs, i); CTRL_CLEAR(low); CTRL_WRITE(low, high, msrs, i); } - + /* avoid a false detection of ctr overflows in NMI handler */ for (i = 0; i < NUM_COUNTERS; ++i) { + if (unlikely(!CTR_IS_RESERVED(msrs,i))) + continue; CTR_WRITE(1, msrs, i); } /* enable active counters */ for (i = 0; i < NUM_COUNTERS; ++i) { - if (counter_config[i].enabled) { + if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs,i))) { reset_value[i] = counter_config[i].count; CTR_WRITE(counter_config[i].count, msrs, i); @@ -98,6 +110,8 @@ static int athlon_check_ctrs(struct pt_regs * const regs, int i; for (i = 0 ; i < NUM_COUNTERS; ++i) { + if (!reset_value[i]) + continue; CTR_READ(low, high, msrs, i); if (CTR_OVERFLOWED(low)) { oprofile_add_sample(regs, i); @@ -132,12 +146,27 @@ static void athlon_stop(struct op_msrs const * const msrs) /* Subtle: stop on all counters to avoid race with * setting our pm callback */ for (i = 0 ; i < NUM_COUNTERS ; ++i) { + if (!reset_value[i]) + continue; CTRL_READ(low, high, msrs, i); CTRL_SET_INACTIVE(low); CTRL_WRITE(low, high, msrs, i); } } +static void athlon_shutdown(struct op_msrs const * const msrs) +{ + int i; + + for (i = 0 ; i < NUM_COUNTERS ; ++i) { + if (CTR_IS_RESERVED(msrs,i)) + release_perfctr_nmi(MSR_K7_PERFCTR0 + i); + } + for (i = 0 ; i < NUM_CONTROLS ; ++i) { + if (CTRL_IS_RESERVED(msrs,i)) + release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + } +} struct op_x86_model_spec const op_athlon_spec = { .num_counters = NUM_COUNTERS, @@ -146,5 +175,6 @@ struct op_x86_model_spec const op_athlon_spec = { .setup_ctrs = &athlon_setup_ctrs, .check_ctrs = &athlon_check_ctrs, .start = &athlon_start, - .stop = &athlon_stop + .stop = &athlon_stop, + .shutdown = &athlon_shutdown }; diff --git a/arch/i386/oprofile/op_model_p4.c b/arch/i386/oprofile/op_model_p4.c index 7c61d357b82b..47925927b12f 100644 --- a/arch/i386/oprofile/op_model_p4.c +++ b/arch/i386/oprofile/op_model_p4.c @@ -32,7 +32,7 @@ #define NUM_CONTROLS_HT2 (NUM_ESCRS_HT2 + NUM_CCCRS_HT2) static unsigned int num_counters = NUM_COUNTERS_NON_HT; - +static unsigned int num_controls = NUM_CONTROLS_NON_HT; /* this has to be checked dynamically since the hyper-threadedness of a chip is discovered at @@ -40,8 +40,10 @@ static unsigned int num_counters = NUM_COUNTERS_NON_HT; static inline void setup_num_counters(void) { #ifdef CONFIG_SMP - if (smp_num_siblings == 2) + if (smp_num_siblings == 2){ num_counters = NUM_COUNTERS_HT2; + num_controls = NUM_CONTROLS_HT2; + } #endif } @@ -97,15 +99,6 @@ static struct p4_counter_binding p4_counters [NUM_COUNTERS_NON_HT] = { #define NUM_UNUSED_CCCRS NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT -/* All cccr we don't use. */ -static int p4_unused_cccr[NUM_UNUSED_CCCRS] = { - MSR_P4_BPU_CCCR1, MSR_P4_BPU_CCCR3, - MSR_P4_MS_CCCR1, MSR_P4_MS_CCCR3, - MSR_P4_FLAME_CCCR1, MSR_P4_FLAME_CCCR3, - MSR_P4_IQ_CCCR0, MSR_P4_IQ_CCCR1, - MSR_P4_IQ_CCCR2, MSR_P4_IQ_CCCR3 -}; - /* p4 event codes in libop/op_event.h are indices into this table. */ static struct p4_event_binding p4_events[NUM_EVENTS] = { @@ -372,6 +365,8 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { #define CCCR_OVF_P(cccr) ((cccr) & (1U<<31)) #define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31))) +#define CTRL_IS_RESERVED(msrs,c) (msrs->controls[(c)].addr ? 1 : 0) +#define CTR_IS_RESERVED(msrs,c) (msrs->counters[(c)].addr ? 1 : 0) #define CTR_READ(l,h,i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h));} while (0) #define CTR_WRITE(l,i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1);} while (0) #define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000)) @@ -401,29 +396,34 @@ static unsigned long reset_value[NUM_COUNTERS_NON_HT]; static void p4_fill_in_addresses(struct op_msrs * const msrs) { unsigned int i; - unsigned int addr, stag; + unsigned int addr, cccraddr, stag; setup_num_counters(); stag = get_stagger(); - /* the counter registers we pay attention to */ + /* initialize some registers */ for (i = 0; i < num_counters; ++i) { - msrs->counters[i].addr = - p4_counters[VIRT_CTR(stag, i)].counter_address; + msrs->counters[i].addr = 0; } - - /* FIXME: bad feeling, we don't save the 10 counters we don't use. */ - - /* 18 CCCR registers */ - for (i = 0, addr = MSR_P4_BPU_CCCR0 + stag; - addr <= MSR_P4_IQ_CCCR5; ++i, addr += addr_increment()) { - msrs->controls[i].addr = addr; + for (i = 0; i < num_controls; ++i) { + msrs->controls[i].addr = 0; } + /* the counter & cccr registers we pay attention to */ + for (i = 0; i < num_counters; ++i) { + addr = p4_counters[VIRT_CTR(stag, i)].counter_address; + cccraddr = p4_counters[VIRT_CTR(stag, i)].cccr_address; + if (reserve_perfctr_nmi(addr)){ + msrs->counters[i].addr = addr; + msrs->controls[i].addr = cccraddr; + } + } + /* 43 ESCR registers in three or four discontiguous group */ for (addr = MSR_P4_BSU_ESCR0 + stag; addr < MSR_P4_IQ_ESCR0; ++i, addr += addr_increment()) { - msrs->controls[i].addr = addr; + if (reserve_evntsel_nmi(addr)) + msrs->controls[i].addr = addr; } /* no IQ_ESCR0/1 on some models, we save a seconde time BSU_ESCR0/1 @@ -431,47 +431,57 @@ static void p4_fill_in_addresses(struct op_msrs * const msrs) if (boot_cpu_data.x86_model >= 0x3) { for (addr = MSR_P4_BSU_ESCR0 + stag; addr <= MSR_P4_BSU_ESCR1; ++i, addr += addr_increment()) { - msrs->controls[i].addr = addr; + if (reserve_evntsel_nmi(addr)) + msrs->controls[i].addr = addr; } } else { for (addr = MSR_P4_IQ_ESCR0 + stag; addr <= MSR_P4_IQ_ESCR1; ++i, addr += addr_increment()) { - msrs->controls[i].addr = addr; + if (reserve_evntsel_nmi(addr)) + msrs->controls[i].addr = addr; } } for (addr = MSR_P4_RAT_ESCR0 + stag; addr <= MSR_P4_SSU_ESCR0; ++i, addr += addr_increment()) { - msrs->controls[i].addr = addr; + if (reserve_evntsel_nmi(addr)) + msrs->controls[i].addr = addr; } for (addr = MSR_P4_MS_ESCR0 + stag; addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) { - msrs->controls[i].addr = addr; + if (reserve_evntsel_nmi(addr)) + msrs->controls[i].addr = addr; } for (addr = MSR_P4_IX_ESCR0 + stag; addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) { - msrs->controls[i].addr = addr; + if (reserve_evntsel_nmi(addr)) + msrs->controls[i].addr = addr; } /* there are 2 remaining non-contiguously located ESCRs */ if (num_counters == NUM_COUNTERS_NON_HT) { /* standard non-HT CPUs handle both remaining ESCRs*/ - msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; - msrs->controls[i++].addr = MSR_P4_CRU_ESCR4; + if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5)) + msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; + if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR4)) + msrs->controls[i++].addr = MSR_P4_CRU_ESCR4; } else if (stag == 0) { /* HT CPUs give the first remainder to the even thread, as the 32nd control register */ - msrs->controls[i++].addr = MSR_P4_CRU_ESCR4; + if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR4)) + msrs->controls[i++].addr = MSR_P4_CRU_ESCR4; } else { /* and two copies of the second to the odd thread, for the 22st and 23nd control registers */ - msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; - msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; + if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5)) { + msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; + msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; + } } } @@ -544,7 +554,6 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs) { unsigned int i; unsigned int low, high; - unsigned int addr; unsigned int stag; stag = get_stagger(); @@ -557,59 +566,24 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs) /* clear the cccrs we will use */ for (i = 0 ; i < num_counters ; i++) { + if (unlikely(!CTRL_IS_RESERVED(msrs,i))) + continue; rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); CCCR_CLEAR(low); CCCR_SET_REQUIRED_BITS(low); wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); } - /* clear cccrs outside our concern */ - for (i = stag ; i < NUM_UNUSED_CCCRS ; i += addr_increment()) { - rdmsr(p4_unused_cccr[i], low, high); - CCCR_CLEAR(low); - CCCR_SET_REQUIRED_BITS(low); - wrmsr(p4_unused_cccr[i], low, high); - } - /* clear all escrs (including those outside our concern) */ - for (addr = MSR_P4_BSU_ESCR0 + stag; - addr < MSR_P4_IQ_ESCR0; addr += addr_increment()) { - wrmsr(addr, 0, 0); - } - - /* On older models clear also MSR_P4_IQ_ESCR0/1 */ - if (boot_cpu_data.x86_model < 0x3) { - wrmsr(MSR_P4_IQ_ESCR0, 0, 0); - wrmsr(MSR_P4_IQ_ESCR1, 0, 0); - } - - for (addr = MSR_P4_RAT_ESCR0 + stag; - addr <= MSR_P4_SSU_ESCR0; ++i, addr += addr_increment()) { - wrmsr(addr, 0, 0); - } - - for (addr = MSR_P4_MS_ESCR0 + stag; - addr <= MSR_P4_TC_ESCR1; addr += addr_increment()){ - wrmsr(addr, 0, 0); - } - - for (addr = MSR_P4_IX_ESCR0 + stag; - addr <= MSR_P4_CRU_ESCR3; addr += addr_increment()){ - wrmsr(addr, 0, 0); + for (i = num_counters; i < num_controls; i++) { + if (unlikely(!CTRL_IS_RESERVED(msrs,i))) + continue; + wrmsr(msrs->controls[i].addr, 0, 0); } - if (num_counters == NUM_COUNTERS_NON_HT) { - wrmsr(MSR_P4_CRU_ESCR4, 0, 0); - wrmsr(MSR_P4_CRU_ESCR5, 0, 0); - } else if (stag == 0) { - wrmsr(MSR_P4_CRU_ESCR4, 0, 0); - } else { - wrmsr(MSR_P4_CRU_ESCR5, 0, 0); - } - /* setup all counters */ for (i = 0 ; i < num_counters ; ++i) { - if (counter_config[i].enabled) { + if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs,i))) { reset_value[i] = counter_config[i].count; pmc_setup_one_p4_counter(i); CTR_WRITE(counter_config[i].count, VIRT_CTR(stag, i)); @@ -696,12 +670,32 @@ static void p4_stop(struct op_msrs const * const msrs) stag = get_stagger(); for (i = 0; i < num_counters; ++i) { + if (!reset_value[i]) + continue; CCCR_READ(low, high, VIRT_CTR(stag, i)); CCCR_SET_DISABLE(low); CCCR_WRITE(low, high, VIRT_CTR(stag, i)); } } +static void p4_shutdown(struct op_msrs const * const msrs) +{ + int i; + + for (i = 0 ; i < num_counters ; ++i) { + if (CTR_IS_RESERVED(msrs,i)) + release_perfctr_nmi(msrs->counters[i].addr); + } + /* some of the control registers are specially reserved in + * conjunction with the counter registers (hence the starting offset). + * This saves a few bits. + */ + for (i = num_counters ; i < num_controls ; ++i) { + if (CTRL_IS_RESERVED(msrs,i)) + release_evntsel_nmi(msrs->controls[i].addr); + } +} + #ifdef CONFIG_SMP struct op_x86_model_spec const op_p4_ht2_spec = { @@ -711,7 +705,8 @@ struct op_x86_model_spec const op_p4_ht2_spec = { .setup_ctrs = &p4_setup_ctrs, .check_ctrs = &p4_check_ctrs, .start = &p4_start, - .stop = &p4_stop + .stop = &p4_stop, + .shutdown = &p4_shutdown }; #endif @@ -722,5 +717,6 @@ struct op_x86_model_spec const op_p4_spec = { .setup_ctrs = &p4_setup_ctrs, .check_ctrs = &p4_check_ctrs, .start = &p4_start, - .stop = &p4_stop + .stop = &p4_stop, + .shutdown = &p4_shutdown }; diff --git a/arch/i386/oprofile/op_model_ppro.c b/arch/i386/oprofile/op_model_ppro.c index 5c3ab4b027ad..f88e05ba8eb3 100644 --- a/arch/i386/oprofile/op_model_ppro.c +++ b/arch/i386/oprofile/op_model_ppro.c @@ -22,10 +22,12 @@ #define NUM_COUNTERS 2 #define NUM_CONTROLS 2 +#define CTR_IS_RESERVED(msrs,c) (msrs->counters[(c)].addr ? 1 : 0) #define CTR_READ(l,h,msrs,c) do {rdmsr(msrs->counters[(c)].addr, (l), (h));} while (0) #define CTR_WRITE(l,msrs,c) do {wrmsr(msrs->counters[(c)].addr, -(u32)(l), -1);} while (0) #define CTR_OVERFLOWED(n) (!((n) & (1U<<31))) +#define CTRL_IS_RESERVED(msrs,c) (msrs->controls[(c)].addr ? 1 : 0) #define CTRL_READ(l,h,msrs,c) do {rdmsr((msrs->controls[(c)].addr), (l), (h));} while (0) #define CTRL_WRITE(l,h,msrs,c) do {wrmsr((msrs->controls[(c)].addr), (l), (h));} while (0) #define CTRL_SET_ACTIVE(n) (n |= (1<<22)) @@ -41,11 +43,21 @@ static unsigned long reset_value[NUM_COUNTERS]; static void ppro_fill_in_addresses(struct op_msrs * const msrs) { - msrs->counters[0].addr = MSR_P6_PERFCTR0; - msrs->counters[1].addr = MSR_P6_PERFCTR1; + int i; + + for (i=0; i < NUM_COUNTERS; i++) { + if (reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i)) + msrs->counters[i].addr = MSR_P6_PERFCTR0 + i; + else + msrs->counters[i].addr = 0; + } - msrs->controls[0].addr = MSR_P6_EVNTSEL0; - msrs->controls[1].addr = MSR_P6_EVNTSEL1; + for (i=0; i < NUM_CONTROLS; i++) { + if (reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i)) + msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i; + else + msrs->controls[i].addr = 0; + } } @@ -56,6 +68,8 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs) /* clear all counters */ for (i = 0 ; i < NUM_CONTROLS; ++i) { + if (unlikely(!CTRL_IS_RESERVED(msrs,i))) + continue; CTRL_READ(low, high, msrs, i); CTRL_CLEAR(low); CTRL_WRITE(low, high, msrs, i); @@ -63,12 +77,14 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs) /* avoid a false detection of ctr overflows in NMI handler */ for (i = 0; i < NUM_COUNTERS; ++i) { + if (unlikely(!CTR_IS_RESERVED(msrs,i))) + continue; CTR_WRITE(1, msrs, i); } /* enable active counters */ for (i = 0; i < NUM_COUNTERS; ++i) { - if (counter_config[i].enabled) { + if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs,i))) { reset_value[i] = counter_config[i].count; CTR_WRITE(counter_config[i].count, msrs, i); @@ -81,6 +97,8 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs) CTRL_SET_UM(low, counter_config[i].unit_mask); CTRL_SET_EVENT(low, counter_config[i].event); CTRL_WRITE(low, high, msrs, i); + } else { + reset_value[i] = 0; } } } @@ -93,6 +111,8 @@ static int ppro_check_ctrs(struct pt_regs * const regs, int i; for (i = 0 ; i < NUM_COUNTERS; ++i) { + if (!reset_value[i]) + continue; CTR_READ(low, high, msrs, i); if (CTR_OVERFLOWED(low)) { oprofile_add_sample(regs, i); @@ -118,18 +138,38 @@ static int ppro_check_ctrs(struct pt_regs * const regs, static void ppro_start(struct op_msrs const * const msrs) { unsigned int low,high; - CTRL_READ(low, high, msrs, 0); - CTRL_SET_ACTIVE(low); - CTRL_WRITE(low, high, msrs, 0); + + if (reset_value[0]) { + CTRL_READ(low, high, msrs, 0); + CTRL_SET_ACTIVE(low); + CTRL_WRITE(low, high, msrs, 0); + } } static void ppro_stop(struct op_msrs const * const msrs) { unsigned int low,high; - CTRL_READ(low, high, msrs, 0); - CTRL_SET_INACTIVE(low); - CTRL_WRITE(low, high, msrs, 0); + + if (reset_value[0]) { + CTRL_READ(low, high, msrs, 0); + CTRL_SET_INACTIVE(low); + CTRL_WRITE(low, high, msrs, 0); + } +} + +static void ppro_shutdown(struct op_msrs const * const msrs) +{ + int i; + + for (i = 0 ; i < NUM_COUNTERS ; ++i) { + if (CTR_IS_RESERVED(msrs,i)) + release_perfctr_nmi(MSR_P6_PERFCTR0 + i); + } + for (i = 0 ; i < NUM_CONTROLS ; ++i) { + if (CTRL_IS_RESERVED(msrs,i)) + release_evntsel_nmi(MSR_P6_EVNTSEL0 + i); + } } @@ -140,5 +180,6 @@ struct op_x86_model_spec const op_ppro_spec = { .setup_ctrs = &ppro_setup_ctrs, .check_ctrs = &ppro_check_ctrs, .start = &ppro_start, - .stop = &ppro_stop + .stop = &ppro_stop, + .shutdown = &ppro_shutdown }; diff --git a/arch/i386/oprofile/op_x86_model.h b/arch/i386/oprofile/op_x86_model.h index 123b7e90a9ee..abb1aa95b979 100644 --- a/arch/i386/oprofile/op_x86_model.h +++ b/arch/i386/oprofile/op_x86_model.h @@ -40,6 +40,7 @@ struct op_x86_model_spec { struct op_msrs const * const msrs); void (*start)(struct op_msrs const * const msrs); void (*stop)(struct op_msrs const * const msrs); + void (*shutdown)(struct op_msrs const * const msrs); }; extern struct op_x86_model_spec const op_ppro_spec; -- cgit v1.2.1 From f2802e7f571c05f9a901b1f5bd144aa730ccc88e Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Tue, 26 Sep 2006 10:52:26 +0200 Subject: [PATCH] Add SMP support on x86_64 to reservation framework This patch includes the changes to make the nmi watchdog on x86_64 SMP aware. A bunch of code was moved around to make it simpler to read. In addition, it is now possible to determine if a particular NMI was the result of the watchdog or not. This feature allows the kernel to filter out unknown NMIs easier. Signed-off-by: Don Zickus Signed-off-by: Andi Kleen --- arch/x86_64/kernel/apic.c | 3 +- arch/x86_64/kernel/nmi.c | 426 +++++++++++++++++++++++++++++++--------------- include/asm-x86_64/nmi.h | 3 +- 3 files changed, 290 insertions(+), 142 deletions(-) diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 2b8cef037a65..692e9b579743 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -479,8 +479,7 @@ void __cpuinit setup_local_APIC (void) } nmi_watchdog_default(); - if (nmi_watchdog == NMI_LOCAL_APIC) - setup_apic_nmi_watchdog(); + setup_apic_nmi_watchdog(NULL); apic_pm_activate(); } diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index b7a7c9973849..d42374a952d7 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -56,53 +56,29 @@ static unsigned int lapic_nmi_owner; #define LAPIC_NMI_RESERVED (1<<1) /* nmi_active: - * +1: the lapic NMI watchdog is active, but can be disabled - * 0: the lapic NMI watchdog has not been set up, and cannot + * >0: the lapic NMI watchdog is active, but can be disabled + * <0: the lapic NMI watchdog has not been set up, and cannot * be enabled - * -1: the lapic NMI watchdog is disabled, but can be enabled + * 0: the lapic NMI watchdog is disabled, but can be enabled */ -int nmi_active; /* oprofile uses this */ +atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ int panic_on_timeout; unsigned int nmi_watchdog = NMI_DEFAULT; static unsigned int nmi_hz = HZ; -static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ -static unsigned int nmi_p4_cccr_val; -/* Note that these events don't tick when the CPU idles. This means - the frequency varies with CPU load. */ - -#define K7_EVNTSEL_ENABLE (1 << 22) -#define K7_EVNTSEL_INT (1 << 20) -#define K7_EVNTSEL_OS (1 << 17) -#define K7_EVNTSEL_USR (1 << 16) -#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 -#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING +struct nmi_watchdog_ctlblk { + int enabled; + u64 check_bit; + unsigned int cccr_msr; + unsigned int perfctr_msr; /* the MSR to reset in NMI handler */ + unsigned int evntsel_msr; /* the MSR to select the events to handle */ +}; +static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk); -#define MSR_P4_MISC_ENABLE 0x1A0 -#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) -#define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL (1<<12) -#define MSR_P4_PERFCTR0 0x300 -#define MSR_P4_CCCR0 0x360 -#define P4_ESCR_EVENT_SELECT(N) ((N)<<25) -#define P4_ESCR_OS (1<<3) -#define P4_ESCR_USR (1<<2) -#define P4_CCCR_OVF_PMI0 (1<<26) -#define P4_CCCR_OVF_PMI1 (1<<27) -#define P4_CCCR_THRESHOLD(N) ((N)<<20) -#define P4_CCCR_COMPLEMENT (1<<19) -#define P4_CCCR_COMPARE (1<<18) -#define P4_CCCR_REQUIRED (3<<16) -#define P4_CCCR_ESCR_SELECT(N) ((N)<<13) -#define P4_CCCR_ENABLE (1<<12) -/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter - CRU_ESCR0 (with any non-null event selector) through a complemented - max threshold. [IA32-Vol3, Section 14.9.9] */ -#define MSR_P4_IQ_COUNTER0 0x30C -#define P4_NMI_CRU_ESCR0 (P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS|P4_ESCR_USR) -#define P4_NMI_IQ_CCCR0 \ - (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \ - P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE) +/* local prototypes */ +static void stop_apic_nmi_watchdog(void *unused); +static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu); /* converts an msr to an appropriate reservation bit */ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) @@ -241,6 +217,12 @@ int __init check_nmi_watchdog (void) int *counts; int cpu; + if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DEFAULT)) + return 0; + + if (!atomic_read(&nmi_active)) + return 0; + counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); if (!counts) return -1; @@ -258,19 +240,22 @@ int __init check_nmi_watchdog (void) mdelay((10*1000)/nmi_hz); // wait 10 ticks for_each_online_cpu(cpu) { + if (!per_cpu(nmi_watchdog_ctlblk, cpu).enabled) + continue; if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) { - endflag = 1; printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n", cpu, counts[cpu], cpu_pda(cpu)->__nmi_count); - nmi_active = 0; - lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG; - nmi_perfctr_msr = 0; - kfree(counts); - return -1; + per_cpu(nmi_watchdog_ctlblk, cpu).enabled = 0; + atomic_dec(&nmi_active); } } + if (!atomic_read(&nmi_active)) { + kfree(counts); + atomic_set(&nmi_active, -1); + return -1; + } endflag = 1; printk("OK.\n"); @@ -297,8 +282,11 @@ int __init setup_nmi_watchdog(char *str) get_option(&str, &nmi); - if (nmi >= NMI_INVALID) + if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE)) return 0; + + if ((nmi == NMI_LOCAL_APIC) && (nmi_known_cpu() == 0)) + return 0; /* no lapic support */ nmi_watchdog = nmi; return 1; } @@ -307,31 +295,30 @@ __setup("nmi_watchdog=", setup_nmi_watchdog); static void disable_lapic_nmi_watchdog(void) { - if (nmi_active <= 0) + BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); + + if (atomic_read(&nmi_active) <= 0) return; - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - wrmsr(MSR_K7_EVNTSEL0, 0, 0); - break; - case X86_VENDOR_INTEL: - if (boot_cpu_data.x86 == 15) { - wrmsr(MSR_P4_IQ_CCCR0, 0, 0); - wrmsr(MSR_P4_CRU_ESCR0, 0, 0); - } - break; - } - nmi_active = -1; - /* tell do_nmi() and others that we're not active any more */ - nmi_watchdog = 0; + + on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1); + + BUG_ON(atomic_read(&nmi_active) != 0); } static void enable_lapic_nmi_watchdog(void) { - if (nmi_active < 0) { - nmi_watchdog = NMI_LOCAL_APIC; - touch_nmi_watchdog(); - setup_apic_nmi_watchdog(); - } + BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); + + /* are we already enabled */ + if (atomic_read(&nmi_active) != 0) + return; + + /* are we lapic aware */ + if (nmi_known_cpu() <= 0) + return; + + on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); + touch_nmi_watchdog(); } int reserve_lapic_nmi(void) @@ -363,21 +350,24 @@ void release_lapic_nmi(void) void disable_timer_nmi_watchdog(void) { - if ((nmi_watchdog != NMI_IO_APIC) || (nmi_active <= 0)) + BUG_ON(nmi_watchdog != NMI_IO_APIC); + + if (atomic_read(&nmi_active) <= 0) return; disable_irq(0); - unset_nmi_callback(); - nmi_active = -1; - nmi_watchdog = NMI_NONE; + on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1); + + BUG_ON(atomic_read(&nmi_active) != 0); } void enable_timer_nmi_watchdog(void) { - if (nmi_active < 0) { - nmi_watchdog = NMI_IO_APIC; + BUG_ON(nmi_watchdog != NMI_IO_APIC); + + if (atomic_read(&nmi_active) == 0) { touch_nmi_watchdog(); - nmi_active = 1; + on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); enable_irq(0); } } @@ -388,7 +378,7 @@ static int nmi_pm_active; /* nmi_active before suspend */ static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) { - nmi_pm_active = nmi_active; + nmi_pm_active = atomic_read(&nmi_active); disable_lapic_nmi_watchdog(); return 0; } @@ -396,7 +386,7 @@ static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) static int lapic_nmi_resume(struct sys_device *dev) { if (nmi_pm_active > 0) - enable_lapic_nmi_watchdog(); + enable_lapic_nmi_watchdog(); return 0; } @@ -415,7 +405,13 @@ static int __init init_lapic_nmi_sysfs(void) { int error; - if (nmi_active == 0 || nmi_watchdog != NMI_LOCAL_APIC) + /* should really be a BUG_ON but b/c this is an + * init call, it just doesn't work. -dcz + */ + if (nmi_watchdog != NMI_LOCAL_APIC) + return 0; + + if ( atomic_read(&nmi_active) < 0 ) return 0; error = sysdev_class_register(&nmi_sysclass); @@ -428,100 +424,232 @@ late_initcall(init_lapic_nmi_sysfs); #endif /* CONFIG_PM */ +/* + * Activate the NMI watchdog via the local APIC. + * Original code written by Keith Owens. + */ + +/* Note that these events don't tick when the CPU idles. This means + the frequency varies with CPU load. */ + +#define K7_EVNTSEL_ENABLE (1 << 22) +#define K7_EVNTSEL_INT (1 << 20) +#define K7_EVNTSEL_OS (1 << 17) +#define K7_EVNTSEL_USR (1 << 16) +#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 +#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING + static int setup_k7_watchdog(void) { + unsigned int perfctr_msr, evntsel_msr; unsigned int evntsel; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - nmi_perfctr_msr = MSR_K7_PERFCTR0; - - if (!reserve_perfctr_nmi(nmi_perfctr_msr)) + perfctr_msr = MSR_K7_PERFCTR0; + evntsel_msr = MSR_K7_EVNTSEL0; + if (!reserve_perfctr_nmi(perfctr_msr)) goto fail; - if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0)) + if (!reserve_evntsel_nmi(evntsel_msr)) goto fail1; /* Simulator may not support it */ - if (checking_wrmsrl(MSR_K7_EVNTSEL0, 0UL)) + if (checking_wrmsrl(evntsel_msr, 0UL)) goto fail2; - wrmsrl(MSR_K7_PERFCTR0, 0UL); + wrmsrl(perfctr_msr, 0UL); evntsel = K7_EVNTSEL_INT | K7_EVNTSEL_OS | K7_EVNTSEL_USR | K7_NMI_EVENT; - wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); - wrmsrl(MSR_K7_PERFCTR0, -((u64)cpu_khz * 1000 / nmi_hz)); + /* setup the timer */ + wrmsr(evntsel_msr, evntsel, 0); + wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= K7_EVNTSEL_ENABLE; - wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); + wrmsr(evntsel_msr, evntsel, 0); + + wd->perfctr_msr = perfctr_msr; + wd->evntsel_msr = evntsel_msr; + wd->cccr_msr = 0; //unused + wd->check_bit = 1ULL<<63; return 1; fail2: - release_evntsel_nmi(MSR_K7_EVNTSEL0); + release_evntsel_nmi(evntsel_msr); fail1: - release_perfctr_nmi(nmi_perfctr_msr); + release_perfctr_nmi(perfctr_msr); fail: return 0; } +static void stop_k7_watchdog(void) +{ + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + wrmsr(wd->evntsel_msr, 0, 0); + + release_evntsel_nmi(wd->evntsel_msr); + release_perfctr_nmi(wd->perfctr_msr); +} + +/* Note that these events don't tick when the CPU idles. This means + the frequency varies with CPU load. */ + +#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) +#define P4_ESCR_EVENT_SELECT(N) ((N)<<25) +#define P4_ESCR_OS (1<<3) +#define P4_ESCR_USR (1<<2) +#define P4_CCCR_OVF_PMI0 (1<<26) +#define P4_CCCR_OVF_PMI1 (1<<27) +#define P4_CCCR_THRESHOLD(N) ((N)<<20) +#define P4_CCCR_COMPLEMENT (1<<19) +#define P4_CCCR_COMPARE (1<<18) +#define P4_CCCR_REQUIRED (3<<16) +#define P4_CCCR_ESCR_SELECT(N) ((N)<<13) +#define P4_CCCR_ENABLE (1<<12) +#define P4_CCCR_OVF (1<<31) +/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter + CRU_ESCR0 (with any non-null event selector) through a complemented + max threshold. [IA32-Vol3, Section 14.9.9] */ static int setup_p4_watchdog(void) { + unsigned int perfctr_msr, evntsel_msr, cccr_msr; + unsigned int evntsel, cccr_val; unsigned int misc_enable, dummy; + unsigned int ht_num; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - rdmsr(MSR_P4_MISC_ENABLE, misc_enable, dummy); + rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy); if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL)) return 0; - nmi_perfctr_msr = MSR_P4_IQ_COUNTER0; - nmi_p4_cccr_val = P4_NMI_IQ_CCCR0; #ifdef CONFIG_SMP - if (smp_num_siblings == 2) - nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1; + /* detect which hyperthread we are on */ + if (smp_num_siblings == 2) { + unsigned int ebx, apicid; + + ebx = cpuid_ebx(1); + apicid = (ebx >> 24) & 0xff; + ht_num = apicid & 1; + } else #endif + ht_num = 0; + + /* performance counters are shared resources + * assign each hyperthread its own set + * (re-use the ESCR0 register, seems safe + * and keeps the cccr_val the same) + */ + if (!ht_num) { + /* logical cpu 0 */ + perfctr_msr = MSR_P4_IQ_PERFCTR0; + evntsel_msr = MSR_P4_CRU_ESCR0; + cccr_msr = MSR_P4_IQ_CCCR0; + cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4); + } else { + /* logical cpu 1 */ + perfctr_msr = MSR_P4_IQ_PERFCTR1; + evntsel_msr = MSR_P4_CRU_ESCR0; + cccr_msr = MSR_P4_IQ_CCCR1; + cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4); + } - if (!reserve_perfctr_nmi(nmi_perfctr_msr)) + if (!reserve_perfctr_nmi(perfctr_msr)) goto fail; - if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0)) + if (!reserve_evntsel_nmi(evntsel_msr)) goto fail1; - wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0); - wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0); - Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz * 1000UL / nmi_hz)); - wrmsrl(MSR_P4_IQ_COUNTER0, -((u64)cpu_khz * 1000 / nmi_hz)); + evntsel = P4_ESCR_EVENT_SELECT(0x3F) + | P4_ESCR_OS + | P4_ESCR_USR; + + cccr_val |= P4_CCCR_THRESHOLD(15) + | P4_CCCR_COMPLEMENT + | P4_CCCR_COMPARE + | P4_CCCR_REQUIRED; + + wrmsr(evntsel_msr, evntsel, 0); + wrmsr(cccr_msr, cccr_val, 0); + wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); apic_write(APIC_LVTPC, APIC_DM_NMI); - wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); + cccr_val |= P4_CCCR_ENABLE; + wrmsr(cccr_msr, cccr_val, 0); + + wd->perfctr_msr = perfctr_msr; + wd->evntsel_msr = evntsel_msr; + wd->cccr_msr = cccr_msr; + wd->check_bit = 1ULL<<39; return 1; fail1: - release_perfctr_nmi(nmi_perfctr_msr); + release_perfctr_nmi(perfctr_msr); fail: return 0; } -void setup_apic_nmi_watchdog(void) +static void stop_p4_watchdog(void) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - if (boot_cpu_data.x86 != 15) - return; - if (strstr(boot_cpu_data.x86_model_id, "Screwdriver")) - return; - if (!setup_k7_watchdog()) - return; - break; - case X86_VENDOR_INTEL: - if (boot_cpu_data.x86 != 15) - return; - if (!setup_p4_watchdog()) + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + wrmsr(wd->cccr_msr, 0, 0); + wrmsr(wd->evntsel_msr, 0, 0); + + release_evntsel_nmi(wd->evntsel_msr); + release_perfctr_nmi(wd->perfctr_msr); +} + +void setup_apic_nmi_watchdog(void *unused) +{ + /* only support LOCAL and IO APICs for now */ + if ((nmi_watchdog != NMI_LOCAL_APIC) && + (nmi_watchdog != NMI_IO_APIC)) + return; + + if (nmi_watchdog == NMI_LOCAL_APIC) { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + if (strstr(boot_cpu_data.x86_model_id, "Screwdriver")) + return; + if (!setup_k7_watchdog()) + return; + break; + case X86_VENDOR_INTEL: + if (!setup_p4_watchdog()) + return; + break; + default: return; - break; + } + } + __get_cpu_var(nmi_watchdog_ctlblk.enabled) = 1; + atomic_inc(&nmi_active); +} - default: - return; +static void stop_apic_nmi_watchdog(void *unused) +{ + /* only support LOCAL and IO APICs for now */ + if ((nmi_watchdog != NMI_LOCAL_APIC) && + (nmi_watchdog != NMI_IO_APIC)) + return; + + if (nmi_watchdog == NMI_LOCAL_APIC) { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + if (strstr(boot_cpu_data.x86_model_id, "Screwdriver")) + return; + stop_k7_watchdog(); + break; + case X86_VENDOR_INTEL: + stop_p4_watchdog(); + break; + default: + return; + } } - lapic_nmi_owner = LAPIC_NMI_WATCHDOG; - nmi_active = 1; + __get_cpu_var(nmi_watchdog_ctlblk.enabled) = 0; + atomic_dec(&nmi_active); } /* @@ -558,50 +686,70 @@ void __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) { int sum; int touched = 0; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + u64 dummy; + + /* check for other users first */ + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) { + touched = 1; + } sum = read_pda(apic_timer_irqs); if (__get_cpu_var(nmi_touch)) { __get_cpu_var(nmi_touch) = 0; touched = 1; } + #ifdef CONFIG_X86_MCE /* Could check oops_in_progress here too, but it's safer not too */ if (atomic_read(&mce_entry) > 0) touched = 1; #endif + /* if the apic timer isn't firing, this cpu isn't doing much */ if (!touched && __get_cpu_var(last_irq_sum) == sum) { /* * Ayiee, looks like this CPU is stuck ... * wait a few IRQs (5 seconds) before doing the oops ... */ local_inc(&__get_cpu_var(alert_counter)); - if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) { - if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) { - local_set(&__get_cpu_var(alert_counter), 0); - return; - } + if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs); - } } else { __get_cpu_var(last_irq_sum) = sum; local_set(&__get_cpu_var(alert_counter), 0); } - if (nmi_perfctr_msr) { - if (nmi_perfctr_msr == MSR_P4_IQ_COUNTER0) { - /* - * P4 quirks: - * - An overflown perfctr will assert its interrupt - * until the OVF flag in its CCCR is cleared. - * - LVTPC is masked on interrupt and must be - * unmasked by the LVTPC handler. - */ - wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); - apic_write(APIC_LVTPC, APIC_DM_NMI); - } - wrmsrl(nmi_perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); + + /* see if the nmi watchdog went off */ + if (wd->enabled) { + if (nmi_watchdog == NMI_LOCAL_APIC) { + rdmsrl(wd->perfctr_msr, dummy); + if (dummy & wd->check_bit){ + /* this wasn't a watchdog timer interrupt */ + goto done; + } + + /* only Intel uses the cccr msr */ + if (wd->cccr_msr != 0) { + /* + * P4 quirks: + * - An overflown perfctr will assert its interrupt + * until the OVF flag in its CCCR is cleared. + * - LVTPC is masked on interrupt and must be + * unmasked by the LVTPC handler. + */ + rdmsrl(wd->cccr_msr, dummy); + dummy &= ~P4_CCCR_OVF; + wrmsrl(wd->cccr_msr, dummy); + apic_write(APIC_LVTPC, APIC_DM_NMI); + } + /* start the cycle over again */ + wrmsrl(wd->perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); + } } +done: + return; } static __kprobes int dummy_nmi_callback(struct pt_regs * regs, int cpu) diff --git a/include/asm-x86_64/nmi.h b/include/asm-x86_64/nmi.h index 62a784cb8f0c..5918136fd853 100644 --- a/include/asm-x86_64/nmi.h +++ b/include/asm-x86_64/nmi.h @@ -63,7 +63,7 @@ extern void release_perfctr_nmi(unsigned int); extern int reserve_evntsel_nmi(unsigned int); extern void release_evntsel_nmi(unsigned int); -extern void setup_apic_nmi_watchdog (void); +extern void setup_apic_nmi_watchdog (void *); extern int reserve_lapic_nmi(void); extern void release_lapic_nmi(void); extern void disable_timer_nmi_watchdog(void); @@ -73,6 +73,7 @@ extern void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason); extern void nmi_watchdog_default(void); extern int setup_nmi_watchdog(char *); +extern atomic_t nmi_active; extern unsigned int nmi_watchdog; #define NMI_DEFAULT -1 #define NMI_NONE 0 -- cgit v1.2.1 From b7471c6da94d30d3deadc55986cc38d1ff57f9ca Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Tue, 26 Sep 2006 10:52:26 +0200 Subject: [PATCH] i386: Add SMP support on i386 to reservation framework This patch includes the changes to make the nmi watchdog on i386 SMP aware. A bunch of code was moved around to make it simpler to read. In addition, it is now possible to determine if a particular NMI was the result of the watchdog or not. This feature allows the kernel to filter out unknown NMIs easier. Signed-off-by: Don Zickus Signed-off-by: Andi Kleen --- arch/i386/kernel/apic.c | 3 +- arch/i386/kernel/nmi.c | 537 ++++++++++++++++++++++++------------- arch/i386/kernel/traps.c | 2 +- arch/i386/oprofile/nmi_timer_int.c | 4 +- include/asm-i386/nmi.h | 5 +- 5 files changed, 360 insertions(+), 191 deletions(-) diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index 8c844d07862f..1a34fc57800b 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -586,8 +586,7 @@ void __devinit setup_local_APIC(void) printk("No ESR for 82489DX.\n"); } - if (nmi_watchdog == NMI_LOCAL_APIC) - setup_apic_nmi_watchdog(); + setup_apic_nmi_watchdog(NULL); apic_pm_activate(); } diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 5d58dfeacd59..d88004343034 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -24,16 +24,10 @@ #include #include +#include #include "mach_traps.h" -unsigned int nmi_watchdog = NMI_NONE; -extern int unknown_nmi_panic; -static unsigned int nmi_hz = HZ; -static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ -static unsigned int nmi_p4_cccr_val; -extern void show_registers(struct pt_regs *regs); - /* perfctr_nmi_owner tracks the ownership of the perfctr registers: * evtsel_nmi_owner tracks the ownership of the event selection * - different performance counters/ event selection may be reserved for @@ -63,51 +57,31 @@ static unsigned int lapic_nmi_owner; #define LAPIC_NMI_RESERVED (1<<1) /* nmi_active: - * +1: the lapic NMI watchdog is active, but can be disabled - * 0: the lapic NMI watchdog has not been set up, and cannot + * >0: the lapic NMI watchdog is active, but can be disabled + * <0: the lapic NMI watchdog has not been set up, and cannot * be enabled - * -1: the lapic NMI watchdog is disabled, but can be enabled + * 0: the lapic NMI watchdog is disabled, but can be enabled */ -int nmi_active; +atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ -#define K7_EVNTSEL_ENABLE (1 << 22) -#define K7_EVNTSEL_INT (1 << 20) -#define K7_EVNTSEL_OS (1 << 17) -#define K7_EVNTSEL_USR (1 << 16) -#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 -#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING +unsigned int nmi_watchdog = NMI_DEFAULT; +static unsigned int nmi_hz = HZ; -#define P6_EVNTSEL0_ENABLE (1 << 22) -#define P6_EVNTSEL_INT (1 << 20) -#define P6_EVNTSEL_OS (1 << 17) -#define P6_EVNTSEL_USR (1 << 16) -#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79 -#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED +struct nmi_watchdog_ctlblk { + int enabled; + u64 check_bit; + unsigned int cccr_msr; + unsigned int perfctr_msr; /* the MSR to reset in NMI handler */ + unsigned int evntsel_msr; /* the MSR to select the events to handle */ +}; +static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk); -#define MSR_P4_MISC_ENABLE 0x1A0 -#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) -#define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL (1<<12) -#define MSR_P4_PERFCTR0 0x300 -#define MSR_P4_CCCR0 0x360 -#define P4_ESCR_EVENT_SELECT(N) ((N)<<25) -#define P4_ESCR_OS (1<<3) -#define P4_ESCR_USR (1<<2) -#define P4_CCCR_OVF_PMI0 (1<<26) -#define P4_CCCR_OVF_PMI1 (1<<27) -#define P4_CCCR_THRESHOLD(N) ((N)<<20) -#define P4_CCCR_COMPLEMENT (1<<19) -#define P4_CCCR_COMPARE (1<<18) -#define P4_CCCR_REQUIRED (3<<16) -#define P4_CCCR_ESCR_SELECT(N) ((N)<<13) -#define P4_CCCR_ENABLE (1<<12) -/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter - CRU_ESCR0 (with any non-null event selector) through a complemented - max threshold. [IA32-Vol3, Section 14.9.9] */ -#define MSR_P4_IQ_COUNTER0 0x30C -#define P4_NMI_CRU_ESCR0 (P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS|P4_ESCR_USR) -#define P4_NMI_IQ_CCCR0 \ - (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \ - P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE) +/* local prototypes */ +static void stop_apic_nmi_watchdog(void *unused); +static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu); + +extern void show_registers(struct pt_regs *regs); +extern int unknown_nmi_panic; /* converts an msr to an appropriate reservation bit */ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) @@ -208,6 +182,17 @@ void release_evntsel_nmi(unsigned int msr) clear_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0]); } +static __cpuinit inline int nmi_known_cpu(void) +{ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6)); + case X86_VENDOR_INTEL: + return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6)); + } + return 0; +} + #ifdef CONFIG_SMP /* The performance counters used by NMI_LOCAL_APIC don't trigger when * the CPU is idle. To make sure the NMI watchdog really ticks on all @@ -234,7 +219,10 @@ static int __init check_nmi_watchdog(void) unsigned int *prev_nmi_count; int cpu; - if (nmi_watchdog == NMI_NONE) + if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DEFAULT)) + return 0; + + if (!atomic_read(&nmi_active)) return 0; prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); @@ -258,18 +246,22 @@ static int __init check_nmi_watchdog(void) if (!cpu_isset(cpu, cpu_callin_map)) continue; #endif + if (!per_cpu(nmi_watchdog_ctlblk, cpu).enabled) + continue; if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { - endflag = 1; printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n", cpu, prev_nmi_count[cpu], nmi_count(cpu)); - nmi_active = 0; - lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG; - kfree(prev_nmi_count); - return -1; + per_cpu(nmi_watchdog_ctlblk, cpu).enabled = 0; + atomic_dec(&nmi_active); } } + if (!atomic_read(&nmi_active)) { + kfree(prev_nmi_count); + atomic_set(&nmi_active, -1); + return -1; + } endflag = 1; printk("OK.\n"); @@ -290,31 +282,16 @@ static int __init setup_nmi_watchdog(char *str) get_option(&str, &nmi); - if (nmi >= NMI_INVALID) + if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE)) return 0; - if (nmi == NMI_NONE) - nmi_watchdog = nmi; /* * If any other x86 CPU has a local APIC, then * please test the NMI stuff there and send me the * missing bits. Right now Intel P6/P4 and AMD K7 only. */ - if ((nmi == NMI_LOCAL_APIC) && - (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && - (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15)) - nmi_watchdog = nmi; - if ((nmi == NMI_LOCAL_APIC) && - (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && - (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15)) - nmi_watchdog = nmi; - /* - * We can enable the IO-APIC watchdog - * unconditionally. - */ - if (nmi == NMI_IO_APIC) { - nmi_active = 1; - nmi_watchdog = nmi; - } + if ((nmi == NMI_LOCAL_APIC) && (nmi_known_cpu() == 0)) + return 0; /* no lapic support */ + nmi_watchdog = nmi; return 1; } @@ -322,41 +299,30 @@ __setup("nmi_watchdog=", setup_nmi_watchdog); static void disable_lapic_nmi_watchdog(void) { - if (nmi_active <= 0) + BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); + + if (atomic_read(&nmi_active) <= 0) return; - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - wrmsr(MSR_K7_EVNTSEL0, 0, 0); - break; - case X86_VENDOR_INTEL: - switch (boot_cpu_data.x86) { - case 6: - if (boot_cpu_data.x86_model > 0xd) - break; - wrmsr(MSR_P6_EVNTSEL0, 0, 0); - break; - case 15: - if (boot_cpu_data.x86_model > 0x4) - break; + on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1); - wrmsr(MSR_P4_IQ_CCCR0, 0, 0); - wrmsr(MSR_P4_CRU_ESCR0, 0, 0); - break; - } - break; - } - nmi_active = -1; - /* tell do_nmi() and others that we're not active any more */ - nmi_watchdog = 0; + BUG_ON(atomic_read(&nmi_active) != 0); } static void enable_lapic_nmi_watchdog(void) { - if (nmi_active < 0) { - nmi_watchdog = NMI_LOCAL_APIC; - setup_apic_nmi_watchdog(); - } + BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); + + /* are we already enabled */ + if (atomic_read(&nmi_active) != 0) + return; + + /* are we lapic aware */ + if (nmi_known_cpu() <= 0) + return; + + on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); + touch_nmi_watchdog(); } int reserve_lapic_nmi(void) @@ -388,20 +354,25 @@ void release_lapic_nmi(void) void disable_timer_nmi_watchdog(void) { - if ((nmi_watchdog != NMI_IO_APIC) || (nmi_active <= 0)) + BUG_ON(nmi_watchdog != NMI_IO_APIC); + + if (atomic_read(&nmi_active) <= 0) return; - unset_nmi_callback(); - nmi_active = -1; - nmi_watchdog = NMI_NONE; + disable_irq(0); + on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1); + + BUG_ON(atomic_read(&nmi_active) != 0); } void enable_timer_nmi_watchdog(void) { - if (nmi_active < 0) { - nmi_watchdog = NMI_IO_APIC; + BUG_ON(nmi_watchdog != NMI_IO_APIC); + + if (atomic_read(&nmi_active) == 0) { touch_nmi_watchdog(); - nmi_active = 1; + on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); + enable_irq(0); } } @@ -411,7 +382,7 @@ static int nmi_pm_active; /* nmi_active before suspend */ static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) { - nmi_pm_active = nmi_active; + nmi_pm_active = atomic_read(&nmi_active); disable_lapic_nmi_watchdog(); return 0; } @@ -439,7 +410,13 @@ static int __init init_lapic_nmi_sysfs(void) { int error; - if (nmi_active == 0 || nmi_watchdog != NMI_LOCAL_APIC) + /* should really be a BUG_ON but b/c this is an + * init call, it just doesn't work. -dcz + */ + if (nmi_watchdog != NMI_LOCAL_APIC) + return 0; + + if ( atomic_read(&nmi_active) < 0 ) return 0; error = sysdev_class_register(&nmi_sysclass); @@ -457,143 +434,312 @@ late_initcall(init_lapic_nmi_sysfs); * Original code written by Keith Owens. */ -static void write_watchdog_counter(const char *descr) +static void write_watchdog_counter(unsigned int perfctr_msr, const char *descr) { u64 count = (u64)cpu_khz * 1000; do_div(count, nmi_hz); if(descr) Dprintk("setting %s to -0x%08Lx\n", descr, count); - wrmsrl(nmi_perfctr_msr, 0 - count); + wrmsrl(perfctr_msr, 0 - count); } +/* Note that these events don't tick when the CPU idles. This means + the frequency varies with CPU load. */ + +#define K7_EVNTSEL_ENABLE (1 << 22) +#define K7_EVNTSEL_INT (1 << 20) +#define K7_EVNTSEL_OS (1 << 17) +#define K7_EVNTSEL_USR (1 << 16) +#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 +#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING + static int setup_k7_watchdog(void) { + unsigned int perfctr_msr, evntsel_msr; unsigned int evntsel; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - nmi_perfctr_msr = MSR_K7_PERFCTR0; - - if (!reserve_perfctr_nmi(nmi_perfctr_msr)) + perfctr_msr = MSR_K7_PERFCTR0; + evntsel_msr = MSR_K7_EVNTSEL0; + if (!reserve_perfctr_nmi(perfctr_msr)) goto fail; - if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0)) + if (!reserve_evntsel_nmi(evntsel_msr)) goto fail1; - wrmsrl(MSR_K7_PERFCTR0, 0UL); + wrmsrl(perfctr_msr, 0UL); evntsel = K7_EVNTSEL_INT | K7_EVNTSEL_OS | K7_EVNTSEL_USR | K7_NMI_EVENT; - wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); - write_watchdog_counter("K7_PERFCTR0"); + /* setup the timer */ + wrmsr(evntsel_msr, evntsel, 0); + write_watchdog_counter(perfctr_msr, "K7_PERFCTR0"); apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= K7_EVNTSEL_ENABLE; - wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); + wrmsr(evntsel_msr, evntsel, 0); + + wd->perfctr_msr = perfctr_msr; + wd->evntsel_msr = evntsel_msr; + wd->cccr_msr = 0; //unused + wd->check_bit = 1ULL<<63; return 1; fail1: - release_perfctr_nmi(nmi_perfctr_msr); + release_perfctr_nmi(perfctr_msr); fail: return 0; } +static void stop_k7_watchdog(void) +{ + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + wrmsr(wd->evntsel_msr, 0, 0); + + release_evntsel_nmi(wd->evntsel_msr); + release_perfctr_nmi(wd->perfctr_msr); +} + +#define P6_EVNTSEL0_ENABLE (1 << 22) +#define P6_EVNTSEL_INT (1 << 20) +#define P6_EVNTSEL_OS (1 << 17) +#define P6_EVNTSEL_USR (1 << 16) +#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79 +#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED + static int setup_p6_watchdog(void) { + unsigned int perfctr_msr, evntsel_msr; unsigned int evntsel; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - nmi_perfctr_msr = MSR_P6_PERFCTR0; - - if (!reserve_perfctr_nmi(nmi_perfctr_msr)) + perfctr_msr = MSR_P6_PERFCTR0; + evntsel_msr = MSR_P6_EVNTSEL0; + if (!reserve_perfctr_nmi(perfctr_msr)) goto fail; - if (!reserve_evntsel_nmi(MSR_P6_EVNTSEL0)) + if (!reserve_evntsel_nmi(evntsel_msr)) goto fail1; + wrmsrl(perfctr_msr, 0UL); + evntsel = P6_EVNTSEL_INT | P6_EVNTSEL_OS | P6_EVNTSEL_USR | P6_NMI_EVENT; - wrmsr(MSR_P6_EVNTSEL0, evntsel, 0); - write_watchdog_counter("P6_PERFCTR0"); + /* setup the timer */ + wrmsr(evntsel_msr, evntsel, 0); + write_watchdog_counter(perfctr_msr, "P6_PERFCTR0"); apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= P6_EVNTSEL0_ENABLE; - wrmsr(MSR_P6_EVNTSEL0, evntsel, 0); + wrmsr(evntsel_msr, evntsel, 0); + + wd->perfctr_msr = perfctr_msr; + wd->evntsel_msr = evntsel_msr; + wd->cccr_msr = 0; //unused + wd->check_bit = 1ULL<<39; return 1; fail1: - release_perfctr_nmi(nmi_perfctr_msr); + release_perfctr_nmi(perfctr_msr); fail: return 0; } +static void stop_p6_watchdog(void) +{ + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + wrmsr(wd->evntsel_msr, 0, 0); + + release_evntsel_nmi(wd->evntsel_msr); + release_perfctr_nmi(wd->perfctr_msr); +} + +/* Note that these events don't tick when the CPU idles. This means + the frequency varies with CPU load. */ + +#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) +#define P4_ESCR_EVENT_SELECT(N) ((N)<<25) +#define P4_ESCR_OS (1<<3) +#define P4_ESCR_USR (1<<2) +#define P4_CCCR_OVF_PMI0 (1<<26) +#define P4_CCCR_OVF_PMI1 (1<<27) +#define P4_CCCR_THRESHOLD(N) ((N)<<20) +#define P4_CCCR_COMPLEMENT (1<<19) +#define P4_CCCR_COMPARE (1<<18) +#define P4_CCCR_REQUIRED (3<<16) +#define P4_CCCR_ESCR_SELECT(N) ((N)<<13) +#define P4_CCCR_ENABLE (1<<12) +#define P4_CCCR_OVF (1<<31) +/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter + CRU_ESCR0 (with any non-null event selector) through a complemented + max threshold. [IA32-Vol3, Section 14.9.9] */ + static int setup_p4_watchdog(void) { + unsigned int perfctr_msr, evntsel_msr, cccr_msr; + unsigned int evntsel, cccr_val; unsigned int misc_enable, dummy; + unsigned int ht_num; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - rdmsr(MSR_P4_MISC_ENABLE, misc_enable, dummy); + rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy); if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL)) return 0; - nmi_perfctr_msr = MSR_P4_IQ_COUNTER0; - nmi_p4_cccr_val = P4_NMI_IQ_CCCR0; #ifdef CONFIG_SMP - if (smp_num_siblings == 2) - nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1; + /* detect which hyperthread we are on */ + if (smp_num_siblings == 2) { + unsigned int ebx, apicid; + + ebx = cpuid_ebx(1); + apicid = (ebx >> 24) & 0xff; + ht_num = apicid & 1; + } else #endif + ht_num = 0; + + /* performance counters are shared resources + * assign each hyperthread its own set + * (re-use the ESCR0 register, seems safe + * and keeps the cccr_val the same) + */ + if (!ht_num) { + /* logical cpu 0 */ + perfctr_msr = MSR_P4_IQ_PERFCTR0; + evntsel_msr = MSR_P4_CRU_ESCR0; + cccr_msr = MSR_P4_IQ_CCCR0; + cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4); + } else { + /* logical cpu 1 */ + perfctr_msr = MSR_P4_IQ_PERFCTR1; + evntsel_msr = MSR_P4_CRU_ESCR0; + cccr_msr = MSR_P4_IQ_CCCR1; + cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4); + } - if (!reserve_perfctr_nmi(nmi_perfctr_msr)) + if (!reserve_perfctr_nmi(perfctr_msr)) goto fail; - if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0)) + if (!reserve_evntsel_nmi(evntsel_msr)) goto fail1; - wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0); - wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0); - write_watchdog_counter("P4_IQ_COUNTER0"); + evntsel = P4_ESCR_EVENT_SELECT(0x3F) + | P4_ESCR_OS + | P4_ESCR_USR; + + cccr_val |= P4_CCCR_THRESHOLD(15) + | P4_CCCR_COMPLEMENT + | P4_CCCR_COMPARE + | P4_CCCR_REQUIRED; + + wrmsr(evntsel_msr, evntsel, 0); + wrmsr(cccr_msr, cccr_val, 0); + write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0"); apic_write(APIC_LVTPC, APIC_DM_NMI); - wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); + cccr_val |= P4_CCCR_ENABLE; + wrmsr(cccr_msr, cccr_val, 0); + wd->perfctr_msr = perfctr_msr; + wd->evntsel_msr = evntsel_msr; + wd->cccr_msr = cccr_msr; + wd->check_bit = 1ULL<<39; return 1; fail1: - release_perfctr_nmi(nmi_perfctr_msr); + release_perfctr_nmi(perfctr_msr); fail: return 0; } -void setup_apic_nmi_watchdog (void) +static void stop_p4_watchdog(void) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15) - return; - if (!setup_k7_watchdog()) - return; - break; - case X86_VENDOR_INTEL: - switch (boot_cpu_data.x86) { - case 6: - if (boot_cpu_data.x86_model > 0xd) - return; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + wrmsr(wd->cccr_msr, 0, 0); + wrmsr(wd->evntsel_msr, 0, 0); - if(!setup_p6_watchdog()) + release_evntsel_nmi(wd->evntsel_msr); + release_perfctr_nmi(wd->perfctr_msr); +} + +void setup_apic_nmi_watchdog (void *unused) +{ + /* only support LOCAL and IO APICs for now */ + if ((nmi_watchdog != NMI_LOCAL_APIC) && + (nmi_watchdog != NMI_IO_APIC)) + return; + + if (nmi_watchdog == NMI_LOCAL_APIC) { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15) return; - break; - case 15: - if (boot_cpu_data.x86_model > 0x4) + if (!setup_k7_watchdog()) return; + break; + case X86_VENDOR_INTEL: + switch (boot_cpu_data.x86) { + case 6: + if (boot_cpu_data.x86_model > 0xd) + return; + + if (!setup_p6_watchdog()) + return; + break; + case 15: + if (boot_cpu_data.x86_model > 0x4) + return; - if (!setup_p4_watchdog()) + if (!setup_p4_watchdog()) + return; + break; + default: return; + } + break; + default: + return; + } + } + __get_cpu_var(nmi_watchdog_ctlblk.enabled) = 1; + atomic_inc(&nmi_active); +} + +static void stop_apic_nmi_watchdog(void *unused) +{ + /* only support LOCAL and IO APICs for now */ + if ((nmi_watchdog != NMI_LOCAL_APIC) && + (nmi_watchdog != NMI_IO_APIC)) + return; + + if (nmi_watchdog == NMI_LOCAL_APIC) { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + stop_k7_watchdog(); + break; + case X86_VENDOR_INTEL: + switch (boot_cpu_data.x86) { + case 6: + if (boot_cpu_data.x86_model > 0xd) + break; + stop_p6_watchdog(); + break; + case 15: + if (boot_cpu_data.x86_model > 0x4) + break; + stop_p4_watchdog(); + break; + } break; default: return; } - break; - default: - return; } - lapic_nmi_owner = LAPIC_NMI_WATCHDOG; - nmi_active = 1; + __get_cpu_var(nmi_watchdog_ctlblk.enabled) = 0; + atomic_dec(&nmi_active); } /* @@ -635,7 +781,7 @@ EXPORT_SYMBOL(touch_nmi_watchdog); extern void die_nmi(struct pt_regs *, const char *msg); -void nmi_watchdog_tick (struct pt_regs * regs) +void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) { /* @@ -644,11 +790,21 @@ void nmi_watchdog_tick (struct pt_regs * regs) * smp_processor_id(). */ unsigned int sum; + int touched = 0; int cpu = smp_processor_id(); + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + u64 dummy; + + /* check for other users first */ + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) { + touched = 1; + } sum = per_cpu(irq_stat, cpu).apic_timer_irqs; - if (last_irq_sums[cpu] == sum) { + /* if the apic timer isn't firing, this cpu isn't doing much */ + if (!touched && last_irq_sums[cpu] == sum) { /* * Ayiee, looks like this CPU is stuck ... * wait a few IRQs (5 seconds) before doing the oops ... @@ -663,26 +819,41 @@ void nmi_watchdog_tick (struct pt_regs * regs) last_irq_sums[cpu] = sum; alert_counter[cpu] = 0; } - if (nmi_perfctr_msr) { - if (nmi_perfctr_msr == MSR_P4_IQ_COUNTER0) { - /* - * P4 quirks: - * - An overflown perfctr will assert its interrupt - * until the OVF flag in its CCCR is cleared. - * - LVTPC is masked on interrupt and must be - * unmasked by the LVTPC handler. - */ - wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); - apic_write(APIC_LVTPC, APIC_DM_NMI); - } - else if (nmi_perfctr_msr == MSR_P6_PERFCTR0) { - /* Only P6 based Pentium M need to re-unmask - * the apic vector but it doesn't hurt - * other P6 variant */ - apic_write(APIC_LVTPC, APIC_DM_NMI); + /* see if the nmi watchdog went off */ + if (wd->enabled) { + if (nmi_watchdog == NMI_LOCAL_APIC) { + rdmsrl(wd->perfctr_msr, dummy); + if (dummy & wd->check_bit){ + /* this wasn't a watchdog timer interrupt */ + goto done; + } + + /* only Intel P4 uses the cccr msr */ + if (wd->cccr_msr != 0) { + /* + * P4 quirks: + * - An overflown perfctr will assert its interrupt + * until the OVF flag in its CCCR is cleared. + * - LVTPC is masked on interrupt and must be + * unmasked by the LVTPC handler. + */ + rdmsrl(wd->cccr_msr, dummy); + dummy &= ~P4_CCCR_OVF; + wrmsrl(wd->cccr_msr, dummy); + apic_write(APIC_LVTPC, APIC_DM_NMI); + } + else if (wd->perfctr_msr == MSR_P6_PERFCTR0) { + /* Only P6 based Pentium M need to re-unmask + * the apic vector but it doesn't hurt + * other P6 variant */ + apic_write(APIC_LVTPC, APIC_DM_NMI); + } + /* start the cycle over again */ + write_watchdog_counter(wd->perfctr_msr, NULL); } - write_watchdog_counter(NULL); } +done: + return; } #ifdef CONFIG_SYSCTL diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 7e9edafffd8a..3a07b2677e2a 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -724,7 +724,7 @@ static void default_do_nmi(struct pt_regs * regs) * so it must be the NMI watchdog. */ if (nmi_watchdog) { - nmi_watchdog_tick(regs); + nmi_watchdog_tick(regs, reason); return; } #endif diff --git a/arch/i386/oprofile/nmi_timer_int.c b/arch/i386/oprofile/nmi_timer_int.c index 930a1127bb30..a33a73bb502d 100644 --- a/arch/i386/oprofile/nmi_timer_int.c +++ b/arch/i386/oprofile/nmi_timer_int.c @@ -42,9 +42,7 @@ static void timer_stop(void) int __init op_nmi_timer_init(struct oprofile_operations * ops) { - extern int nmi_active; - - if (nmi_active <= 0) + if (atomic_read(&nmi_active) <= 0) return -ENODEV; ops->start = timer_start; diff --git a/include/asm-i386/nmi.h b/include/asm-i386/nmi.h index 27fc9e6f630e..4cda6801ecb8 100644 --- a/include/asm-i386/nmi.h +++ b/include/asm-i386/nmi.h @@ -32,13 +32,14 @@ extern void release_perfctr_nmi(unsigned int); extern int reserve_evntsel_nmi(unsigned int); extern void release_evntsel_nmi(unsigned int); -extern void setup_apic_nmi_watchdog (void); +extern void setup_apic_nmi_watchdog (void *); extern int reserve_lapic_nmi(void); extern void release_lapic_nmi(void); extern void disable_timer_nmi_watchdog(void); extern void enable_timer_nmi_watchdog(void); -extern void nmi_watchdog_tick (struct pt_regs * regs); +extern void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason); +extern atomic_t nmi_active; extern unsigned int nmi_watchdog; #define NMI_DEFAULT -1 #define NMI_NONE 0 -- cgit v1.2.1 From 3adbbcce9a49b900d4cc118cdccfdefa78bf1afb Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Tue, 26 Sep 2006 10:52:26 +0200 Subject: [PATCH] x86: Cleanup NMI interrupt path This patch cleans up the NMI interrupt path. Instead of being gated by if the 'nmi callback' is set, the interrupt handler now calls everyone who is registered on the die_chain and additionally checks the nmi watchdog, reseting it if enabled. This allows more subsystems to hook into the NMI if they need to (without being block by set_nmi_callback). Signed-off-by: Don Zickus Signed-off-by: Andi Kleen --- arch/i386/kernel/nmi.c | 16 +++++++++++++--- arch/i386/kernel/traps.c | 24 +++++++++++------------- arch/x86_64/kernel/nmi.c | 26 +++++++++++++++++++------- arch/x86_64/kernel/traps.c | 8 ++++---- include/asm-i386/nmi.h | 2 +- include/asm-x86_64/nmi.h | 10 +++++++++- 6 files changed, 57 insertions(+), 29 deletions(-) diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index d88004343034..bd96ea4f2942 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -781,7 +781,7 @@ EXPORT_SYMBOL(touch_nmi_watchdog); extern void die_nmi(struct pt_regs *, const char *msg); -void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) +int nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) { /* @@ -794,10 +794,12 @@ void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) int cpu = smp_processor_id(); struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); u64 dummy; + int rc=0; /* check for other users first */ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) { + rc = 1; touched = 1; } @@ -850,10 +852,18 @@ void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) } /* start the cycle over again */ write_watchdog_counter(wd->perfctr_msr, NULL); - } + rc = 1; + } else if (nmi_watchdog == NMI_IO_APIC) { + /* don't know how to accurately check for this. + * just assume it was a watchdog timer interrupt + * This matches the old behaviour. + */ + rc = 1; + } else + printk(KERN_WARNING "Unknown enabled NMI hardware?!\n"); } done: - return; + return rc; } #ifdef CONFIG_SYSCTL diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 3a07b2677e2a..282f0bd40dfd 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -706,6 +706,13 @@ void die_nmi (struct pt_regs *regs, const char *msg) do_exit(SIGSEGV); } +static int dummy_nmi_callback(struct pt_regs * regs, int cpu) +{ + return 0; +} + +static nmi_callback_t nmi_callback = dummy_nmi_callback; + static void default_do_nmi(struct pt_regs * regs) { unsigned char reason = 0; @@ -723,12 +730,11 @@ static void default_do_nmi(struct pt_regs * regs) * Ok, so this is none of the documented NMI sources, * so it must be the NMI watchdog. */ - if (nmi_watchdog) { - nmi_watchdog_tick(regs, reason); + if (nmi_watchdog_tick(regs, reason)) return; - } #endif - unknown_nmi_error(reason, regs); + if (!rcu_dereference(nmi_callback)(regs, smp_processor_id())) + unknown_nmi_error(reason, regs); return; } if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) @@ -744,13 +750,6 @@ static void default_do_nmi(struct pt_regs * regs) reassert_nmi(); } -static int dummy_nmi_callback(struct pt_regs * regs, int cpu) -{ - return 0; -} - -static nmi_callback_t nmi_callback = dummy_nmi_callback; - fastcall void do_nmi(struct pt_regs * regs, long error_code) { int cpu; @@ -761,8 +760,7 @@ fastcall void do_nmi(struct pt_regs * regs, long error_code) ++nmi_count(cpu); - if (!rcu_dereference(nmi_callback)(regs, cpu)) - default_do_nmi(regs); + default_do_nmi(regs); nmi_exit(); } diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index d42374a952d7..f6b881b23a70 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -682,16 +682,18 @@ void touch_nmi_watchdog (void) touch_softlockup_watchdog(); } -void __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) +int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) { int sum; int touched = 0; struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); u64 dummy; + int rc=0; /* check for other users first */ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) { + rc = 1; touched = 1; } @@ -746,10 +748,18 @@ void __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) } /* start the cycle over again */ wrmsrl(wd->perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); - } + rc = 1; + } else if (nmi_watchdog == NMI_IO_APIC) { + /* don't know how to accurately check for this. + * just assume it was a watchdog timer interrupt + * This matches the old behaviour. + */ + rc = 1; + } else + printk(KERN_WARNING "Unknown enabled NMI hardware?!\n"); } done: - return; + return rc; } static __kprobes int dummy_nmi_callback(struct pt_regs * regs, int cpu) @@ -761,15 +771,17 @@ static nmi_callback_t nmi_callback = dummy_nmi_callback; asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code) { - int cpu = safe_smp_processor_id(); - nmi_enter(); add_pda(__nmi_count,1); - if (!rcu_dereference(nmi_callback)(regs, cpu)) - default_do_nmi(regs); + default_do_nmi(regs); nmi_exit(); } +int do_nmi_callback(struct pt_regs * regs, int cpu) +{ + return rcu_dereference(nmi_callback)(regs, cpu); +} + void set_nmi_callback(nmi_callback_t callback) { vmalloc_sync_all(); diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index b1249774d1e8..42bc070fdf11 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -781,12 +781,12 @@ asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs) * Ok, so this is none of the documented NMI sources, * so it must be the NMI watchdog. */ - if (nmi_watchdog > 0) { - nmi_watchdog_tick(regs,reason); + if (nmi_watchdog_tick(regs,reason)) return; - } + if (!do_nmi_callback(regs,cpu)) #endif - unknown_nmi_error(reason, regs); + unknown_nmi_error(reason, regs); + return; } if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) diff --git a/include/asm-i386/nmi.h b/include/asm-i386/nmi.h index 4cda6801ecb8..da0e0b4e9139 100644 --- a/include/asm-i386/nmi.h +++ b/include/asm-i386/nmi.h @@ -37,7 +37,7 @@ extern int reserve_lapic_nmi(void); extern void release_lapic_nmi(void); extern void disable_timer_nmi_watchdog(void); extern void enable_timer_nmi_watchdog(void); -extern void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason); +extern int nmi_watchdog_tick (struct pt_regs * regs, unsigned reason); extern atomic_t nmi_active; extern unsigned int nmi_watchdog; diff --git a/include/asm-x86_64/nmi.h b/include/asm-x86_64/nmi.h index 5918136fd853..8f02a2a416e6 100644 --- a/include/asm-x86_64/nmi.h +++ b/include/asm-x86_64/nmi.h @@ -26,6 +26,14 @@ void set_nmi_callback(nmi_callback_t callback); */ void unset_nmi_callback(void); +/** + * do_nmi_callback + * + * Check to see if a callback exists and execute it. Return 1 + * if the handler exists and was handled successfully. + */ +int do_nmi_callback(struct pt_regs *regs, int cpu); + #ifdef CONFIG_PM /** Replace the PM callback routine for NMI. */ @@ -68,7 +76,7 @@ extern int reserve_lapic_nmi(void); extern void release_lapic_nmi(void); extern void disable_timer_nmi_watchdog(void); extern void enable_timer_nmi_watchdog(void); -extern void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason); +extern int nmi_watchdog_tick (struct pt_regs * regs, unsigned reason); extern void nmi_watchdog_default(void); extern int setup_nmi_watchdog(char *); -- cgit v1.2.1 From 1d001df19d5323e642ba8ac821c713675ebccd82 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:26 +0200 Subject: [PATCH] Add TIF_RESTORE_SIGMASK We need TIF_RESTORE_SIGMASK in order to support ppoll() and pselect() system calls. This patch originally came from Andi, and was based heavily on David Howells' implementation of same on i386. I fixed a typo which was causing do_signal() to use the wrong signal mask. Signed-off-by: David Woodhouse Signed-off-by: Andi Kleen --- arch/x86_64/ia32/ia32_signal.c | 28 ++++++-------- arch/x86_64/kernel/signal.c | 82 +++++++++++++++++----------------------- include/asm-x86_64/signal.h | 4 -- include/asm-x86_64/thread_info.h | 2 + include/asm-x86_64/unistd.h | 1 + 5 files changed, 49 insertions(+), 68 deletions(-) diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c index 25e5ca22204c..549de439fb2d 100644 --- a/arch/x86_64/ia32/ia32_signal.c +++ b/arch/x86_64/ia32/ia32_signal.c @@ -113,25 +113,19 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) } asmlinkage long -sys32_sigsuspend(int history0, int history1, old_sigset_t mask, - struct pt_regs *regs) +sys32_sigsuspend(int history0, int history1, old_sigset_t mask) { - sigset_t saveset; - mask &= _BLOCKABLE; spin_lock_irq(¤t->sighand->siglock); - saveset = current->blocked; + current->saved_sigmask = current->blocked; siginitset(¤t->blocked, mask); recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - regs->rax = -EINTR; - while (1) { - current->state = TASK_INTERRUPTIBLE; - schedule(); - if (do_signal(regs, &saveset)) - return -EINTR; - } + current->state = TASK_INTERRUPTIBLE; + schedule(); + set_thread_flag(TIF_RESTORE_SIGMASK); + return -ERESTARTNOHAND; } asmlinkage long @@ -508,11 +502,11 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, current->comm, current->pid, frame, regs->rip, frame->pretcode); #endif - return 1; + return 0; give_sigsegv: force_sigsegv(sig, current); - return 0; + return -EFAULT; } int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, @@ -595,7 +589,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, regs->ss = __USER32_DS; set_fs(USER_DS); - regs->eflags &= ~TF_MASK; + regs->eflags &= ~TF_MASK; if (test_thread_flag(TIF_SINGLESTEP)) ptrace_notify(SIGTRAP); @@ -604,9 +598,9 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, current->comm, current->pid, frame, regs->rip, frame->pretcode); #endif - return 1; + return 0; give_sigsegv: force_sigsegv(sig, current); - return 0; + return -EFAULT; } diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c index 28161170fb0a..7f58bc9a056d 100644 --- a/arch/x86_64/kernel/signal.c +++ b/arch/x86_64/kernel/signal.c @@ -37,37 +37,6 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, int ia32_setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, struct pt_regs * regs); -asmlinkage long -sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize, struct pt_regs *regs) -{ - sigset_t saveset, newset; - - /* XXX: Don't preclude handling different sized sigset_t's. */ - if (sigsetsize != sizeof(sigset_t)) - return -EINVAL; - - if (copy_from_user(&newset, unewset, sizeof(newset))) - return -EFAULT; - sigdelsetmask(&newset, ~_BLOCKABLE); - - spin_lock_irq(¤t->sighand->siglock); - saveset = current->blocked; - current->blocked = newset; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); -#ifdef DEBUG_SIG - printk("rt_sigsuspend savset(%lx) newset(%lx) regs(%p) rip(%lx)\n", - saveset, newset, regs, regs->rip); -#endif - regs->rax = -EINTR; - while (1) { - current->state = TASK_INTERRUPTIBLE; - schedule(); - if (do_signal(regs, &saveset)) - return -EINTR; - } -} - asmlinkage long sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, struct pt_regs *regs) @@ -341,11 +310,11 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, current->comm, current->pid, frame, regs->rip, frame->pretcode); #endif - return 1; + return 0; give_sigsegv: force_sigsegv(sig, current); - return 0; + return -EFAULT; } /* @@ -408,7 +377,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, #endif ret = setup_rt_frame(sig, ka, info, oldset, regs); - if (ret) { + if (ret == 0) { spin_lock_irq(¤t->sighand->siglock); sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); if (!(ka->sa.sa_flags & SA_NODEFER)) @@ -425,11 +394,12 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, * want to handle. Thus you cannot kill init even with a SIGKILL even by * mistake. */ -int do_signal(struct pt_regs *regs, sigset_t *oldset) +static void do_signal(struct pt_regs *regs) { struct k_sigaction ka; siginfo_t info; int signr; + sigset_t *oldset; /* * We want the common case to go fast, which @@ -438,9 +408,11 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset) * if so. */ if (!user_mode(regs)) - return 1; + return; - if (!oldset) + if (test_thread_flag(TIF_RESTORE_SIGMASK)) + oldset = ¤t->saved_sigmask; + else oldset = ¤t->blocked; signr = get_signal_to_deliver(&info, &ka, regs, NULL); @@ -454,30 +426,46 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset) set_debugreg(current->thread.debugreg7, 7); /* Whee! Actually deliver the signal. */ - return handle_signal(signr, &info, &ka, oldset, regs); + if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { + /* a signal was successfully delivered; the saved + * sigmask will have been stored in the signal frame, + * and will be restored by sigreturn, so we can simply + * clear the TIF_RESTORE_SIGMASK flag */ + clear_thread_flag(TIF_RESTORE_SIGMASK); + } + return; } /* Did we come from a system call? */ if ((long)regs->orig_rax >= 0) { /* Restart the system call - no handlers present */ long res = regs->rax; - if (res == -ERESTARTNOHAND || - res == -ERESTARTSYS || - res == -ERESTARTNOINTR) { + switch (res) { + case -ERESTARTNOHAND: + case -ERESTARTSYS: + case -ERESTARTNOINTR: regs->rax = regs->orig_rax; regs->rip -= 2; - } - if (regs->rax == (unsigned long)-ERESTART_RESTARTBLOCK) { + break; + case -ERESTART_RESTARTBLOCK: regs->rax = test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall; regs->rip -= 2; + break; } } - return 0; + + /* if there's no signal to deliver, we just put the saved sigmask + back. */ + if (test_thread_flag(TIF_RESTORE_SIGMASK)) { + clear_thread_flag(TIF_RESTORE_SIGMASK); + sigprocmask(SIG_SETMASK, ¤t->saved_sigmask, NULL); + } } -void do_notify_resume(struct pt_regs *regs, sigset_t *oldset, __u32 thread_info_flags) +void +do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { #ifdef DEBUG_SIG printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%lx pending:%lx\n", @@ -491,8 +479,8 @@ void do_notify_resume(struct pt_regs *regs, sigset_t *oldset, __u32 thread_info_ } /* deal with pending signal delivery */ - if (thread_info_flags & _TIF_SIGPENDING) - do_signal(regs,oldset); + if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK)) + do_signal(regs); } void signal_fault(struct pt_regs *regs, void __user *frame, char *where) diff --git a/include/asm-x86_64/signal.h b/include/asm-x86_64/signal.h index 3ede2a61973a..4581f978b299 100644 --- a/include/asm-x86_64/signal.h +++ b/include/asm-x86_64/signal.h @@ -24,10 +24,6 @@ typedef struct { } sigset_t; -struct pt_regs; -asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset); - - #else /* Here we must cater to libcs that poke about in kernel headers. */ diff --git a/include/asm-x86_64/thread_info.h b/include/asm-x86_64/thread_info.h index 2029b00351f3..790c512a4369 100644 --- a/include/asm-x86_64/thread_info.h +++ b/include/asm-x86_64/thread_info.h @@ -114,6 +114,7 @@ static inline struct thread_info *stack_thread_info(void) #define TIF_IRET 5 /* force IRET */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ +#define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal */ /* 16 free */ #define TIF_IA32 17 /* 32bit process */ #define TIF_FORK 18 /* ret_from_fork */ @@ -128,6 +129,7 @@ static inline struct thread_info *stack_thread_info(void) #define _TIF_IRET (1< Date: Tue, 26 Sep 2006 10:52:27 +0200 Subject: [PATCH] Add ppoll/pselect syscalls Needed TIF_RESTORE_SIGMASK first Signed-off-by: Andi Kleen --- arch/x86_64/ia32/ia32entry.S | 4 ++-- include/asm-x86_64/unistd.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 5d4a7d125ed0..30ed0f6f4a2b 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -703,8 +703,8 @@ ia32_sys_call_table: .quad sys_readlinkat /* 305 */ .quad sys_fchmodat .quad sys_faccessat - .quad quiet_ni_syscall /* pselect6 for now */ - .quad quiet_ni_syscall /* ppoll for now */ + .quad compat_sys_pselect6 + .quad compat_sys_ppoll .quad sys_unshare /* 310 */ .quad compat_sys_set_robust_list .quad compat_sys_get_robust_list diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h index f266de294003..eeb98c168e98 100644 --- a/include/asm-x86_64/unistd.h +++ b/include/asm-x86_64/unistd.h @@ -600,9 +600,9 @@ __SYSCALL(__NR_fchmodat, sys_fchmodat) #define __NR_faccessat 269 __SYSCALL(__NR_faccessat, sys_faccessat) #define __NR_pselect6 270 -__SYSCALL(__NR_pselect6, sys_ni_syscall) /* for now */ +__SYSCALL(__NR_pselect6, sys_pselect6) #define __NR_ppoll 271 -__SYSCALL(__NR_ppoll, sys_ni_syscall) /* for now */ +__SYSCALL(__NR_ppoll, sys_ppoll) #define __NR_unshare 272 __SYSCALL(__NR_unshare, sys_unshare) #define __NR_set_robust_list 273 -- cgit v1.2.1 From 2fbe7b25c8edaf2d10e6c1a4cc9f8afe714c4764 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Tue, 26 Sep 2006 10:52:27 +0200 Subject: [PATCH] i386/x86-64: Remove un/set_nmi_callback and reserve/release_lapic_nmi functions Removes the un/set_nmi_callback and reserve/release_lapic_nmi functions as they are no longer needed. The various subsystems are modified to register with the die_notifier instead. Also includes compile fixes by Andrew Morton. Signed-off-by: Don Zickus Signed-off-by: Andi Kleen --- arch/i386/kernel/crash.c | 20 +++++++- arch/i386/kernel/nmi.c | 85 ++++--------------------------- arch/i386/kernel/traps.c | 23 +-------- arch/i386/oprofile/nmi_int.c | 47 ++++++++++------- arch/i386/oprofile/nmi_timer_int.c | 33 +++++++++--- arch/x86_64/kernel/crash.c | 20 +++++++- arch/x86_64/kernel/nmi.c | 102 +++---------------------------------- include/asm-i386/nmi.h | 21 ++------ include/asm-x86_64/nmi.h | 21 -------- kernel/sysctl.c | 4 +- 10 files changed, 116 insertions(+), 260 deletions(-) diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c index 5b96f038367f..736c76d6b31d 100644 --- a/arch/i386/kernel/crash.c +++ b/arch/i386/kernel/crash.c @@ -22,6 +22,8 @@ #include #include #include +#include + #include @@ -93,9 +95,18 @@ static void crash_save_self(struct pt_regs *regs) #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) static atomic_t waiting_for_crash_ipi; -static int crash_nmi_callback(struct pt_regs *regs, int cpu) +static int crash_nmi_callback(struct notifier_block *self, + unsigned long val, void *data) { + struct pt_regs *regs; struct pt_regs fixed_regs; + int cpu; + + if (val != DIE_NMI) + return NOTIFY_OK; + + regs = ((struct die_args *)data)->regs; + cpu = raw_smp_processor_id(); /* Don't do anything if this handler is invoked on crashing cpu. * Otherwise, system will completely hang. Crashing cpu can get @@ -125,13 +136,18 @@ static void smp_send_nmi_allbutself(void) send_IPI_allbutself(NMI_VECTOR); } +static struct notifier_block crash_nmi_nb = { + .notifier_call = crash_nmi_callback, +}; + static void nmi_shootdown_cpus(void) { unsigned long msecs; atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); /* Would it be better to replace the trap vector here? */ - set_nmi_callback(crash_nmi_callback); + if (register_die_notifier(&crash_nmi_nb)) + return; /* return what? */ /* Ensure the new callback function is set before sending * out the NMI */ diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index bd96ea4f2942..acd3fdea2a21 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -42,20 +42,6 @@ static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[3]); */ #define NMI_MAX_COUNTER_BITS 66 -/* - * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: - * - it may be reserved by some other driver, or not - * - when not reserved by some other driver, it may be used for - * the NMI watchdog, or not - * - * This is maintained separately from nmi_active because the NMI - * watchdog may also be driven from the I/O APIC timer. - */ -static DEFINE_SPINLOCK(lapic_nmi_owner_lock); -static unsigned int lapic_nmi_owner; -#define LAPIC_NMI_WATCHDOG (1<<0) -#define LAPIC_NMI_RESERVED (1<<1) - /* nmi_active: * >0: the lapic NMI watchdog is active, but can be disabled * <0: the lapic NMI watchdog has not been set up, and cannot @@ -325,33 +311,6 @@ static void enable_lapic_nmi_watchdog(void) touch_nmi_watchdog(); } -int reserve_lapic_nmi(void) -{ - unsigned int old_owner; - - spin_lock(&lapic_nmi_owner_lock); - old_owner = lapic_nmi_owner; - lapic_nmi_owner |= LAPIC_NMI_RESERVED; - spin_unlock(&lapic_nmi_owner_lock); - if (old_owner & LAPIC_NMI_RESERVED) - return -EBUSY; - if (old_owner & LAPIC_NMI_WATCHDOG) - disable_lapic_nmi_watchdog(); - return 0; -} - -void release_lapic_nmi(void) -{ - unsigned int new_owner; - - spin_lock(&lapic_nmi_owner_lock); - new_owner = lapic_nmi_owner & ~LAPIC_NMI_RESERVED; - lapic_nmi_owner = new_owner; - spin_unlock(&lapic_nmi_owner_lock); - if (new_owner & LAPIC_NMI_WATCHDOG) - enable_lapic_nmi_watchdog(); -} - void disable_timer_nmi_watchdog(void) { BUG_ON(nmi_watchdog != NMI_IO_APIC); @@ -866,6 +825,15 @@ done: return rc; } +int do_nmi_callback(struct pt_regs * regs, int cpu) +{ +#ifdef CONFIG_SYSCTL + if (unknown_nmi_panic) + return unknown_nmi_panic_callback(regs, cpu); +#endif + return 0; +} + #ifdef CONFIG_SYSCTL static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) @@ -873,37 +841,8 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) unsigned char reason = get_nmi_reason(); char buf[64]; - if (!(reason & 0xc0)) { - sprintf(buf, "NMI received for unknown reason %02x\n", reason); - die_nmi(regs, buf); - } - return 0; -} - -/* - * proc handler for /proc/sys/kernel/unknown_nmi_panic - */ -int proc_unknown_nmi_panic(ctl_table *table, int write, struct file *file, - void __user *buffer, size_t *length, loff_t *ppos) -{ - int old_state; - - old_state = unknown_nmi_panic; - proc_dointvec(table, write, file, buffer, length, ppos); - if (!!old_state == !!unknown_nmi_panic) - return 0; - - if (unknown_nmi_panic) { - if (reserve_lapic_nmi() < 0) { - unknown_nmi_panic = 0; - return -EBUSY; - } else { - set_nmi_callback(unknown_nmi_panic_callback); - } - } else { - release_lapic_nmi(); - unset_nmi_callback(); - } + sprintf(buf, "NMI received for unknown reason %02x\n", reason); + die_nmi(regs, buf); return 0; } @@ -917,7 +856,5 @@ EXPORT_SYMBOL(reserve_perfctr_nmi); EXPORT_SYMBOL(release_perfctr_nmi); EXPORT_SYMBOL(reserve_evntsel_nmi); EXPORT_SYMBOL(release_evntsel_nmi); -EXPORT_SYMBOL(reserve_lapic_nmi); -EXPORT_SYMBOL(release_lapic_nmi); EXPORT_SYMBOL(disable_timer_nmi_watchdog); EXPORT_SYMBOL(enable_timer_nmi_watchdog); diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 282f0bd40dfd..7db664d0b25c 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -706,13 +706,6 @@ void die_nmi (struct pt_regs *regs, const char *msg) do_exit(SIGSEGV); } -static int dummy_nmi_callback(struct pt_regs * regs, int cpu) -{ - return 0; -} - -static nmi_callback_t nmi_callback = dummy_nmi_callback; - static void default_do_nmi(struct pt_regs * regs) { unsigned char reason = 0; @@ -732,9 +725,10 @@ static void default_do_nmi(struct pt_regs * regs) */ if (nmi_watchdog_tick(regs, reason)) return; + if (!do_nmi_callback(regs, smp_processor_id())) #endif - if (!rcu_dereference(nmi_callback)(regs, smp_processor_id())) unknown_nmi_error(reason, regs); + return; } if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) @@ -765,19 +759,6 @@ fastcall void do_nmi(struct pt_regs * regs, long error_code) nmi_exit(); } -void set_nmi_callback(nmi_callback_t callback) -{ - vmalloc_sync_all(); - rcu_assign_pointer(nmi_callback, callback); -} -EXPORT_SYMBOL_GPL(set_nmi_callback); - -void unset_nmi_callback(void) -{ - nmi_callback = dummy_nmi_callback; -} -EXPORT_SYMBOL_GPL(unset_nmi_callback); - #ifdef CONFIG_KPROBES fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code) { diff --git a/arch/i386/oprofile/nmi_int.c b/arch/i386/oprofile/nmi_int.c index 8710ca081b1e..b3610188bcf0 100644 --- a/arch/i386/oprofile/nmi_int.c +++ b/arch/i386/oprofile/nmi_int.c @@ -17,14 +17,15 @@ #include #include #include +#include #include "op_counter.h" #include "op_x86_model.h" - + static struct op_x86_model_spec const * model; static struct op_msrs cpu_msrs[NR_CPUS]; static unsigned long saved_lvtpc[NR_CPUS]; - + static int nmi_start(void); static void nmi_stop(void); @@ -82,13 +83,24 @@ static void exit_driverfs(void) #define exit_driverfs() do { } while (0) #endif /* CONFIG_PM */ - -static int nmi_callback(struct pt_regs * regs, int cpu) +int profile_exceptions_notify(struct notifier_block *self, + unsigned long val, void *data) { - return model->check_ctrs(regs, &cpu_msrs[cpu]); + struct die_args *args = (struct die_args *)data; + int ret = NOTIFY_DONE; + int cpu = smp_processor_id(); + + switch(val) { + case DIE_NMI: + if (model->check_ctrs(args->regs, &cpu_msrs[cpu])) + ret = NOTIFY_STOP; + break; + default: + break; + } + return ret; } - - + static void nmi_cpu_save_registers(struct op_msrs * msrs) { unsigned int const nr_ctrs = model->num_counters; @@ -174,27 +186,29 @@ static void nmi_cpu_setup(void * dummy) apic_write(APIC_LVTPC, APIC_DM_NMI); } +static struct notifier_block profile_exceptions_nb = { + .notifier_call = profile_exceptions_notify, + .next = NULL, + .priority = 0 +}; static int nmi_setup(void) { + int err=0; + if (!allocate_msrs()) return -ENOMEM; - /* We walk a thin line between law and rape here. - * We need to be careful to install our NMI handler - * without actually triggering any NMIs as this will - * break the core code horrifically. - */ - if (reserve_lapic_nmi() < 0) { + if ((err = register_die_notifier(&profile_exceptions_nb))){ free_msrs(); - return -EBUSY; + return err; } + /* We need to serialize save and setup for HT because the subset * of msrs are distinct for save and setup operations */ on_each_cpu(nmi_save_registers, NULL, 0, 1); on_each_cpu(nmi_cpu_setup, NULL, 0, 1); - set_nmi_callback(nmi_callback); nmi_enabled = 1; return 0; } @@ -250,8 +264,7 @@ static void nmi_shutdown(void) { nmi_enabled = 0; on_each_cpu(nmi_cpu_shutdown, NULL, 0, 1); - unset_nmi_callback(); - release_lapic_nmi(); + unregister_die_notifier(&profile_exceptions_nb); free_msrs(); } diff --git a/arch/i386/oprofile/nmi_timer_int.c b/arch/i386/oprofile/nmi_timer_int.c index a33a73bb502d..93ca48f804ac 100644 --- a/arch/i386/oprofile/nmi_timer_int.c +++ b/arch/i386/oprofile/nmi_timer_int.c @@ -17,32 +17,49 @@ #include #include #include +#include -static int nmi_timer_callback(struct pt_regs * regs, int cpu) +int profile_timer_exceptions_notify(struct notifier_block *self, + unsigned long val, void *data) { - oprofile_add_sample(regs, 0); - return 1; + struct die_args *args = (struct die_args *)data; + int ret = NOTIFY_DONE; + + switch(val) { + case DIE_NMI: + oprofile_add_sample(args->regs, 0); + ret = NOTIFY_STOP; + break; + default: + break; + } + return ret; } +static struct notifier_block profile_timer_exceptions_nb = { + .notifier_call = profile_timer_exceptions_notify, + .next = NULL, + .priority = 0 +}; + static int timer_start(void) { - disable_timer_nmi_watchdog(); - set_nmi_callback(nmi_timer_callback); + if (register_die_notifier(&profile_timer_exceptions_nb)) + return 1; return 0; } static void timer_stop(void) { - enable_timer_nmi_watchdog(); - unset_nmi_callback(); + unregister_die_notifier(&profile_timer_exceptions_nb); synchronize_sched(); /* Allow already-started NMIs to complete. */ } int __init op_nmi_timer_init(struct oprofile_operations * ops) { - if (atomic_read(&nmi_active) <= 0) + if ((nmi_watchdog != NMI_IO_APIC) || (atomic_read(&nmi_active) <= 0)) return -ENODEV; ops->start = timer_start; diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c index d8d5750d6106..44c8af65325e 100644 --- a/arch/x86_64/kernel/crash.c +++ b/arch/x86_64/kernel/crash.c @@ -23,6 +23,7 @@ #include #include #include +#include /* This keeps a track of which one is crashing cpu. */ static int crashing_cpu; @@ -95,8 +96,18 @@ static void crash_save_self(struct pt_regs *regs) #ifdef CONFIG_SMP static atomic_t waiting_for_crash_ipi; -static int crash_nmi_callback(struct pt_regs *regs, int cpu) +static int crash_nmi_callback(struct notifier_block *self, + unsigned long val, void *data) { + struct pt_regs *regs; + int cpu; + + if (val != DIE_NMI) + return NOTIFY_OK; + + regs = ((struct die_args *)data)->regs; + cpu = raw_smp_processor_id(); + /* * Don't do anything if this handler is invoked on crashing cpu. * Otherwise, system will completely hang. Crashing cpu can get @@ -127,12 +138,17 @@ static void smp_send_nmi_allbutself(void) * cpu hotplug shouldn't matter. */ +static struct notifier_block crash_nmi_nb = { + .notifier_call = crash_nmi_callback, +}; + static void nmi_shootdown_cpus(void) { unsigned long msecs; atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); - set_nmi_callback(crash_nmi_callback); + if (register_die_notifier(&crash_nmi_nb)) + return; /* return what? */ /* * Ensure the new callback function is set before sending diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index f6b881b23a70..9d175dcf3a2d 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -41,20 +41,6 @@ static DEFINE_PER_CPU(unsigned, evntsel_nmi_owner[2]); */ #define NMI_MAX_COUNTER_BITS 66 -/* - * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: - * - it may be reserved by some other driver, or not - * - when not reserved by some other driver, it may be used for - * the NMI watchdog, or not - * - * This is maintained separately from nmi_active because the NMI - * watchdog may also be driven from the I/O APIC timer. - */ -static DEFINE_SPINLOCK(lapic_nmi_owner_lock); -static unsigned int lapic_nmi_owner; -#define LAPIC_NMI_WATCHDOG (1<<0) -#define LAPIC_NMI_RESERVED (1<<1) - /* nmi_active: * >0: the lapic NMI watchdog is active, but can be disabled * <0: the lapic NMI watchdog has not been set up, and cannot @@ -321,33 +307,6 @@ static void enable_lapic_nmi_watchdog(void) touch_nmi_watchdog(); } -int reserve_lapic_nmi(void) -{ - unsigned int old_owner; - - spin_lock(&lapic_nmi_owner_lock); - old_owner = lapic_nmi_owner; - lapic_nmi_owner |= LAPIC_NMI_RESERVED; - spin_unlock(&lapic_nmi_owner_lock); - if (old_owner & LAPIC_NMI_RESERVED) - return -EBUSY; - if (old_owner & LAPIC_NMI_WATCHDOG) - disable_lapic_nmi_watchdog(); - return 0; -} - -void release_lapic_nmi(void) -{ - unsigned int new_owner; - - spin_lock(&lapic_nmi_owner_lock); - new_owner = lapic_nmi_owner & ~LAPIC_NMI_RESERVED; - lapic_nmi_owner = new_owner; - spin_unlock(&lapic_nmi_owner_lock); - if (new_owner & LAPIC_NMI_WATCHDOG) - enable_lapic_nmi_watchdog(); -} - void disable_timer_nmi_watchdog(void) { BUG_ON(nmi_watchdog != NMI_IO_APIC); @@ -762,13 +721,6 @@ done: return rc; } -static __kprobes int dummy_nmi_callback(struct pt_regs * regs, int cpu) -{ - return 0; -} - -static nmi_callback_t nmi_callback = dummy_nmi_callback; - asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code) { nmi_enter(); @@ -779,21 +731,12 @@ asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code) int do_nmi_callback(struct pt_regs * regs, int cpu) { - return rcu_dereference(nmi_callback)(regs, cpu); -} - -void set_nmi_callback(nmi_callback_t callback) -{ - vmalloc_sync_all(); - rcu_assign_pointer(nmi_callback, callback); -} -EXPORT_SYMBOL_GPL(set_nmi_callback); - -void unset_nmi_callback(void) -{ - nmi_callback = dummy_nmi_callback; +#ifdef CONFIG_SYSCTL + if (unknown_nmi_panic) + return unknown_nmi_panic_callback(regs, cpu); +#endif + return 0; } -EXPORT_SYMBOL_GPL(unset_nmi_callback); #ifdef CONFIG_SYSCTL @@ -802,37 +745,8 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) unsigned char reason = get_nmi_reason(); char buf[64]; - if (!(reason & 0xc0)) { - sprintf(buf, "NMI received for unknown reason %02x\n", reason); - die_nmi(buf,regs); - } - return 0; -} - -/* - * proc handler for /proc/sys/kernel/unknown_nmi_panic - */ -int proc_unknown_nmi_panic(struct ctl_table *table, int write, struct file *file, - void __user *buffer, size_t *length, loff_t *ppos) -{ - int old_state; - - old_state = unknown_nmi_panic; - proc_dointvec(table, write, file, buffer, length, ppos); - if (!!old_state == !!unknown_nmi_panic) - return 0; - - if (unknown_nmi_panic) { - if (reserve_lapic_nmi() < 0) { - unknown_nmi_panic = 0; - return -EBUSY; - } else { - set_nmi_callback(unknown_nmi_panic_callback); - } - } else { - release_lapic_nmi(); - unset_nmi_callback(); - } + sprintf(buf, "NMI received for unknown reason %02x\n", reason); + die_nmi(buf,regs); return 0; } @@ -846,8 +760,6 @@ EXPORT_SYMBOL(reserve_perfctr_nmi); EXPORT_SYMBOL(release_perfctr_nmi); EXPORT_SYMBOL(reserve_evntsel_nmi); EXPORT_SYMBOL(release_evntsel_nmi); -EXPORT_SYMBOL(reserve_lapic_nmi); -EXPORT_SYMBOL(release_lapic_nmi); EXPORT_SYMBOL(disable_timer_nmi_watchdog); EXPORT_SYMBOL(enable_timer_nmi_watchdog); EXPORT_SYMBOL(touch_nmi_watchdog); diff --git a/include/asm-i386/nmi.h b/include/asm-i386/nmi.h index da0e0b4e9139..34d6bf063b6e 100644 --- a/include/asm-i386/nmi.h +++ b/include/asm-i386/nmi.h @@ -6,24 +6,13 @@ #include -struct pt_regs; - -typedef int (*nmi_callback_t)(struct pt_regs * regs, int cpu); - -/** - * set_nmi_callback - * - * Set a handler for an NMI. Only one handler may be - * set. Return 1 if the NMI was handled. - */ -void set_nmi_callback(nmi_callback_t callback); - /** - * unset_nmi_callback + * do_nmi_callback * - * Remove the handler previously set. + * Check to see if a callback exists and execute it. Return 1 + * if the handler exists and was handled successfully. */ -void unset_nmi_callback(void); +int do_nmi_callback(struct pt_regs *regs, int cpu); extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); extern int avail_to_resrv_perfctr_nmi(unsigned int); @@ -33,8 +22,6 @@ extern int reserve_evntsel_nmi(unsigned int); extern void release_evntsel_nmi(unsigned int); extern void setup_apic_nmi_watchdog (void *); -extern int reserve_lapic_nmi(void); -extern void release_lapic_nmi(void); extern void disable_timer_nmi_watchdog(void); extern void enable_timer_nmi_watchdog(void); extern int nmi_watchdog_tick (struct pt_regs * regs, unsigned reason); diff --git a/include/asm-x86_64/nmi.h b/include/asm-x86_64/nmi.h index 8f02a2a416e6..8818c39d34e0 100644 --- a/include/asm-x86_64/nmi.h +++ b/include/asm-x86_64/nmi.h @@ -7,25 +7,6 @@ #include #include -struct pt_regs; - -typedef int (*nmi_callback_t)(struct pt_regs * regs, int cpu); - -/** - * set_nmi_callback - * - * Set a handler for an NMI. Only one handler may be - * set. Return 1 if the NMI was handled. - */ -void set_nmi_callback(nmi_callback_t callback); - -/** - * unset_nmi_callback - * - * Remove the handler previously set. - */ -void unset_nmi_callback(void); - /** * do_nmi_callback * @@ -72,8 +53,6 @@ extern int reserve_evntsel_nmi(unsigned int); extern void release_evntsel_nmi(unsigned int); extern void setup_apic_nmi_watchdog (void *); -extern int reserve_lapic_nmi(void); -extern void release_lapic_nmi(void); extern void disable_timer_nmi_watchdog(void); extern void enable_timer_nmi_watchdog(void); extern int nmi_watchdog_tick (struct pt_regs * regs, unsigned reason); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 362a0cc37138..83f168361624 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -76,8 +76,6 @@ extern int compat_log; #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) int unknown_nmi_panic; -extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *, - void __user *, size_t *, loff_t *); #endif /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ @@ -628,7 +626,7 @@ static ctl_table kern_table[] = { .data = &unknown_nmi_panic, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_unknown_nmi_panic, + .proc_handler = &proc_dointvec, }, #endif #if defined(CONFIG_X86) -- cgit v1.2.1 From 407984f1af259b31957c7c05075a454a751bb801 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Tue, 26 Sep 2006 10:52:27 +0200 Subject: [PATCH] x86: Add abilty to enable/disable nmi watchdog with sysctl Adds a new /proc/sys/kernel/nmi call that will enable/disable the nmi watchdog. Signed-off-by: Don Zickus Signed-off-by: Andi Kleen --- arch/i386/kernel/nmi.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++ arch/x86_64/kernel/nmi.c | 48 ++++++++++++++++++++++++++++++++++++++++++++ include/asm-i386/nmi.h | 1 + include/asm-x86_64/nmi.h | 1 + include/linux/sysctl.h | 1 + kernel/sysctl.c | 11 ++++++++++ 6 files changed, 114 insertions(+) diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index acd3fdea2a21..28065d0b71a9 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -846,6 +846,58 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) return 0; } +/* + * proc handler for /proc/sys/kernel/nmi_watchdog + */ +int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int old_state; + + nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; + old_state = nmi_watchdog_enabled; + proc_dointvec(table, write, file, buffer, length, ppos); + if (!!old_state == !!nmi_watchdog_enabled) + return 0; + + if (atomic_read(&nmi_active) < 0) { + printk(KERN_WARNING "NMI watchdog is permanently disabled\n"); + return -EINVAL; + } + + if (nmi_watchdog == NMI_DEFAULT) { + if (nmi_known_cpu() > 0) + nmi_watchdog = NMI_LOCAL_APIC; + else + nmi_watchdog = NMI_IO_APIC; + } + + if (nmi_watchdog == NMI_LOCAL_APIC) + { + if (nmi_watchdog_enabled) + enable_lapic_nmi_watchdog(); + else + disable_lapic_nmi_watchdog(); + } else if (nmi_watchdog == NMI_IO_APIC) { + /* FIXME + * for some reason these functions don't work + */ + printk("Can not enable/disable NMI on IO APIC\n"); + return -EINVAL; +#if 0 + if (nmi_watchdog_enabled) + enable_timer_nmi_watchdog(); + else + disable_timer_nmi_watchdog(); +#endif + } else { + printk( KERN_WARNING + "NMI watchdog doesn't know what hardware to touch\n"); + return -EIO; + } + return 0; +} + #endif EXPORT_SYMBOL(nmi_active); diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 9d175dcf3a2d..3a17411a9a19 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -750,6 +750,54 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) return 0; } +/* + * proc handler for /proc/sys/kernel/nmi + */ +int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int old_state; + + nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; + old_state = nmi_watchdog_enabled; + proc_dointvec(table, write, file, buffer, length, ppos); + if (!!old_state == !!nmi_watchdog_enabled) + return 0; + + if (atomic_read(&nmi_active) < 0) { + printk( KERN_WARNING "NMI watchdog is permanently disabled\n"); + return -EINVAL; + } + + /* if nmi_watchdog is not set yet, then set it */ + nmi_watchdog_default(); + + if (nmi_watchdog == NMI_LOCAL_APIC) + { + if (nmi_watchdog_enabled) + enable_lapic_nmi_watchdog(); + else + disable_lapic_nmi_watchdog(); + } else if (nmi_watchdog == NMI_IO_APIC) { + /* FIXME + * for some reason these functions don't work + */ + printk("Can not enable/disable NMI on IO APIC\n"); + return -EIO; +#if 0 + if (nmi_watchdog_enabled) + enable_timer_nmi_watchdog(); + else + disable_timer_nmi_watchdog(); +#endif + } else { + printk(KERN_WARNING + "NMI watchdog doesn't know what hardware to touch\n"); + return -EIO; + } + return 0; +} + #endif EXPORT_SYMBOL(nmi_active); diff --git a/include/asm-i386/nmi.h b/include/asm-i386/nmi.h index 34d6bf063b6e..13b5d8311bf7 100644 --- a/include/asm-i386/nmi.h +++ b/include/asm-i386/nmi.h @@ -14,6 +14,7 @@ */ int do_nmi_callback(struct pt_regs *regs, int cpu); +extern int nmi_watchdog_enabled; extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); extern int avail_to_resrv_perfctr_nmi(unsigned int); extern int reserve_perfctr_nmi(unsigned int); diff --git a/include/asm-x86_64/nmi.h b/include/asm-x86_64/nmi.h index 8818c39d34e0..2c23b0df87d2 100644 --- a/include/asm-x86_64/nmi.h +++ b/include/asm-x86_64/nmi.h @@ -43,6 +43,7 @@ extern void die_nmi(char *str, struct pt_regs *regs); extern int panic_on_timeout; extern int unknown_nmi_panic; +extern int nmi_watchdog_enabled; extern int check_nmi_watchdog(void); extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 736ed917a4f8..ecb79ba52ae1 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -150,6 +150,7 @@ enum KERN_IA64_UNALIGNED=72, /* int: ia64 unaligned userland trap enable */ KERN_COMPAT_LOG=73, /* int: print compat layer messages */ KERN_MAX_LOCK_DEPTH=74, + KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */ }; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 83f168361624..040de6bd74dd 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -76,6 +76,9 @@ extern int compat_log; #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) int unknown_nmi_panic; +int nmi_watchdog_enabled; +extern int proc_nmi_enabled(struct ctl_table *, int , struct file *, + void __user *, size_t *, loff_t *); #endif /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ @@ -628,6 +631,14 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = KERN_NMI_WATCHDOG, + .procname = "nmi_watchdog", + .data = &nmi_watchdog_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_nmi_enabled, + }, #endif #if defined(CONFIG_X86) { -- cgit v1.2.1 From e33e89ab1a8d295de0500b697f4f31c3ceee9aa2 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Tue, 26 Sep 2006 10:52:27 +0200 Subject: [PATCH] x86: Add abilty to enable/disable nmi watchdog from procfs (update) Adds a new /proc/sys/kernel/nmi_watchdog call that will enable/disable the nmi watchdog. By entering a non-zero value here, a user can enable the nmi watchdog to monitor the online cpus in the system. By entering a zero value here, a user can disable the nmi watchdog and free up a performance counter which could then be utilized by the oprofile subsystem, otherwise oprofile may be short a counter when in use. Signed-off-by: Don Zickus Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- Documentation/filesystems/proc.txt | 14 +++++++++----- arch/i386/kernel/nmi.c | 21 ++++----------------- arch/x86_64/kernel/nmi.c | 21 ++++----------------- 3 files changed, 17 insertions(+), 39 deletions(-) diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 99902ae6804e..7db71d6fba82 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1124,11 +1124,15 @@ debugging information is displayed on console. NMI switch that most IA32 servers have fires unknown NMI up, for example. If a system hangs up, try pressing the NMI switch. -[NOTE] - This function and oprofile share a NMI callback. Therefore this function - cannot be enabled when oprofile is activated. - And NMI watchdog will be disabled when the value in this file is set to - non-zero. +nmi_watchdog +------------ + +Enables/Disables the NMI watchdog on x86 systems. When the value is non-zero +the NMI watchdog is enabled and will continuously test all online cpus to +determine whether or not they are still functioning properly. + +Because the NMI watchdog shares registers with oprofile, by disabling the NMI +watchdog, oprofile may have more registers to utilize. 2.4 /proc/sys/vm - The virtual memory subsystem diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 28065d0b71a9..6241e4448cab 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -847,7 +847,7 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) } /* - * proc handler for /proc/sys/kernel/nmi_watchdog + * proc handler for /proc/sys/kernel/nmi */ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) @@ -861,8 +861,8 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, return 0; if (atomic_read(&nmi_active) < 0) { - printk(KERN_WARNING "NMI watchdog is permanently disabled\n"); - return -EINVAL; + printk( KERN_WARNING "NMI watchdog is permanently disabled\n"); + return -EIO; } if (nmi_watchdog == NMI_DEFAULT) { @@ -872,24 +872,11 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, nmi_watchdog = NMI_IO_APIC; } - if (nmi_watchdog == NMI_LOCAL_APIC) - { + if (nmi_watchdog == NMI_LOCAL_APIC) { if (nmi_watchdog_enabled) enable_lapic_nmi_watchdog(); else disable_lapic_nmi_watchdog(); - } else if (nmi_watchdog == NMI_IO_APIC) { - /* FIXME - * for some reason these functions don't work - */ - printk("Can not enable/disable NMI on IO APIC\n"); - return -EINVAL; -#if 0 - if (nmi_watchdog_enabled) - enable_timer_nmi_watchdog(); - else - disable_timer_nmi_watchdog(); -#endif } else { printk( KERN_WARNING "NMI watchdog doesn't know what hardware to touch\n"); diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 3a17411a9a19..dd57410dad51 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -167,7 +167,7 @@ static __cpuinit inline int nmi_known_cpu(void) } /* Run after command line and cpu_init init, but before all other checks */ -void __cpuinit nmi_watchdog_default(void) +void nmi_watchdog_default(void) { if (nmi_watchdog != NMI_DEFAULT) return; @@ -766,32 +766,19 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, if (atomic_read(&nmi_active) < 0) { printk( KERN_WARNING "NMI watchdog is permanently disabled\n"); - return -EINVAL; + return -EIO; } /* if nmi_watchdog is not set yet, then set it */ nmi_watchdog_default(); - if (nmi_watchdog == NMI_LOCAL_APIC) - { + if (nmi_watchdog == NMI_LOCAL_APIC) { if (nmi_watchdog_enabled) enable_lapic_nmi_watchdog(); else disable_lapic_nmi_watchdog(); - } else if (nmi_watchdog == NMI_IO_APIC) { - /* FIXME - * for some reason these functions don't work - */ - printk("Can not enable/disable NMI on IO APIC\n"); - return -EIO; -#if 0 - if (nmi_watchdog_enabled) - enable_timer_nmi_watchdog(); - else - disable_timer_nmi_watchdog(); -#endif } else { - printk(KERN_WARNING + printk( KERN_WARNING "NMI watchdog doesn't know what hardware to touch\n"); return -EIO; } -- cgit v1.2.1 From 8da5adda91df3d2fcc5300e68da491694c9af019 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Tue, 26 Sep 2006 10:52:27 +0200 Subject: [PATCH] x86: Allow users to force a panic on NMI To quote Alan Cox: The default Linux behaviour on an NMI of either memory or unknown is to continue operation. For many environments such as scientific computing it is preferable that the box is taken out and the error dealt with than an uncorrected parity/ECC error get propogated. A small number of systems do generate NMI's for bizarre random reasons such as power management so the default is unchanged. In other respects the new proc/sys entry works like the existing panic controls already in that directory. This is separate to the edac support - EDAC allows supported chipsets to handle ECC errors well, this change allows unsupported cases to at least panic rather than cause problems further down the line. Signed-off-by: Don Zickus Signed-off-by: Andi Kleen --- arch/i386/kernel/traps.c | 6 ++++++ arch/x86_64/kernel/traps.c | 6 ++++++ include/linux/kernel.h | 1 + include/linux/sysctl.h | 1 + kernel/panic.c | 1 + kernel/sysctl.c | 8 ++++++++ 6 files changed, 23 insertions(+) diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 7db664d0b25c..2f6cb8276480 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -635,6 +635,8 @@ static void mem_parity_error(unsigned char reason, struct pt_regs * regs) "to continue\n"); printk(KERN_EMERG "You probably have a hardware problem with your RAM " "chips\n"); + if (panic_on_unrecovered_nmi) + panic("NMI: Not continuing"); /* Clear and disable the memory parity error line. */ clear_mem_error(reason); @@ -670,6 +672,10 @@ static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) reason, smp_processor_id()); printk("Dazed and confused, but trying to continue\n"); printk("Do you have a strange power saving mode enabled?\n"); + + if (panic_on_unrecovered_nmi) + panic("NMI: Not continuing"); + } static DEFINE_SPINLOCK(nmi_print_lock); diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 42bc070fdf11..b18829db2a6a 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -732,6 +732,8 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs) { printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n"); printk("You probably have a hardware problem with your RAM chips\n"); + if (panic_on_unrecovered_nmi) + panic("NMI: Not continuing"); /* Clear and disable the memory parity error line. */ reason = (reason & 0xf) | 4; @@ -757,6 +759,10 @@ unknown_nmi_error(unsigned char reason, struct pt_regs * regs) { printk("Uhhuh. NMI received for unknown reason %02x.\n", reason); printk("Dazed and confused, but trying to continue\n"); printk("Do you have a strange power saving mode enabled?\n"); + + if (panic_on_unrecovered_nmi) + panic("NMI: Not continuing"); + } /* Runs on IST stack. This code must keep interrupts off all the time. diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2b2ae4fdce8b..1ff9609300b4 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -186,6 +186,7 @@ extern void bust_spinlocks(int yes); extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */ extern int panic_timeout; extern int panic_on_oops; +extern int panic_on_unrecovered_nmi; extern int tainted; extern const char *print_tainted(void); extern void add_taint(unsigned); diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index ecb79ba52ae1..432778446ad2 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -151,6 +151,7 @@ enum KERN_COMPAT_LOG=73, /* int: print compat layer messages */ KERN_MAX_LOCK_DEPTH=74, KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */ + KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */ }; diff --git a/kernel/panic.c b/kernel/panic.c index 8010b9b17aca..d2db3e2209e0 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -21,6 +21,7 @@ #include int panic_on_oops; +int panic_on_unrecovered_nmi; int tainted; static int pause_on_oops; static int pause_on_oops_flag; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 040de6bd74dd..220e20564124 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -641,6 +641,14 @@ static ctl_table kern_table[] = { }, #endif #if defined(CONFIG_X86) + { + .ctl_name = KERN_PANIC_ON_NMI, + .procname = "panic_on_unrecovered_nmi", + .data = &panic_on_unrecovered_nmi, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = KERN_BOOTLOADER_TYPE, .procname = "bootloader_type", -- cgit v1.2.1 From c41c5cd3b20a2d81c30498f13b1527847a8fdf69 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Tue, 26 Sep 2006 10:52:27 +0200 Subject: [PATCH] x86: x86 clean up nmi panic messages Clean up some of the output messages on the nmi error paths to make more sense when they are displayed. This is mainly a cosmetic fix and shouldn't impact any normal code path. Signed-off-by: Don Zickus Signed-off-by: Andi Kleen --- arch/i386/kernel/traps.c | 15 ++++++++------- arch/x86_64/kernel/traps.c | 21 ++++++++++++++------- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 2f6cb8276480..3c85c89f68d8 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -631,13 +631,15 @@ gp_in_kernel: static void mem_parity_error(unsigned char reason, struct pt_regs * regs) { - printk(KERN_EMERG "Uhhuh. NMI received. Dazed and confused, but trying " - "to continue\n"); + printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on " + "CPU %d.\n", reason, smp_processor_id()); printk(KERN_EMERG "You probably have a hardware problem with your RAM " "chips\n"); if (panic_on_unrecovered_nmi) panic("NMI: Not continuing"); + printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); + /* Clear and disable the memory parity error line. */ clear_mem_error(reason); } @@ -668,14 +670,13 @@ static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) return; } #endif - printk("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", - reason, smp_processor_id()); - printk("Dazed and confused, but trying to continue\n"); - printk("Do you have a strange power saving mode enabled?\n"); - + printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on " + "CPU %d.\n", reason, smp_processor_id()); + printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); if (panic_on_unrecovered_nmi) panic("NMI: Not continuing"); + printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); } static DEFINE_SPINLOCK(nmi_print_lock); diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index b18829db2a6a..dae10df60926 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -730,10 +730,15 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, static __kprobes void mem_parity_error(unsigned char reason, struct pt_regs * regs) { - printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n"); - printk("You probably have a hardware problem with your RAM chips\n"); + printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", + reason); + printk(KERN_EMERG "You probably have a hardware problem with your " + "RAM chips\n"); + if (panic_on_unrecovered_nmi) - panic("NMI: Not continuing"); + panic("NMI: Not continuing"); + + printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); /* Clear and disable the memory parity error line. */ reason = (reason & 0xf) | 4; @@ -756,13 +761,15 @@ io_check_error(unsigned char reason, struct pt_regs * regs) static __kprobes void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) -{ printk("Uhhuh. NMI received for unknown reason %02x.\n", reason); - printk("Dazed and confused, but trying to continue\n"); - printk("Do you have a strange power saving mode enabled?\n"); +{ + printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", + reason); + printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); if (panic_on_unrecovered_nmi) - panic("NMI: Not continuing"); + panic("NMI: Not continuing"); + printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); } /* Runs on IST stack. This code must keep interrupts off all the time. -- cgit v1.2.1 From 4038f901cf102a40715b900984ed7540a9fa637f Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 26 Sep 2006 10:52:27 +0200 Subject: [PATCH] i386/x86-64: Fix NMI watchdog suspend/resume Making NMI suspend/resume work with SMP. We use CPU hotplug to offline APs in SMP suspend/resume. Only BSP executes sysdev's .suspend/.resume method. APs should follow CPU hotplug code path. And: +From: Don Zickus Makes the start/stop paths of nmi watchdog more robust to handle the suspend/resume cases more gracefully. AK: I merged the two patches together Signed-off-by: Shaohua Li Signed-off-by: Andi Kleen Cc: Don Zickus Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/nmi.c | 33 ++++++++++++++++++++++++++------- arch/i386/kernel/smpboot.c | 3 ++- arch/x86_64/kernel/nmi.c | 33 ++++++++++++++++++++++++++------- arch/x86_64/kernel/smpboot.c | 2 ++ include/asm-i386/nmi.h | 1 + include/asm-x86_64/nmi.h | 1 + 6 files changed, 58 insertions(+), 15 deletions(-) diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 6241e4448cab..8e4ed930ce6b 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -63,7 +63,6 @@ struct nmi_watchdog_ctlblk { static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk); /* local prototypes */ -static void stop_apic_nmi_watchdog(void *unused); static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu); extern void show_registers(struct pt_regs *regs); @@ -341,15 +340,20 @@ static int nmi_pm_active; /* nmi_active before suspend */ static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) { + /* only CPU0 goes here, other CPUs should be offline */ nmi_pm_active = atomic_read(&nmi_active); - disable_lapic_nmi_watchdog(); + stop_apic_nmi_watchdog(NULL); + BUG_ON(atomic_read(&nmi_active) != 0); return 0; } static int lapic_nmi_resume(struct sys_device *dev) { - if (nmi_pm_active > 0) - enable_lapic_nmi_watchdog(); + /* only CPU0 goes here, other CPUs should be offline */ + if (nmi_pm_active > 0) { + setup_apic_nmi_watchdog(NULL); + touch_nmi_watchdog(); + } return 0; } @@ -626,11 +630,21 @@ static void stop_p4_watchdog(void) void setup_apic_nmi_watchdog (void *unused) { + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + /* only support LOCAL and IO APICs for now */ if ((nmi_watchdog != NMI_LOCAL_APIC) && (nmi_watchdog != NMI_IO_APIC)) return; + if (wd->enabled == 1) + return; + + /* cheap hack to support suspend/resume */ + /* if cpu0 is not active neither should the other cpus */ + if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0)) + return; + if (nmi_watchdog == NMI_LOCAL_APIC) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: @@ -663,17 +677,22 @@ void setup_apic_nmi_watchdog (void *unused) return; } } - __get_cpu_var(nmi_watchdog_ctlblk.enabled) = 1; + wd->enabled = 1; atomic_inc(&nmi_active); } -static void stop_apic_nmi_watchdog(void *unused) +void stop_apic_nmi_watchdog(void *unused) { + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + /* only support LOCAL and IO APICs for now */ if ((nmi_watchdog != NMI_LOCAL_APIC) && (nmi_watchdog != NMI_IO_APIC)) return; + if (wd->enabled == 0) + return; + if (nmi_watchdog == NMI_LOCAL_APIC) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: @@ -697,7 +716,7 @@ static void stop_apic_nmi_watchdog(void *unused) return; } } - __get_cpu_var(nmi_watchdog_ctlblk.enabled) = 0; + wd->enabled = 0; atomic_dec(&nmi_active); } diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index efe07990e7fc..9367af76ce37 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -1376,7 +1376,8 @@ int __cpu_disable(void) */ if (cpu == 0) return -EBUSY; - + if (nmi_watchdog == NMI_LOCAL_APIC) + stop_apic_nmi_watchdog(NULL); clear_local_APIC(); /* Allow any queued timer interrupts to get serviced */ local_irq_enable(); diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index dd57410dad51..5a35975e5763 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -63,7 +63,6 @@ struct nmi_watchdog_ctlblk { static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk); /* local prototypes */ -static void stop_apic_nmi_watchdog(void *unused); static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu); /* converts an msr to an appropriate reservation bit */ @@ -337,15 +336,20 @@ static int nmi_pm_active; /* nmi_active before suspend */ static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) { + /* only CPU0 goes here, other CPUs should be offline */ nmi_pm_active = atomic_read(&nmi_active); - disable_lapic_nmi_watchdog(); + stop_apic_nmi_watchdog(NULL); + BUG_ON(atomic_read(&nmi_active) != 0); return 0; } static int lapic_nmi_resume(struct sys_device *dev) { - if (nmi_pm_active > 0) - enable_lapic_nmi_watchdog(); + /* only CPU0 goes here, other CPUs should be offline */ + if (nmi_pm_active > 0) { + setup_apic_nmi_watchdog(NULL); + touch_nmi_watchdog(); + } return 0; } @@ -561,11 +565,21 @@ static void stop_p4_watchdog(void) void setup_apic_nmi_watchdog(void *unused) { + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + /* only support LOCAL and IO APICs for now */ if ((nmi_watchdog != NMI_LOCAL_APIC) && (nmi_watchdog != NMI_IO_APIC)) return; + if (wd->enabled == 1) + return; + + /* cheap hack to support suspend/resume */ + /* if cpu0 is not active neither should the other cpus */ + if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0)) + return; + if (nmi_watchdog == NMI_LOCAL_APIC) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: @@ -582,17 +596,22 @@ void setup_apic_nmi_watchdog(void *unused) return; } } - __get_cpu_var(nmi_watchdog_ctlblk.enabled) = 1; + wd->enabled = 1; atomic_inc(&nmi_active); } -static void stop_apic_nmi_watchdog(void *unused) +void stop_apic_nmi_watchdog(void *unused) { + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + /* only support LOCAL and IO APICs for now */ if ((nmi_watchdog != NMI_LOCAL_APIC) && (nmi_watchdog != NMI_IO_APIC)) return; + if (wd->enabled == 0) + return; + if (nmi_watchdog == NMI_LOCAL_APIC) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: @@ -607,7 +626,7 @@ static void stop_apic_nmi_watchdog(void *unused) return; } } - __get_cpu_var(nmi_watchdog_ctlblk.enabled) = 0; + wd->enabled = 0; atomic_dec(&nmi_active); } diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 975380207b46..15555879ce95 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -1233,6 +1233,8 @@ int __cpu_disable(void) if (cpu == 0) return -EBUSY; + if (nmi_watchdog == NMI_LOCAL_APIC) + stop_apic_nmi_watchdog(NULL); clear_local_APIC(); /* diff --git a/include/asm-i386/nmi.h b/include/asm-i386/nmi.h index 13b5d8311bf7..303bcd4592bb 100644 --- a/include/asm-i386/nmi.h +++ b/include/asm-i386/nmi.h @@ -23,6 +23,7 @@ extern int reserve_evntsel_nmi(unsigned int); extern void release_evntsel_nmi(unsigned int); extern void setup_apic_nmi_watchdog (void *); +extern void stop_apic_nmi_watchdog (void *); extern void disable_timer_nmi_watchdog(void); extern void enable_timer_nmi_watchdog(void); extern int nmi_watchdog_tick (struct pt_regs * regs, unsigned reason); diff --git a/include/asm-x86_64/nmi.h b/include/asm-x86_64/nmi.h index 2c23b0df87d2..578596494275 100644 --- a/include/asm-x86_64/nmi.h +++ b/include/asm-x86_64/nmi.h @@ -54,6 +54,7 @@ extern int reserve_evntsel_nmi(unsigned int); extern void release_evntsel_nmi(unsigned int); extern void setup_apic_nmi_watchdog (void *); +extern void stop_apic_nmi_watchdog (void *); extern void disable_timer_nmi_watchdog(void); extern void enable_timer_nmi_watchdog(void); extern int nmi_watchdog_tick (struct pt_regs * regs, unsigned reason); -- cgit v1.2.1 From fac58550e80c307bf17cfa0dd544fca4eff120a5 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:27 +0200 Subject: [PATCH] Fix up panic messages for different NMI panics When a unknown NMI happened the panic would claim a NMI watchdog timeout. Also it would check the variable set by nmi_watchdog=panic and panic then. Fix up the panic message to be generic Unconditionally panic on unknown NMI when panic on unknown nmi is enabled. Noticed by Jan Beulich Cc: jbeulich@novell.com Signed-off-by: Andi Kleen --- arch/x86_64/kernel/nmi.c | 5 +++-- arch/x86_64/kernel/traps.c | 7 +++---- include/asm-x86_64/nmi.h | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 5a35975e5763..1b76d1574529 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -695,7 +695,8 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) */ local_inc(&__get_cpu_var(alert_counter)); if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) - die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs); + die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs, + panic_on_timeout); } else { __get_cpu_var(last_irq_sum) = sum; local_set(&__get_cpu_var(alert_counter), 0); @@ -765,7 +766,7 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) char buf[64]; sprintf(buf, "NMI received for unknown reason %02x\n", reason); - die_nmi(buf,regs); + die_nmi(buf, regs, 1); /* Always panic here */ return 0; } diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index dae10df60926..96f62a033242 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -570,7 +570,7 @@ void die(const char * str, struct pt_regs * regs, long err) do_exit(SIGSEGV); } -void __kprobes die_nmi(char *str, struct pt_regs *regs) +void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) { unsigned long flags = oops_begin(); @@ -582,9 +582,8 @@ void __kprobes die_nmi(char *str, struct pt_regs *regs) show_registers(regs); if (kexec_should_crash(current)) crash_kexec(regs); - if (panic_on_timeout || panic_on_oops) - panic("nmi watchdog"); - printk("console shuts up ...\n"); + if (do_panic || panic_on_oops) + panic("Non maskable interrupt"); oops_end(flags); nmi_exit(); local_irq_enable(); diff --git a/include/asm-x86_64/nmi.h b/include/asm-x86_64/nmi.h index 578596494275..cbf2669bca71 100644 --- a/include/asm-x86_64/nmi.h +++ b/include/asm-x86_64/nmi.h @@ -37,7 +37,7 @@ static inline void unset_nmi_pm_callback(struct pm_dev * dev) #endif /* CONFIG_PM */ extern void default_do_nmi(struct pt_regs *); -extern void die_nmi(char *str, struct pt_regs *regs); +extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); #define get_nmi_reason() inb(0x61) -- cgit v1.2.1 From c7c19f8e5e564fb1a354a065befc8a88a9bb08fd Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 26 Sep 2006 10:52:27 +0200 Subject: [PATCH] i386: make functions static This patch makes the following needlessly global functions static: - nmi_int.c: profile_exceptions_notify() - nmi_timer_int.c: profile_timer_exceptions_notify() Signed-off-by: Adrian Bunk Signed-off-by: Andi Kleen --- arch/i386/oprofile/nmi_int.c | 4 ++-- arch/i386/oprofile/nmi_timer_int.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/i386/oprofile/nmi_int.c b/arch/i386/oprofile/nmi_int.c index b3610188bcf0..3700eef78743 100644 --- a/arch/i386/oprofile/nmi_int.c +++ b/arch/i386/oprofile/nmi_int.c @@ -83,8 +83,8 @@ static void exit_driverfs(void) #define exit_driverfs() do { } while (0) #endif /* CONFIG_PM */ -int profile_exceptions_notify(struct notifier_block *self, - unsigned long val, void *data) +static int profile_exceptions_notify(struct notifier_block *self, + unsigned long val, void *data) { struct die_args *args = (struct die_args *)data; int ret = NOTIFY_DONE; diff --git a/arch/i386/oprofile/nmi_timer_int.c b/arch/i386/oprofile/nmi_timer_int.c index 93ca48f804ac..abf0ba52a635 100644 --- a/arch/i386/oprofile/nmi_timer_int.c +++ b/arch/i386/oprofile/nmi_timer_int.c @@ -19,8 +19,8 @@ #include #include -int profile_timer_exceptions_notify(struct notifier_block *self, - unsigned long val, void *data) +static int profile_timer_exceptions_notify(struct notifier_block *self, + unsigned long val, void *data) { struct die_args *args = (struct die_args *)data; int ret = NOTIFY_DONE; -- cgit v1.2.1 From 3f22c5789eb76fd9aabe4be37ba609c793f046f9 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Tue, 26 Sep 2006 10:52:27 +0200 Subject: [PATCH] kdump x86_64 nmi event notification fix After a crash we should wait for NMI IPI event and not for external NMI or NMI watchdog tick. Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen Cc: Don Zickus Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/x86_64/kernel/crash.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c index 44c8af65325e..fc57a139123e 100644 --- a/arch/x86_64/kernel/crash.c +++ b/arch/x86_64/kernel/crash.c @@ -102,7 +102,7 @@ static int crash_nmi_callback(struct notifier_block *self, struct pt_regs *regs; int cpu; - if (val != DIE_NMI) + if (val != DIE_NMI_IPI) return NOTIFY_OK; regs = ((struct die_args *)data)->regs; @@ -114,7 +114,7 @@ static int crash_nmi_callback(struct notifier_block *self, * an NMI if system was initially booted with nmi_watchdog parameter. */ if (cpu == crashing_cpu) - return 1; + return NOTIFY_STOP; local_irq_disable(); crash_save_this_cpu(regs, cpu); -- cgit v1.2.1 From 260d6790b6a2a0a048b7f96d154c2b49f1e6515a Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Tue, 26 Sep 2006 10:52:27 +0200 Subject: [PATCH] i386: Kdump i386 nmi event notification fix After a crash we should wait for NMI IPI event and not for external NMI or NMI watchdog tick. Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen Cc: Don Zickus Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/crash.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c index 736c76d6b31d..67d297dc1003 100644 --- a/arch/i386/kernel/crash.c +++ b/arch/i386/kernel/crash.c @@ -102,7 +102,7 @@ static int crash_nmi_callback(struct notifier_block *self, struct pt_regs fixed_regs; int cpu; - if (val != DIE_NMI) + if (val != DIE_NMI_IPI) return NOTIFY_OK; regs = ((struct die_args *)data)->regs; @@ -113,7 +113,7 @@ static int crash_nmi_callback(struct notifier_block *self, * an NMI if system was initially booted with nmi_watchdog parameter. */ if (cpu == crashing_cpu) - return 1; + return NOTIFY_STOP; local_irq_disable(); if (!user_mode_vm(regs)) { -- cgit v1.2.1 From 1de84979dfc527c422abf63f27beabe43892989b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:27 +0200 Subject: [PATCH] i386: Enable NMI watchdog by default I've had good experiences with having this on by default on x86-64. It turns nasty hangs into easier to debug oopses. Enable the local APIC wdog by default for systems newer than 2004. This comes from a strange compromise: according to arjan the reason it was off by default was some old IBM systems that corrupted registered when NMI happened in SMI. Can't remember more specific, but >= 2004 should avoid these. It's probably overly broad because most older systems should be ok (and the really old systems won't be supported by the local apic watchdog anyways) Signed-off-by: Andi Kleen --- arch/i386/kernel/nmi.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 8e4ed930ce6b..6e5085d5d2f6 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -204,6 +205,14 @@ static int __init check_nmi_watchdog(void) unsigned int *prev_nmi_count; int cpu; + /* Enable NMI watchdog for newer systems. + Actually it should be safe for most systems before 2004 too except + for some IBM systems that corrupt registers when NMI happens + during SMM. Unfortunately we don't have more exact information + on these and use this coarse check. */ + if (nmi_watchdog == NMI_DEFAULT && dmi_get_year(DMI_BIOS_DATE) >= 2004) + nmi_watchdog = NMI_LOCAL_APIC; + if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DEFAULT)) return 0; -- cgit v1.2.1 From 248dcb2ffffe8f3e4a369556a68988788c208111 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Tue, 26 Sep 2006 10:52:27 +0200 Subject: [PATCH] x86: i386/x86-64 Add nmi watchdog support for new Intel CPUs AK: This redoes the changes I temporarily reverted. Intel now has support for Architectural Performance Monitoring Counters ( Refer to IA-32 Intel Architecture Software Developer's Manual http://www.intel.com/design/pentium4/manuals/253669.htm ). This feature is present starting from Intel Core Duo and Intel Core Solo processors. What this means is, the performance monitoring counters and some performance monitoring events are now defined in an architectural way (using cpuid). And there will be no need to check for family/model etc for these architectural events. Below is the patch to use this performance counters in nmi watchdog driver. Patch handles both i386 and x86-64 kernels. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Andi Kleen --- arch/i386/kernel/nmi.c | 126 +++++++++++++++++++++++++++++-- arch/x86_64/kernel/nmi.c | 130 ++++++++++++++++++++++++++++++-- include/asm-i386/intel_arch_perfmon.h | 31 ++++++++ include/asm-x86_64/intel_arch_perfmon.h | 31 ++++++++ 4 files changed, 308 insertions(+), 10 deletions(-) create mode 100644 include/asm-i386/intel_arch_perfmon.h create mode 100644 include/asm-x86_64/intel_arch_perfmon.h diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 6e5085d5d2f6..7b9a053effa3 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "mach_traps.h" @@ -77,6 +78,9 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) case X86_VENDOR_AMD: return (msr - MSR_K7_PERFCTR0); case X86_VENDOR_INTEL: + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return (msr - MSR_ARCH_PERFMON_PERFCTR0); + switch (boot_cpu_data.x86) { case 6: return (msr - MSR_P6_PERFCTR0); @@ -95,6 +99,9 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) case X86_VENDOR_AMD: return (msr - MSR_K7_EVNTSEL0); case X86_VENDOR_INTEL: + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return (msr - MSR_ARCH_PERFMON_EVENTSEL0); + switch (boot_cpu_data.x86) { case 6: return (msr - MSR_P6_EVNTSEL0); @@ -174,7 +181,10 @@ static __cpuinit inline int nmi_known_cpu(void) case X86_VENDOR_AMD: return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6)); case X86_VENDOR_INTEL: - return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6)); + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return 1; + else + return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6)); } return 0; } @@ -261,8 +271,24 @@ static int __init check_nmi_watchdog(void) /* now that we know it works we can reduce NMI frequency to something more reasonable; makes a difference in some configs */ - if (nmi_watchdog == NMI_LOCAL_APIC) + if (nmi_watchdog == NMI_LOCAL_APIC) { + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + nmi_hz = 1; + /* + * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter + * are writable, with higher bits sign extending from bit 31. + * So, we can only program the counter with 31 bit values and + * 32nd bit should be 1, for 33.. to be 1. + * Find the appropriate nmi_hz + */ + if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 && + ((u64)cpu_khz * 1000) > 0x7fffffffULL) { + u64 count = (u64)cpu_khz * 1000; + do_div(count, 0x7fffffffUL); + nmi_hz = count + 1; + } + } kfree(prev_nmi_count); return 0; @@ -637,6 +663,85 @@ static void stop_p4_watchdog(void) release_perfctr_nmi(wd->perfctr_msr); } +#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL +#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK + +static int setup_intel_arch_watchdog(void) +{ + unsigned int ebx; + union cpuid10_eax eax; + unsigned int unused; + unsigned int perfctr_msr, evntsel_msr; + unsigned int evntsel; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + /* + * Check whether the Architectural PerfMon supports + * Unhalted Core Cycles Event or not. + * NOTE: Corresponding bit = 0 in ebx indicates event present. + */ + cpuid(10, &(eax.full), &ebx, &unused, &unused); + if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || + (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) + goto fail; + + perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0; + evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL0; + + if (!reserve_perfctr_nmi(perfctr_msr)) + goto fail; + + if (!reserve_evntsel_nmi(evntsel_msr)) + goto fail1; + + wrmsrl(perfctr_msr, 0UL); + + evntsel = ARCH_PERFMON_EVENTSEL_INT + | ARCH_PERFMON_EVENTSEL_OS + | ARCH_PERFMON_EVENTSEL_USR + | ARCH_PERFMON_NMI_EVENT_SEL + | ARCH_PERFMON_NMI_EVENT_UMASK; + + /* setup the timer */ + wrmsr(evntsel_msr, evntsel, 0); + write_watchdog_counter(perfctr_msr, "INTEL_ARCH_PERFCTR0"); + apic_write(APIC_LVTPC, APIC_DM_NMI); + evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsr(evntsel_msr, evntsel, 0); + + wd->perfctr_msr = perfctr_msr; + wd->evntsel_msr = evntsel_msr; + wd->cccr_msr = 0; //unused + wd->check_bit = 1ULL << (eax.split.bit_width - 1); + return 1; +fail1: + release_perfctr_nmi(perfctr_msr); +fail: + return 0; +} + +static void stop_intel_arch_watchdog(void) +{ + unsigned int ebx; + union cpuid10_eax eax; + unsigned int unused; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + /* + * Check whether the Architectural PerfMon supports + * Unhalted Core Cycles Event or not. + * NOTE: Corresponding bit = 0 in ebx indicates event present. + */ + cpuid(10, &(eax.full), &ebx, &unused, &unused); + if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || + (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) + return; + + wrmsr(wd->evntsel_msr, 0, 0); + release_evntsel_nmi(wd->evntsel_msr); + release_perfctr_nmi(wd->perfctr_msr); +} + void setup_apic_nmi_watchdog (void *unused) { struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); @@ -663,6 +768,11 @@ void setup_apic_nmi_watchdog (void *unused) return; break; case X86_VENDOR_INTEL: + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + if (!setup_intel_arch_watchdog()) + return; + break; + } switch (boot_cpu_data.x86) { case 6: if (boot_cpu_data.x86_model > 0xd) @@ -708,6 +818,10 @@ void stop_apic_nmi_watchdog(void *unused) stop_k7_watchdog(); break; case X86_VENDOR_INTEL: + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + stop_intel_arch_watchdog(); + break; + } switch (boot_cpu_data.x86) { case 6: if (boot_cpu_data.x86_model > 0xd) @@ -831,10 +945,12 @@ int nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) wrmsrl(wd->cccr_msr, dummy); apic_write(APIC_LVTPC, APIC_DM_NMI); } - else if (wd->perfctr_msr == MSR_P6_PERFCTR0) { - /* Only P6 based Pentium M need to re-unmask + else if (wd->perfctr_msr == MSR_P6_PERFCTR0 || + wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { + /* P6 based Pentium M need to re-unmask * the apic vector but it doesn't hurt - * other P6 variant */ + * other P6 variant. + * ArchPerfom/Core Duo also needs this */ apic_write(APIC_LVTPC, APIC_DM_NMI); } /* start the cycle over again */ diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 1b76d1574529..4d6fb047952e 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -26,6 +26,7 @@ #include #include #include +#include /* perfctr_nmi_owner tracks the ownership of the perfctr registers: * evtsel_nmi_owner tracks the ownership of the event selection @@ -73,7 +74,10 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) case X86_VENDOR_AMD: return (msr - MSR_K7_PERFCTR0); case X86_VENDOR_INTEL: - return (msr - MSR_P4_BPU_PERFCTR0); + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return (msr - MSR_ARCH_PERFMON_PERFCTR0); + else + return (msr - MSR_P4_BPU_PERFCTR0); } return 0; } @@ -86,7 +90,10 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) case X86_VENDOR_AMD: return (msr - MSR_K7_EVNTSEL0); case X86_VENDOR_INTEL: - return (msr - MSR_P4_BSU_ESCR0); + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return (msr - MSR_ARCH_PERFMON_EVENTSEL0); + else + return (msr - MSR_P4_BSU_ESCR0); } return 0; } @@ -160,7 +167,10 @@ static __cpuinit inline int nmi_known_cpu(void) case X86_VENDOR_AMD: return boot_cpu_data.x86 == 15; case X86_VENDOR_INTEL: - return boot_cpu_data.x86 == 15; + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return 1; + else + return (boot_cpu_data.x86 == 15); } return 0; } @@ -246,8 +256,22 @@ int __init check_nmi_watchdog (void) /* now that we know it works we can reduce NMI frequency to something more reasonable; makes a difference in some configs */ - if (nmi_watchdog == NMI_LOCAL_APIC) + if (nmi_watchdog == NMI_LOCAL_APIC) { + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + nmi_hz = 1; + /* + * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter + * are writable, with higher bits sign extending from bit 31. + * So, we can only program the counter with 31 bit values and + * 32nd bit should be 1, for 33.. to be 1. + * Find the appropriate nmi_hz + */ + if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 && + ((u64)cpu_khz * 1000) > 0x7fffffffULL) { + nmi_hz = ((u64)cpu_khz * 1000) / 0x7fffffffUL + 1; + } + } kfree(counts); return 0; @@ -563,6 +587,87 @@ static void stop_p4_watchdog(void) release_perfctr_nmi(wd->perfctr_msr); } +#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL +#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK + +static int setup_intel_arch_watchdog(void) +{ + unsigned int ebx; + union cpuid10_eax eax; + unsigned int unused; + unsigned int perfctr_msr, evntsel_msr; + unsigned int evntsel; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + /* + * Check whether the Architectural PerfMon supports + * Unhalted Core Cycles Event or not. + * NOTE: Corresponding bit = 0 in ebx indicates event present. + */ + cpuid(10, &(eax.full), &ebx, &unused, &unused); + if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || + (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) + goto fail; + + perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0; + evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL0; + + if (!reserve_perfctr_nmi(perfctr_msr)) + goto fail; + + if (!reserve_evntsel_nmi(evntsel_msr)) + goto fail1; + + wrmsrl(perfctr_msr, 0UL); + + evntsel = ARCH_PERFMON_EVENTSEL_INT + | ARCH_PERFMON_EVENTSEL_OS + | ARCH_PERFMON_EVENTSEL_USR + | ARCH_PERFMON_NMI_EVENT_SEL + | ARCH_PERFMON_NMI_EVENT_UMASK; + + /* setup the timer */ + wrmsr(evntsel_msr, evntsel, 0); + wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); + + apic_write(APIC_LVTPC, APIC_DM_NMI); + evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsr(evntsel_msr, evntsel, 0); + + wd->perfctr_msr = perfctr_msr; + wd->evntsel_msr = evntsel_msr; + wd->cccr_msr = 0; //unused + wd->check_bit = 1ULL << (eax.split.bit_width - 1); + return 1; +fail1: + release_perfctr_nmi(perfctr_msr); +fail: + return 0; +} + +static void stop_intel_arch_watchdog(void) +{ + unsigned int ebx; + union cpuid10_eax eax; + unsigned int unused; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + /* + * Check whether the Architectural PerfMon supports + * Unhalted Core Cycles Event or not. + * NOTE: Corresponding bit = 0 in ebx indicates event present. + */ + cpuid(10, &(eax.full), &ebx, &unused, &unused); + if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || + (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) + return; + + wrmsr(wd->evntsel_msr, 0, 0); + + release_evntsel_nmi(wd->evntsel_msr); + release_perfctr_nmi(wd->perfctr_msr); +} + void setup_apic_nmi_watchdog(void *unused) { struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); @@ -589,6 +694,11 @@ void setup_apic_nmi_watchdog(void *unused) return; break; case X86_VENDOR_INTEL: + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + if (!setup_intel_arch_watchdog()) + return; + break; + } if (!setup_p4_watchdog()) return; break; @@ -620,6 +730,10 @@ void stop_apic_nmi_watchdog(void *unused) stop_k7_watchdog(); break; case X86_VENDOR_INTEL: + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + stop_intel_arch_watchdog(); + break; + } stop_p4_watchdog(); break; default: @@ -724,7 +838,13 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) dummy &= ~P4_CCCR_OVF; wrmsrl(wd->cccr_msr, dummy); apic_write(APIC_LVTPC, APIC_DM_NMI); - } + } else if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { + /* + * ArchPerfom/Core Duo needs to re-unmask + * the apic vector + */ + apic_write(APIC_LVTPC, APIC_DM_NMI); + } /* start the cycle over again */ wrmsrl(wd->perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); rc = 1; diff --git a/include/asm-i386/intel_arch_perfmon.h b/include/asm-i386/intel_arch_perfmon.h new file mode 100644 index 000000000000..b52cd60a075b --- /dev/null +++ b/include/asm-i386/intel_arch_perfmon.h @@ -0,0 +1,31 @@ +#ifndef X86_INTEL_ARCH_PERFMON_H +#define X86_INTEL_ARCH_PERFMON_H 1 + +#define MSR_ARCH_PERFMON_PERFCTR0 0xc1 +#define MSR_ARCH_PERFMON_PERFCTR1 0xc2 + +#define MSR_ARCH_PERFMON_EVENTSEL0 0x186 +#define MSR_ARCH_PERFMON_EVENTSEL1 0x187 + +#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) +#define ARCH_PERFMON_EVENTSEL_INT (1 << 20) +#define ARCH_PERFMON_EVENTSEL_OS (1 << 17) +#define ARCH_PERFMON_EVENTSEL_USR (1 << 16) + +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL (0x3c) +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0) +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \ + (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) + +union cpuid10_eax { + struct { + unsigned int version_id:8; + unsigned int num_counters:8; + unsigned int bit_width:8; + unsigned int mask_length:8; + } split; + unsigned int full; +}; + +#endif /* X86_INTEL_ARCH_PERFMON_H */ diff --git a/include/asm-x86_64/intel_arch_perfmon.h b/include/asm-x86_64/intel_arch_perfmon.h new file mode 100644 index 000000000000..8633331420ec --- /dev/null +++ b/include/asm-x86_64/intel_arch_perfmon.h @@ -0,0 +1,31 @@ +#ifndef X86_64_INTEL_ARCH_PERFMON_H +#define X86_64_INTEL_ARCH_PERFMON_H 1 + +#define MSR_ARCH_PERFMON_PERFCTR0 0xc1 +#define MSR_ARCH_PERFMON_PERFCTR1 0xc2 + +#define MSR_ARCH_PERFMON_EVENTSEL0 0x186 +#define MSR_ARCH_PERFMON_EVENTSEL1 0x187 + +#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) +#define ARCH_PERFMON_EVENTSEL_INT (1 << 20) +#define ARCH_PERFMON_EVENTSEL_OS (1 << 17) +#define ARCH_PERFMON_EVENTSEL_USR (1 << 16) + +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL (0x3c) +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0) +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \ + (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) + +union cpuid10_eax { + struct { + unsigned int version_id:8; + unsigned int num_counters:8; + unsigned int bit_width:8; + unsigned int mask_length:8; + } split; + unsigned int full; +}; + +#endif /* X86_64_INTEL_ARCH_PERFMON_H */ -- cgit v1.2.1 From 81af4449af9c9b686a4eeeb00112614621655704 Mon Sep 17 00:00:00 2001 From: Vojtech Pavlik Date: Tue, 26 Sep 2006 10:52:28 +0200 Subject: [PATCH] Add macros for rdtscp This patch adds macros for reading tsc via the RDTSCP instruction, as well as writing the auxilliary MSR read by RDTSCP to msr.h [AK: changed rdtscp definition for old binutils] Signed-off-by: Vojtech Pavlik Signed-off-by: Andi Kleen --- include/asm-x86_64/msr.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/include/asm-x86_64/msr.h b/include/asm-x86_64/msr.h index 10f8b51cec8b..37e194169fac 100644 --- a/include/asm-x86_64/msr.h +++ b/include/asm-x86_64/msr.h @@ -66,14 +66,25 @@ #define rdtscl(low) \ __asm__ __volatile__ ("rdtsc" : "=a" (low) : : "edx") +#define rdtscp(low,high,aux) \ + asm volatile (".byte 0x0f,0x01,0xf9" : "=a" (low), "=d" (high), "=c" (aux)) + #define rdtscll(val) do { \ unsigned int __a,__d; \ asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \ (val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \ } while(0) +#define rdtscpll(val, aux) do { \ + unsigned long __a, __d; \ + asm volatile (".byte 0x0f,0x01,0xf9" : "=a" (__a), "=d" (__d), "=c" (aux)); \ + (val) = (__d << 32) | __a; \ +} while (0) + #define write_tsc(val1,val2) wrmsr(0x10, val1, val2) +#define write_rdtscp_aux(val) wrmsr(0xc0000103, val, 0) + #define rdpmc(counter,low,high) \ __asm__ __volatile__("rdpmc" \ : "=a" (low), "=d" (high) \ -- cgit v1.2.1 From a670fad0adb1cc6202a607d250f10bd380593905 Mon Sep 17 00:00:00 2001 From: Vojtech Pavlik Date: Tue, 26 Sep 2006 10:52:28 +0200 Subject: [PATCH] Add initalization of the RDTSCP auxilliary values This patch adds initalization of the RDTSCP auxilliary values to CPU numbers to time.c. If RDTSCP is available, the MSRs are written with the respective values. It can be later used to initalize per-cpu timekeeping variables. AK: Some cleanups. Move externs into headers and fix CPU hotplug. Signed-off-by: Vojtech Pavlik Signed-off-by: Andi Kleen --- arch/x86_64/kernel/smpboot.c | 2 ++ arch/x86_64/kernel/time.c | 47 +++++++++++++++++++++++++++++++++----------- include/asm-x86_64/proto.h | 1 + 3 files changed, 38 insertions(+), 12 deletions(-) diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 15555879ce95..582896f7d420 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -1181,6 +1181,8 @@ void __init smp_cpus_done(unsigned int max_cpus) #endif check_nmi_watchdog(); + + time_init_gtod(); } #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 7a9b18224182..97b9e46d1992 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -24,6 +24,8 @@ #include #include #include +#include +#include #include #include #ifdef CONFIG_ACPI @@ -49,7 +51,7 @@ static void cpufreq_delayed_get(void); extern void i8254_timer_resume(void); extern int using_apic_timer; -static char *time_init_gtod(void); +static char *timename = NULL; DEFINE_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); @@ -893,11 +895,21 @@ static struct irqaction irq0 = { timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL }; -void __init time_init(void) +static int __cpuinit +time_cpu_notifier(struct notifier_block *nb, unsigned long action, void *hcpu) { - char *timename; - char *gtod; + unsigned cpu = (unsigned long) hcpu; + if (action == CPU_ONLINE && + cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP)) { + unsigned p; + p = smp_processor_id() | (cpu_to_node(smp_processor_id())<<12); + write_rdtscp_aux(p); + } + return NOTIFY_DONE; +} +void __init time_init(void) +{ if (nohpet) vxtime.hpet_address = 0; @@ -931,18 +943,19 @@ void __init time_init(void) } vxtime.mode = VXTIME_TSC; - gtod = time_init_gtod(); - - printk(KERN_INFO "time.c: Using %ld.%06ld MHz WALL %s GTOD %s timer.\n", - vxtime_hz / 1000000, vxtime_hz % 1000000, timename, gtod); - printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz; vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; vxtime.last_tsc = get_cycles_sync(); setup_irq(0, &irq0); set_cyc2ns_scale(cpu_khz); + + hotcpu_notifier(time_cpu_notifier, 0); + time_cpu_notifier(NULL, CPU_ONLINE, (void *)(long)smp_processor_id()); + +#ifndef CONFIG_SMP + time_init_gtod(); +#endif } /* @@ -973,12 +986,13 @@ __cpuinit int unsynchronized_tsc(void) /* * Decide what mode gettimeofday should use. */ -__init static char *time_init_gtod(void) +void time_init_gtod(void) { char *timetype; if (unsynchronized_tsc()) notsc = 1; + if (vxtime.hpet_address && notsc) { timetype = hpet_use_timer ? "HPET" : "PIT/HPET"; if (hpet_use_timer) @@ -1001,7 +1015,16 @@ __init static char *time_init_gtod(void) timetype = hpet_use_timer ? "HPET/TSC" : "PIT/TSC"; vxtime.mode = VXTIME_TSC; } - return timetype; + + printk(KERN_INFO "time.c: Using %ld.%06ld MHz WALL %s GTOD %s timer.\n", + vxtime_hz / 1000000, vxtime_hz % 1000000, timename, timetype); + printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", + cpu_khz / 1000, cpu_khz % 1000); + vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz; + vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; + vxtime.last_tsc = get_cycles_sync(); + + set_cyc2ns_scale(cpu_khz); } __setup("report_lost_ticks", time_setup); diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h index 038fe1f47e6f..3b1c60247902 100644 --- a/include/asm-x86_64/proto.h +++ b/include/asm-x86_64/proto.h @@ -51,6 +51,7 @@ extern unsigned long long monotonic_base; extern int sysctl_vsyscall; extern int nohpet; extern unsigned long vxtime_hz; +extern void time_init_gtod(void); extern int numa_setup(char *opt); -- cgit v1.2.1 From c08c820508233b424deab3302bc404bbecc6493a Mon Sep 17 00:00:00 2001 From: Vojtech Pavlik Date: Tue, 26 Sep 2006 10:52:28 +0200 Subject: [PATCH] Add the vgetcpu vsyscall This patch adds a vgetcpu vsyscall, which depending on the CPU RDTSCP capability uses either the RDTSCP or CPUID to obtain a CPU and node numbers and pass them to the program. AK: Lots of changes over Vojtech's original code: Better prototype for vgetcpu() It's better to pass the cpu / node numbers as separate arguments to avoid mistakes when going from SMP to NUMA. Also add a fast time stamp based cache using a user supplied argument to speed things more up. Use fast method from Chuck Ebbert to retrieve node/cpu from GDT limit instead of CPUID Made sure RDTSCP init is always executed after node is known. Drop printk Signed-off-by: Vojtech Pavlik Signed-off-by: Andi Kleen --- arch/x86_64/kernel/head.S | 2 +- arch/x86_64/kernel/time.c | 13 ++++--- arch/x86_64/kernel/vmlinux.lds.S | 3 ++ arch/x86_64/kernel/vsyscall.c | 84 +++++++++++++++++++++++++++++++++++++++- include/asm-x86_64/segment.h | 5 ++- include/asm-x86_64/smp.h | 12 ++++-- include/asm-x86_64/vsyscall.h | 9 +++++ include/linux/getcpu.h | 16 ++++++++ 8 files changed, 130 insertions(+), 14 deletions(-) create mode 100644 include/linux/getcpu.h diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index c9739ca81d06..505ec4a57506 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -371,7 +371,7 @@ ENTRY(cpu_gdt_table) .quad 0,0 /* TSS */ .quad 0,0 /* LDT */ .quad 0,0,0 /* three TLS descriptors */ - .quad 0 /* unused */ + .quad 0x0000f40000000000 /* node/CPU stored in limit */ gdt_end: /* asm/segment.h:GDT_ENTRIES must match this */ /* This should be a multiple of the cache line size */ diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 97b9e46d1992..560ed944dc0e 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -899,12 +899,8 @@ static int __cpuinit time_cpu_notifier(struct notifier_block *nb, unsigned long action, void *hcpu) { unsigned cpu = (unsigned long) hcpu; - if (action == CPU_ONLINE && - cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP)) { - unsigned p; - p = smp_processor_id() | (cpu_to_node(smp_processor_id())<<12); - write_rdtscp_aux(p); - } + if (action == CPU_ONLINE) + vsyscall_set_cpu(cpu); return NOTIFY_DONE; } @@ -993,6 +989,11 @@ void time_init_gtod(void) if (unsynchronized_tsc()) notsc = 1; + if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) + vgetcpu_mode = VGETCPU_RDTSCP; + else + vgetcpu_mode = VGETCPU_LSL; + if (vxtime.hpet_address && notsc) { timetype = hpet_use_timer ? "HPET" : "PIT/HPET"; if (hpet_use_timer) diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index 7c4de31471d4..8d5a5149bb3a 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -99,6 +99,9 @@ SECTIONS .vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) } vxtime = VVIRT(.vxtime); + .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) } + vgetcpu_mode = VVIRT(.vgetcpu_mode); + .wall_jiffies : AT(VLOAD(.wall_jiffies)) { *(.wall_jiffies) } wall_jiffies = VVIRT(.wall_jiffies); diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c index f603037df162..902783bc4d53 100644 --- a/arch/x86_64/kernel/vsyscall.c +++ b/arch/x86_64/kernel/vsyscall.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -33,11 +34,15 @@ #include #include #include +#include +#include +#include #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) int __sysctl_vsyscall __section_sysctl_vsyscall = 1; seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED; +int __vgetcpu_mode __section_vgetcpu_mode; #include @@ -127,9 +132,46 @@ time_t __vsyscall(1) vtime(time_t *t) return __xtime.tv_sec; } -long __vsyscall(2) venosys_0(void) +/* Fast way to get current CPU and node. + This helps to do per node and per CPU caches in user space. + The result is not guaranteed without CPU affinity, but usually + works out because the scheduler tries to keep a thread on the same + CPU. + + tcache must point to a two element sized long array. + All arguments can be NULL. */ +long __vsyscall(2) +vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) { - return -ENOSYS; + unsigned int dummy, p; + unsigned long j = 0; + + /* Fast cache - only recompute value once per jiffies and avoid + relatively costly rdtscp/cpuid otherwise. + This works because the scheduler usually keeps the process + on the same CPU and this syscall doesn't guarantee its + results anyways. + We do this here because otherwise user space would do it on + its own in a likely inferior way (no access to jiffies). + If you don't like it pass NULL. */ + if (tcache && tcache->t0 == (j = __jiffies)) { + p = tcache->t1; + } else if (__vgetcpu_mode == VGETCPU_RDTSCP) { + /* Load per CPU data from RDTSCP */ + rdtscp(dummy, dummy, p); + } else { + /* Load per CPU data from GDT */ + asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); + } + if (tcache) { + tcache->t0 = j; + tcache->t1 = p; + } + if (cpu) + *cpu = p & 0xfff; + if (node) + *node = p >> 12; + return 0; } long __vsyscall(3) venosys_1(void) @@ -200,6 +242,43 @@ static ctl_table kernel_root_table2[] = { #endif +static void __cpuinit write_rdtscp_cb(void *info) +{ + write_rdtscp_aux((unsigned long)info); +} + +void __cpuinit vsyscall_set_cpu(int cpu) +{ + unsigned long *d; + unsigned long node = 0; +#ifdef CONFIG_NUMA + node = cpu_to_node[cpu]; +#endif + if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP)) { + void *info = (void *)((node << 12) | cpu); + /* Can happen on preemptive kernel */ + if (get_cpu() == cpu) + write_rdtscp_cb(info); +#ifdef CONFIG_SMP + else { + /* the notifier is unfortunately not executed on the + target CPU */ + smp_call_function_single(cpu,write_rdtscp_cb,info,0,1); + } +#endif + put_cpu(); + } + + /* Store cpu number in limit so that it can be loaded quickly + in user space in vgetcpu. + 12 bits for the CPU and 8 bits for the node. */ + d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU); + *d = 0x0f40000000000ULL; + *d |= cpu; + *d |= (node & 0xf) << 12; + *d |= (node >> 4) << 48; +} + static void __init map_vsyscall(void) { extern char __vsyscall_0; @@ -214,6 +293,7 @@ static int __init vsyscall_init(void) VSYSCALL_ADDR(__NR_vgettimeofday))); BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); + BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu)); map_vsyscall(); #ifdef CONFIG_SYSCTL register_sysctl_table(kernel_root_table2, 0); diff --git a/include/asm-x86_64/segment.h b/include/asm-x86_64/segment.h index d4bed33fb32c..334ddcdd8f92 100644 --- a/include/asm-x86_64/segment.h +++ b/include/asm-x86_64/segment.h @@ -20,15 +20,16 @@ #define __USER_CS 0x33 /* 6*8+3 */ #define __USER32_DS __USER_DS -#define GDT_ENTRY_TLS 1 #define GDT_ENTRY_TSS 8 /* needs two entries */ #define GDT_ENTRY_LDT 10 /* needs two entries */ #define GDT_ENTRY_TLS_MIN 12 #define GDT_ENTRY_TLS_MAX 14 -/* 15 free */ #define GDT_ENTRY_TLS_ENTRIES 3 +#define GDT_ENTRY_PER_CPU 15 /* Abused to load per CPU data from limit */ +#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU * 8 + 3) + /* TLS indexes for 64bit - hardcoded in arch_prctl */ #define FS_TLS 0 #define GS_TLS 1 diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h index 6805e1feb300..d61547fd833b 100644 --- a/include/asm-x86_64/smp.h +++ b/include/asm-x86_64/smp.h @@ -133,13 +133,19 @@ static __inline int logical_smp_processor_id(void) /* we don't want to mark this access volatile - bad code generation */ return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR)); } -#endif #ifdef CONFIG_SMP #define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu] #else #define cpu_physical_id(cpu) boot_cpu_id -#endif - +static inline int smp_call_function_single(int cpuid, void (*func) (void *info), + void *info, int retry, int wait) +{ + /* Disable interrupts here? */ + func(info); + return 0; +} +#endif /* !CONFIG_SMP */ +#endif /* !__ASSEMBLY */ #endif diff --git a/include/asm-x86_64/vsyscall.h b/include/asm-x86_64/vsyscall.h index 146b24402a5f..2281e9399b96 100644 --- a/include/asm-x86_64/vsyscall.h +++ b/include/asm-x86_64/vsyscall.h @@ -4,6 +4,7 @@ enum vsyscall_num { __NR_vgettimeofday, __NR_vtime, + __NR_vgetcpu, }; #define VSYSCALL_START (-10UL << 20) @@ -15,6 +16,7 @@ enum vsyscall_num { #include #define __section_vxtime __attribute__ ((unused, __section__ (".vxtime"), aligned(16))) +#define __section_vgetcpu_mode __attribute__ ((unused, __section__ (".vgetcpu_mode"), aligned(16))) #define __section_wall_jiffies __attribute__ ((unused, __section__ (".wall_jiffies"), aligned(16))) #define __section_jiffies __attribute__ ((unused, __section__ (".jiffies"), aligned(16))) #define __section_sys_tz __attribute__ ((unused, __section__ (".sys_tz"), aligned(16))) @@ -26,6 +28,9 @@ enum vsyscall_num { #define VXTIME_HPET 2 #define VXTIME_PMTMR 3 +#define VGETCPU_RDTSCP 1 +#define VGETCPU_LSL 2 + struct vxtime_data { long hpet_address; /* HPET base address */ int last; @@ -40,6 +45,7 @@ struct vxtime_data { /* vsyscall space (readonly) */ extern struct vxtime_data __vxtime; +extern int __vgetcpu_mode; extern struct timespec __xtime; extern volatile unsigned long __jiffies; extern unsigned long __wall_jiffies; @@ -48,6 +54,7 @@ extern seqlock_t __xtime_lock; /* kernel space (writeable) */ extern struct vxtime_data vxtime; +extern int vgetcpu_mode; extern unsigned long wall_jiffies; extern struct timezone sys_tz; extern int sysctl_vsyscall; @@ -55,6 +62,8 @@ extern seqlock_t xtime_lock; extern int sysctl_vsyscall; +extern void vsyscall_set_cpu(int cpu); + #define ARCH_HAVE_XTIME_LOCK 1 #endif /* __KERNEL__ */ diff --git a/include/linux/getcpu.h b/include/linux/getcpu.h new file mode 100644 index 000000000000..031ed3780e45 --- /dev/null +++ b/include/linux/getcpu.h @@ -0,0 +1,16 @@ +#ifndef _LINUX_GETCPU_H +#define _LINUX_GETCPU_H 1 + +/* Cache for getcpu() to speed it up. Results might be upto a jiffie + out of date, but will be faster. + User programs should not refer to the contents of this structure. + It is only a cache for vgetcpu(). It might change in future kernels. + The user program must store this information per thread (__thread) + If you want 100% accurate information pass NULL instead. */ +struct getcpu_cache { + unsigned long t0; + unsigned long t1; + unsigned long res[4]; +}; + +#endif -- cgit v1.2.1 From 3cfc348bf90ffaa777c188652aa297f04eb94de8 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:28 +0200 Subject: [PATCH] x86: Add portable getcpu call For NUMA optimization and some other algorithms it is useful to have a fast to get the current CPU and node numbers in user space. x86-64 added a fast way to do this in a vsyscall. This adds a generic syscall for other architectures to make it a generic portable facility. I expect some of them will also implement it as a faster vsyscall. The cache is an optimization for the x86-64 vsyscall optimization. Since what the syscall returns is an approximation anyways and user space often wants very fast results it can be cached for some time. The norma methods to get this information in user space are relatively slow The vsyscall is in a better position to manage the cache because it has direct access to a fast time stamp (jiffies). For the generic syscall optimization it doesn't help much, but enforce a valid argument to keep programs portable I only added an i386 syscall entry for now. Other architectures can follow as needed. AK: Also added some cleanups from Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/syscall_table.S | 1 + arch/x86_64/ia32/ia32entry.S | 1 + include/asm-i386/unistd.h | 3 ++- include/linux/syscalls.h | 2 ++ kernel/sys.c | 31 +++++++++++++++++++++++++++++++ 5 files changed, 37 insertions(+), 1 deletion(-) diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S index dd63d4775398..7e639f78b0b9 100644 --- a/arch/i386/kernel/syscall_table.S +++ b/arch/i386/kernel/syscall_table.S @@ -317,3 +317,4 @@ ENTRY(sys_call_table) .long sys_tee /* 315 */ .long sys_vmsplice .long sys_move_pages + .long sys_getcpu diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 30ed0f6f4a2b..32fd32bea07c 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -713,4 +713,5 @@ ia32_sys_call_table: .quad sys_tee .quad compat_sys_vmsplice .quad compat_sys_move_pages + .quad sys_getcpu ia32_syscall_end: diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h index fc1c8ddae149..565d0897b205 100644 --- a/include/asm-i386/unistd.h +++ b/include/asm-i386/unistd.h @@ -323,10 +323,11 @@ #define __NR_tee 315 #define __NR_vmsplice 316 #define __NR_move_pages 317 +#define __NR_getcpu 318 #ifdef __KERNEL__ -#define NR_syscalls 318 +#define NR_syscalls 319 /* * user-visible error numbers are in the range -1 - -128: see diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 008f04c56737..3f0f716225ec 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -53,6 +53,7 @@ struct mq_attr; struct compat_stat; struct compat_timeval; struct robust_list_head; +struct getcpu_cache; #include #include @@ -596,5 +597,6 @@ asmlinkage long sys_get_robust_list(int pid, size_t __user *len_ptr); asmlinkage long sys_set_robust_list(struct robust_list_head __user *head, size_t len); +asmlinkage long sys_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *cache); #endif diff --git a/kernel/sys.c b/kernel/sys.c index e236f98f7ec5..3f894775488d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -2062,3 +2063,33 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, } return error; } + +asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep, + struct getcpu_cache __user *cache) +{ + int err = 0; + int cpu = raw_smp_processor_id(); + if (cpup) + err |= put_user(cpu, cpup); + if (nodep) + err |= put_user(cpu_to_node(cpu), nodep); + if (cache) { + /* + * The cache is not needed for this implementation, + * but make sure user programs pass something + * valid. vsyscall implementations can instead make + * good use of the cache. Only use t0 and t1 because + * these are available in both 32bit and 64bit ABI (no + * need for a compat_getcpu). 32bit has enough + * padding + */ + unsigned long t0, t1; + get_user(t0, &cache->t0); + get_user(t1, &cache->t1); + t0++; + t1++; + put_user(t0, &cache->t0); + put_user(t1, &cache->t1); + } + return err ? -EFAULT : 0; +} -- cgit v1.2.1 From 2f766d16062d0147edff91be15de4a950667ca42 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:28 +0200 Subject: [PATCH] Clean up asm/smp.h includes No need to include it from entry.S Drop all the #ifdef __ASSEMBLY__ Signed-off-by: Andi Kleen --- arch/x86_64/kernel/entry.S | 2 -- include/asm-x86_64/smp.h | 12 ------------ 2 files changed, 14 deletions(-) diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index aa8d8939abc1..7eb1678e098d 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -27,10 +27,8 @@ * - schedule it carefully for the final hardware. */ -#define ASSEMBLY 1 #include #include -#include #include #include #include diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h index d61547fd833b..612d208961a2 100644 --- a/include/asm-x86_64/smp.h +++ b/include/asm-x86_64/smp.h @@ -4,15 +4,12 @@ /* * We need the APIC definitions automatically as part of 'smp.h' */ -#ifndef __ASSEMBLY__ #include #include #include extern int disable_apic; -#endif #ifdef CONFIG_X86_LOCAL_APIC -#ifndef __ASSEMBLY__ #include #include #ifdef CONFIG_X86_IO_APIC @@ -21,10 +18,8 @@ extern int disable_apic; #include #include #endif -#endif #ifdef CONFIG_SMP -#ifndef ASSEMBLY #include @@ -83,13 +78,10 @@ extern void prefill_possible_map(void); extern unsigned num_processors; extern unsigned disabled_cpus; -#endif /* !ASSEMBLY */ - #define NO_PROC_ID 0xFF /* No processor magic marker */ #endif -#ifndef ASSEMBLY /* * Some lowlevel functions might want to know about * the real APIC ID <-> CPU # mapping. @@ -111,8 +103,6 @@ static inline int cpu_present_to_apicid(int mps_cpu) return BAD_APICID; } -#endif /* !ASSEMBLY */ - #ifndef CONFIG_SMP #define stack_smp_processor_id() 0 #define safe_smp_processor_id() 0 @@ -127,7 +117,6 @@ static inline int cpu_present_to_apicid(int mps_cpu) }) #endif -#ifndef __ASSEMBLY__ static __inline int logical_smp_processor_id(void) { /* we don't want to mark this access volatile - bad code generation */ @@ -146,6 +135,5 @@ static inline int smp_call_function_single(int cpuid, void (*func) (void *info), return 0; } #endif /* !CONFIG_SMP */ -#endif /* !__ASSEMBLY */ #endif -- cgit v1.2.1 From d3a4f48d4866b8623ca9adde8ce4e5fde979c132 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 26 Sep 2006 10:52:28 +0200 Subject: [PATCH] x86-64 TIF flags for debug regs and io bitmap in ctxsw Hello, Following my discussion with Andi. Here is a patch that introduces two new TIF flags to simplify the context switch code in __switch_to(). The idea is to minimize the number of cache lines accessed in the common case, i.e., when neither the debug registers nor the I/O bitmap are used. This patch covers the x86-64 modifications. A patch for i386 follows. Changelog: - add TIF_DEBUG to track when debug registers are active - add TIF_IO_BITMAP to track when I/O bitmap is used - modify __switch_to() to use the new TIF flags : eranian@hpl.hp.com Signed-off-by: Andi Kleen --- arch/x86_64/ia32/ptrace32.c | 4 +++ arch/x86_64/kernel/ioport.c | 1 + arch/x86_64/kernel/process.c | 73 +++++++++++++++++++++++----------------- arch/x86_64/kernel/ptrace.c | 8 +++-- include/asm-x86_64/thread_info.h | 7 ++++ 5 files changed, 60 insertions(+), 33 deletions(-) diff --git a/arch/x86_64/ia32/ptrace32.c b/arch/x86_64/ia32/ptrace32.c index 659c0722f6b8..72bf92a9d375 100644 --- a/arch/x86_64/ia32/ptrace32.c +++ b/arch/x86_64/ia32/ptrace32.c @@ -117,6 +117,10 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 val) if ((0x5454 >> ((val >> (16 + 4*i)) & 0xf)) & 1) return -EIO; child->thread.debugreg7 = val; + if (val) + set_tsk_thread_flag(child, TIF_DEBUG); + else + clear_tsk_thread_flag(child, TIF_DEBUG); break; default: diff --git a/arch/x86_64/kernel/ioport.c b/arch/x86_64/kernel/ioport.c index b81614970ecc..fe063d3cfe42 100644 --- a/arch/x86_64/kernel/ioport.c +++ b/arch/x86_64/kernel/ioport.c @@ -56,6 +56,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) memset(bitmap, 0xff, IO_BITMAP_BYTES); t->io_bitmap_ptr = bitmap; + set_thread_flag(TIF_IO_BITMAP); } /* diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index bb6745d13b8f..6e0527635b4c 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -350,6 +350,7 @@ void exit_thread(void) kfree(t->io_bitmap_ptr); t->io_bitmap_ptr = NULL; + clear_thread_flag(TIF_IO_BITMAP); /* * Careful, clear this in the TSS too: */ @@ -369,6 +370,7 @@ void flush_thread(void) if (t->flags & _TIF_IA32) current_thread_info()->status |= TS_COMPAT; } + t->flags &= ~_TIF_DEBUG; tsk->thread.debugreg0 = 0; tsk->thread.debugreg1 = 0; @@ -461,7 +463,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, asm("mov %%es,%0" : "=m" (p->thread.es)); asm("mov %%ds,%0" : "=m" (p->thread.ds)); - if (unlikely(me->thread.io_bitmap_ptr != NULL)) { + if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); if (!p->thread.io_bitmap_ptr) { p->thread.io_bitmap_max = 0; @@ -469,6 +471,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, } memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES); + set_tsk_thread_flag(p, TIF_IO_BITMAP); } /* @@ -498,6 +501,40 @@ out: */ #define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r) +static inline void __switch_to_xtra(struct task_struct *prev_p, + struct task_struct *next_p, + struct tss_struct *tss) +{ + struct thread_struct *prev, *next; + + prev = &prev_p->thread, + next = &next_p->thread; + + if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { + loaddebug(next, 0); + loaddebug(next, 1); + loaddebug(next, 2); + loaddebug(next, 3); + /* no 4 and 5 */ + loaddebug(next, 6); + loaddebug(next, 7); + } + + if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { + /* + * Copy the relevant range of the IO bitmap. + * Normally this is 128 bytes or less: + */ + memcpy(tss->io_bitmap, next->io_bitmap_ptr, + max(prev->io_bitmap_max, next->io_bitmap_max)); + } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) { + /* + * Clear any possible leftover bits: + */ + memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); + } +} + /* * switch_to(x,y) should switch tasks from x to y. * @@ -586,37 +623,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); /* - * Now maybe reload the debug registers + * Now maybe reload the debug registers and handle I/O bitmaps */ - if (unlikely(next->debugreg7)) { - loaddebug(next, 0); - loaddebug(next, 1); - loaddebug(next, 2); - loaddebug(next, 3); - /* no 4 and 5 */ - loaddebug(next, 6); - loaddebug(next, 7); - } - - - /* - * Handle the IO bitmap - */ - if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { - if (next->io_bitmap_ptr) - /* - * Copy the relevant range of the IO bitmap. - * Normally this is 128 bytes or less: - */ - memcpy(tss->io_bitmap, next->io_bitmap_ptr, - max(prev->io_bitmap_max, next->io_bitmap_max)); - else { - /* - * Clear any possible leftover bits: - */ - memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); - } - } + if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW)) + || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) + __switch_to_xtra(prev_p, next_p, tss); return prev_p; } diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c index 2d50024c9f30..d35ec1bc696a 100644 --- a/arch/x86_64/kernel/ptrace.c +++ b/arch/x86_64/kernel/ptrace.c @@ -420,9 +420,13 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) if ((0x5554 >> ((data >> (16 + 4*i)) & 0xf)) & 1) break; if (i == 4) { - child->thread.debugreg7 = data; + child->thread.debugreg7 = data; + if (data) + set_tsk_thread_flag(child, TIF_DEBUG); + else + clear_tsk_thread_flag(child, TIF_DEBUG); ret = 0; - } + } break; } break; diff --git a/include/asm-x86_64/thread_info.h b/include/asm-x86_64/thread_info.h index 790c512a4369..787a08114b48 100644 --- a/include/asm-x86_64/thread_info.h +++ b/include/asm-x86_64/thread_info.h @@ -120,6 +120,8 @@ static inline struct thread_info *stack_thread_info(void) #define TIF_FORK 18 /* ret_from_fork */ #define TIF_ABI_PENDING 19 #define TIF_MEMDIE 20 +#define TIF_DEBUG 21 /* uses debug registers */ +#define TIF_IO_BITMAP 22 /* uses I/O bitmap */ #define _TIF_SYSCALL_TRACE (1< Date: Tue, 26 Sep 2006 10:52:28 +0200 Subject: [PATCH] Don't print virtual address in HPET initialization virtual addresses don't belong into kernel logs for non debugging Cc: clemens@ladisch.de Signed-off-by: Andi Kleen --- drivers/char/hpet.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c index 8afba339f05a..58b0eb581114 100644 --- a/drivers/char/hpet.c +++ b/drivers/char/hpet.c @@ -868,8 +868,8 @@ int hpet_alloc(struct hpet_data *hdp) do_div(temp, period); hpetp->hp_tick_freq = temp; /* ticks per second */ - printk(KERN_INFO "hpet%d: at MMIO 0x%lx (virtual 0x%p), IRQ%s", - hpetp->hp_which, hdp->hd_phys_address, hdp->hd_address, + printk(KERN_INFO "hpet%d: at MMIO 0x%lx, IRQ%s", + hpetp->hp_which, hdp->hd_phys_address, hpetp->hp_ntimer > 1 ? "s" : ""); for (i = 0; i < hpetp->hp_ntimer; i++) printk("%s %d", i > 0 ? "," : "", hdp->hd_irq[i]); -- cgit v1.2.1 From d5d9ca6d882f7c8d47ef91a701fc042cbebbc334 Mon Sep 17 00:00:00 2001 From: Adam Henley Date: Tue, 26 Sep 2006 10:52:28 +0200 Subject: [PATCH] A few trivial spelling and grammar fixes A few trivial spelling and grammar mistakes picked up in "arch/x86_64/aperture.c", "arch/x86_64/crash.c" and "arch/x86_64/apic.c". I think all are correct fixes but am ever aware of my fallibility :o) This is my first patch submission so all feedback is appreciated, esp. WRT CCing to Linus, Andi and trivial@kernel.org, is this correct? And which is the most appropriate kernel version to diff against? If any. Should apply cleanly to 2.6.18-rc1 Signed-off-by: Adam Henley Signed-off-by: Andi Kleen - adam --- arch/x86_64/kernel/aperture.c | 2 +- arch/x86_64/kernel/apic.c | 4 ++-- arch/x86_64/kernel/crash.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c index 58af8e73738b..04dbf1662523 100644 --- a/arch/x86_64/kernel/aperture.c +++ b/arch/x86_64/kernel/aperture.c @@ -48,7 +48,7 @@ static u32 __init allocate_aperture(void) /* * Aperture has to be naturally aligned. This means an 2GB aperture won't - * have much chances to find a place in the lower 4GB of memory. + * have much chance of finding a place in the lower 4GB of memory. * Unfortunately we cannot move it up because that would make the * IOMMU useless. */ diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 692e9b579743..63d9b037afc6 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -400,7 +400,7 @@ void __cpuinit setup_local_APIC (void) value |= APIC_SPIV_APIC_ENABLED; /* - * Some unknown Intel IO/APIC (or APIC) errata is biting us with + * Some unknown Intel IO/APIC (or APIC) errata are biting us with * certain networking cards. If high frequency interrupts are * happening on a particular IOAPIC pin, plus the IOAPIC routing * entry is masked/unmasked at a high rate as well then sooner or @@ -950,7 +950,7 @@ void smp_local_timer_interrupt(struct pt_regs *regs) * We take the 'long' return path, and there every subsystem * grabs the appropriate locks (kernel lock/ irq lock). * - * we might want to decouple profiling from the 'long path', + * We might want to decouple profiling from the 'long path', * and do the profiling totally in assembly. * * Currently this isn't too much of an issue (performance wise), diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c index fc57a139123e..7d7a9e75f70c 100644 --- a/arch/x86_64/kernel/crash.c +++ b/arch/x86_64/kernel/crash.c @@ -69,7 +69,7 @@ static void crash_save_this_cpu(struct pt_regs *regs, int cpu) * for the data I pass, and I need tags * on the data to indicate what information I have * squirrelled away. ELF notes happen to provide - * all of that that no need to invent something new. + * all of that, no need to invent something new. */ buf = (u32*)per_cpu_ptr(crash_notes, cpu); -- cgit v1.2.1 From c16b63e09d9d03158e0a92e961234e94c4862620 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:28 +0200 Subject: [PATCH] i386/x86-64: Don't randomize stack top when no randomization personality is set Based on patch from Frank van Maarseveen , but extended. Signed-off-by: Andi Kleen --- arch/i386/kernel/process.c | 3 ++- arch/x86_64/kernel/process.c | 2 +- fs/binfmt_elf.c | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 8657c739656a..b741c3e1a5eb 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -905,7 +906,7 @@ asmlinkage int sys_get_thread_area(struct user_desc __user *u_info) unsigned long arch_align_stack(unsigned long sp) { - if (randomize_va_space) + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) sp -= get_random_int() % 8192; return sp & ~0xf; } diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index 6e0527635b4c..6fbd19564e4e 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -845,7 +845,7 @@ int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) unsigned long arch_align_stack(unsigned long sp) { - if (randomize_va_space) + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) sp -= get_random_int() % 8192; return sp & ~0xf; } diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 672a3b90bc55..5109dbff93bf 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -515,7 +515,8 @@ static unsigned long randomize_stack_top(unsigned long stack_top) { unsigned int random_variable = 0; - if (current->flags & PF_RANDOMIZE) { + if ((current->flags & PF_RANDOMIZE) && + !(current->personality & ADDR_NO_RANDOMIZE)) { random_variable = get_random_int() & STACK_RND_MASK; random_variable <<= PAGE_SHIFT; } -- cgit v1.2.1 From 0cb91a2293648507886563ccb91979cfc94d6a4b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:28 +0200 Subject: [PATCH] i386: Account spinlocks to the caller during profiling for !FP kernels This ports the algorithm from x86-64 (with improvements) to i386. Previously this only worked for frame pointer enabled kernels. But spinlocks have a very simple stack frame that can be manually analyzed. Do this. Signed-off-by: Andi Kleen --- arch/i386/kernel/time.c | 23 +++++++++++++++++++---- include/asm-i386/ptrace.h | 4 ---- kernel/spinlock.c | 5 +++++ 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index edd00f6cee37..5af802ef00b2 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -130,18 +130,33 @@ static int set_rtc_mmss(unsigned long nowtime) int timer_ack; -#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER) unsigned long profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); - if (!user_mode_vm(regs) && in_lock_functions(pc)) +#ifdef CONFIG_SMP + if (!user_mode_vm(regs) && in_lock_functions(pc)) { +#ifdef CONFIG_FRAME_POINTER return *(unsigned long *)(regs->ebp + 4); - +#else + unsigned long *sp; + if ((regs->xcs & 3) == 0) + sp = (unsigned long *)®s->esp; + else + sp = (unsigned long *)regs->esp; + /* Return address is either directly at stack pointer + or above a saved eflags. Eflags has bits 22-31 zero, + kernel addresses don't. */ + if (sp[0] >> 22) + return sp[0]; + if (sp[1] >> 22) + return sp[1]; +#endif + } +#endif return pc; } EXPORT_SYMBOL(profile_pc); -#endif /* * This is the same as the above, except we _also_ save the current diff --git a/include/asm-i386/ptrace.h b/include/asm-i386/ptrace.h index f324c53b6f9a..30a442ec2059 100644 --- a/include/asm-i386/ptrace.h +++ b/include/asm-i386/ptrace.h @@ -80,11 +80,7 @@ static inline int user_mode_vm(struct pt_regs *regs) return ((regs->xcs & 3) | (regs->eflags & VM_MASK)) != 0; } #define instruction_pointer(regs) ((regs)->eip) -#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER) extern unsigned long profile_pc(struct pt_regs *regs); -#else -#define profile_pc(regs) instruction_pointer(regs) -#endif #endif /* __KERNEL__ */ #endif diff --git a/kernel/spinlock.c b/kernel/spinlock.c index fb524b009eef..9644a41e0bef 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -7,6 +7,11 @@ * * This file contains the spinlock/rwlock implementations for the * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them) + * + * Note that some architectures have special knowledge about the + * stack frames of these functions in their profile_pc. If you + * change anything significant here that could change the stack + * frame contact the architecture maintainers. */ #include -- cgit v1.2.1 From 31679f38d88696ed032d59e457f1605c97e7d719 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:28 +0200 Subject: [PATCH] Simplify profile_pc on x86-64 Use knowledge about EFLAGS layout (bits 22:63 are always 0) to distingush EFLAGS word and kernel address in the spin lock stack frame. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/time.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 560ed944dc0e..ea00915d393a 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -189,20 +189,15 @@ unsigned long profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); - /* Assume the lock function has either no stack frame or only a single - word. This checks if the address on the stack looks like a kernel - text address. - There is a small window for false hits, but in that case the tick - is just accounted to the spinlock function. - Better would be to write these functions in assembler again - and check exactly. */ + /* Assume the lock function has either no stack frame or a copy + of eflags from PUSHF + Eflags always has bits 22 and up cleared unlike kernel addresses. */ if (!user_mode(regs) && in_lock_functions(pc)) { - char *v = *(char **)regs->rsp; - if ((v >= _stext && v <= _etext) || - (v >= _sinittext && v <= _einittext) || - (v >= (char *)MODULES_VADDR && v <= (char *)MODULES_END)) - return (unsigned long)v; - return ((unsigned long *)regs->rsp)[1]; + unsigned long *sp = (unsigned long *)regs->rsp; + if (sp[0] >> 22) + return sp[0]; + if (sp[1] >> 22) + return sp[1]; } return pc; } -- cgit v1.2.1 From 538b5b419c7ae39a4b2deb15278da36102e42346 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:28 +0200 Subject: [PATCH] Document backtracer selection options Signed-off-by: Andi Kleen --- Documentation/x86_64/boot-options.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt index 6da24e7a56cb..4303e0c12476 100644 --- a/Documentation/x86_64/boot-options.txt +++ b/Documentation/x86_64/boot-options.txt @@ -245,6 +245,13 @@ Debugging newfallback: use new unwinder but fall back to old if it gets stuck (default) + call_trace=[old|both|newfallback|new] + old: use old inexact backtracer + new: use new exact dwarf2 unwinder + both: print entries from both + newfallback: use new unwinder but fall back to old if it gets + stuck (default) + Misc noreplacement Don't replace instructions with more appropriate ones -- cgit v1.2.1 From b4062b16094038334d9bbadac0397a3fc9e981b0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:28 +0200 Subject: [PATCH] Support patchable lock prefix for pure assembly files Signed-off-by: Andi Kleen --- include/asm-x86_64/alternative-asm.i | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 include/asm-x86_64/alternative-asm.i diff --git a/include/asm-x86_64/alternative-asm.i b/include/asm-x86_64/alternative-asm.i new file mode 100644 index 000000000000..e4041f4fa4dc --- /dev/null +++ b/include/asm-x86_64/alternative-asm.i @@ -0,0 +1,14 @@ +#include + +#ifdef CONFIG_SMP + .macro LOCK_PREFIX +1: lock + .section .smp_locks,"a" + .align 8 + .quad 1b + .previous + .endm +#else + .macro LOCK_PREFIX + .endm +#endif -- cgit v1.2.1 From 9a0b26e6bc4ae1979d9bcc6194e57a71b2b5cac6 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:28 +0200 Subject: [PATCH] Clean up read write lock assembly - Move the slow path fallbacks to their own assembly files This makes them much easier to read and is needed for the next change. - Add CFI annotations for unwinding (XXX need review) - Remove constant case which can never happen with out of line spinlocks - Use patchable LOCK prefixes - Don't use lock sections anymore for inline code because they can't be expressed by the unwinder (this adds one taken jump to the lock fast path) Cc: jbeulich@novell.com Signed-off-by: Andi Kleen --- arch/x86_64/lib/Makefile | 2 +- arch/x86_64/lib/rwlock.S | 38 ++++++++++++++++++++++++ arch/x86_64/lib/thunk.S | 30 ------------------- include/asm-x86_64/rwlock.h | 68 +++++++------------------------------------ include/asm-x86_64/spinlock.h | 11 ++----- 5 files changed, 51 insertions(+), 98 deletions(-) create mode 100644 arch/x86_64/lib/rwlock.S diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile index ccef6ae747a3..b78d4170fce2 100644 --- a/arch/x86_64/lib/Makefile +++ b/arch/x86_64/lib/Makefile @@ -9,4 +9,4 @@ obj-y := io.o iomap_copy.o lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \ usercopy.o getuser.o putuser.o \ thunk.o clear_page.o copy_page.o bitstr.o bitops.o -lib-y += memcpy.o memmove.o memset.o copy_user.o +lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o diff --git a/arch/x86_64/lib/rwlock.S b/arch/x86_64/lib/rwlock.S new file mode 100644 index 000000000000..0cde1f807314 --- /dev/null +++ b/arch/x86_64/lib/rwlock.S @@ -0,0 +1,38 @@ +/* Slow paths of read/write spinlocks. */ + +#include +#include +#include +#include + +/* rdi: pointer to rwlock_t */ +ENTRY(__write_lock_failed) + CFI_STARTPROC + LOCK_PREFIX + addl $RW_LOCK_BIAS,(%rdi) +1: rep + nop + cmpl $RW_LOCK_BIAS,(%rdi) + jne 1b + LOCK_PREFIX + subl $RW_LOCK_BIAS,(%rdi) + jnz __write_lock_failed + ret + CFI_ENDPROC +END(__write_lock_failed) + +/* rdi: pointer to rwlock_t */ +ENTRY(__read_lock_failed) + CFI_STARTPROC + LOCK_PREFIX + incl (%rdi) +1: rep + nop + cmpl $1,(%rdi) + js 1b + LOCK_PREFIX + decl (%rdi) + js __read_lock_failed + ret + CFI_ENDPROC +END(__read_lock_failed) diff --git a/arch/x86_64/lib/thunk.S b/arch/x86_64/lib/thunk.S index 332ea5dff916..6cff27c775ae 100644 --- a/arch/x86_64/lib/thunk.S +++ b/arch/x86_64/lib/thunk.S @@ -67,33 +67,3 @@ restore_norax: RESTORE_ARGS 1 ret CFI_ENDPROC - -#ifdef CONFIG_SMP -/* Support for read/write spinlocks. */ - .text -/* rax: pointer to rwlock_t */ -ENTRY(__write_lock_failed) - lock - addl $RW_LOCK_BIAS,(%rax) -1: rep - nop - cmpl $RW_LOCK_BIAS,(%rax) - jne 1b - lock - subl $RW_LOCK_BIAS,(%rax) - jnz __write_lock_failed - ret - -/* rax: pointer to rwlock_t */ -ENTRY(__read_lock_failed) - lock - incl (%rax) -1: rep - nop - cmpl $1,(%rax) - js 1b - lock - decl (%rax) - js __read_lock_failed - ret -#endif diff --git a/include/asm-x86_64/rwlock.h b/include/asm-x86_64/rwlock.h index dea0e9459264..28a080d23119 100644 --- a/include/asm-x86_64/rwlock.h +++ b/include/asm-x86_64/rwlock.h @@ -18,69 +18,21 @@ #ifndef _ASM_X86_64_RWLOCK_H #define _ASM_X86_64_RWLOCK_H -#include - #define RW_LOCK_BIAS 0x01000000 -#define RW_LOCK_BIAS_STR "0x01000000" +#define RW_LOCK_BIAS_STR "0x01000000" -#define __build_read_lock_ptr(rw, helper) \ +#define __build_read_lock(rw) \ asm volatile(LOCK_PREFIX "subl $1,(%0)\n\t" \ - "js 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tcall " helper "\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - ::"a" (rw) : "memory") - -#define __build_read_lock_const(rw, helper) \ - asm volatile(LOCK_PREFIX "subl $1,%0\n\t" \ - "js 2f\n" \ + "jns 1f\n" \ + "call __read_lock_failed\n" \ "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tpushq %%rax\n\t" \ - "leaq %0,%%rax\n\t" \ - "call " helper "\n\t" \ - "popq %%rax\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - :"=m" (*((volatile int *)rw))::"memory") - -#define __build_read_lock(rw, helper) do { \ - if (__builtin_constant_p(rw)) \ - __build_read_lock_const(rw, helper); \ - else \ - __build_read_lock_ptr(rw, helper); \ - } while (0) + ::"D" (rw), "i" (RW_LOCK_BIAS) : "memory") -#define __build_write_lock_ptr(rw, helper) \ - asm volatile(LOCK_PREFIX "subl $" RW_LOCK_BIAS_STR ",(%0)\n\t" \ - "jnz 2f\n" \ +#define __build_write_lock(rw) \ + asm volatile(LOCK_PREFIX "subl %1,(%0)\n\t" \ + "jz 1f\n" \ + "\tcall __write_lock_failed\n\t" \ "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tcall " helper "\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - ::"a" (rw) : "memory") - -#define __build_write_lock_const(rw, helper) \ - asm volatile(LOCK_PREFIX "subl $" RW_LOCK_BIAS_STR ",%0\n\t" \ - "jnz 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tpushq %%rax\n\t" \ - "leaq %0,%%rax\n\t" \ - "call " helper "\n\t" \ - "popq %%rax\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - :"=m" (*((volatile long *)rw))::"memory") - -#define __build_write_lock(rw, helper) do { \ - if (__builtin_constant_p(rw)) \ - __build_write_lock_const(rw, helper); \ - else \ - __build_write_lock_ptr(rw, helper); \ - } while (0) + ::"D" (rw), "i" (RW_LOCK_BIAS) : "memory") #endif diff --git a/include/asm-x86_64/spinlock.h b/include/asm-x86_64/spinlock.h index 248a79f0eaff..a8e3d89f591f 100644 --- a/include/asm-x86_64/spinlock.h +++ b/include/asm-x86_64/spinlock.h @@ -79,13 +79,6 @@ static inline void __raw_spin_unlock(raw_spinlock_t *lock) * * On x86, we implement read-write locks as a 32-bit counter * with the high bit (sign) being the "contended" bit. - * - * The inline assembly is non-obvious. Think about it. - * - * Changed to use the same technique as rw semaphores. See - * semaphore.h for details. -ben - * - * the helpers are in arch/i386/kernel/semaphore.c */ #define __raw_read_can_lock(x) ((int)(x)->lock > 0) @@ -93,12 +86,12 @@ static inline void __raw_spin_unlock(raw_spinlock_t *lock) static inline void __raw_read_lock(raw_rwlock_t *rw) { - __build_read_lock(rw, "__read_lock_failed"); + __build_read_lock(rw); } static inline void __raw_write_lock(raw_rwlock_t *rw) { - __build_write_lock(rw, "__write_lock_failed"); + __build_write_lock(rw); } static inline int __raw_read_trylock(raw_rwlock_t *lock) -- cgit v1.2.1 From 1a015b5644ec6df0a2c4cbeff1f8a3d24ba0478e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:29 +0200 Subject: [PATCH] i386: Remove const case for rwlocks rwlocks are now out of line, so it near never triggers. Also it was incompatible with the new dwarf2 unwinder because it had unannotiatable push/pops. Signed-off-by: Andi Kleen --- include/asm-i386/rwlock.h | 38 ++------------------------------------ 1 file changed, 2 insertions(+), 36 deletions(-) diff --git a/include/asm-i386/rwlock.h b/include/asm-i386/rwlock.h index 87c069ccba08..f40ccbd8cb7f 100644 --- a/include/asm-i386/rwlock.h +++ b/include/asm-i386/rwlock.h @@ -20,52 +20,18 @@ #define RW_LOCK_BIAS 0x01000000 #define RW_LOCK_BIAS_STR "0x01000000" -#define __build_read_lock_ptr(rw, helper) \ +#define __build_read_lock(rw, helper) \ asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t" \ "jns 1f\n" \ "call " helper "\n\t" \ "1:\n" \ ::"a" (rw) : "memory") -#define __build_read_lock_const(rw, helper) \ - asm volatile(LOCK_PREFIX " subl $1,%0\n\t" \ - "jns 1f\n" \ - "pushl %%eax\n\t" \ - "leal %0,%%eax\n\t" \ - "call " helper "\n\t" \ - "popl %%eax\n\t" \ - "1:\n" \ - :"+m" (*(volatile int *)rw) : : "memory") - -#define __build_read_lock(rw, helper) do { \ - if (__builtin_constant_p(rw)) \ - __build_read_lock_const(rw, helper); \ - else \ - __build_read_lock_ptr(rw, helper); \ - } while (0) - -#define __build_write_lock_ptr(rw, helper) \ +#define __build_write_lock(rw, helper) \ asm volatile(LOCK_PREFIX " subl $" RW_LOCK_BIAS_STR ",(%0)\n\t" \ "jz 1f\n" \ "call " helper "\n\t" \ "1:\n" \ ::"a" (rw) : "memory") -#define __build_write_lock_const(rw, helper) \ - asm volatile(LOCK_PREFIX " subl $" RW_LOCK_BIAS_STR ",%0\n\t" \ - "jz 1f\n" \ - "pushl %%eax\n\t" \ - "leal %0,%%eax\n\t" \ - "call " helper "\n\t" \ - "popl %%eax\n\t" \ - "1:\n" \ - :"+m" (*(volatile int *)rw) : : "memory") - -#define __build_write_lock(rw, helper) do { \ - if (__builtin_constant_p(rw)) \ - __build_write_lock_const(rw, helper); \ - else \ - __build_write_lock_ptr(rw, helper); \ - } while (0) - #endif -- cgit v1.2.1 From b06babac45e1546dfb504f1f25eb0495632bfc41 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:29 +0200 Subject: [PATCH] Add proper alignment to ENTRY Previously it didn't align. Use the same one as the C compiler in blended mode, which is good for K8 and Core2 and doesn't hurt on P4. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/entry.S | 3 +-- include/asm-x86_64/linkage.h | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 7eb1678e098d..2dc5c01f754d 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -617,8 +617,7 @@ retint_signal: #ifdef CONFIG_PREEMPT /* Returning to kernel space. Check if we need preemption */ /* rcx: threadinfo. interrupts off. */ - .p2align -retint_kernel: +ENTRY(retint_kernel) cmpl $0,threadinfo_preempt_count(%rcx) jnz retint_restore_args bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx) diff --git a/include/asm-x86_64/linkage.h b/include/asm-x86_64/linkage.h index 291c2d01c44f..b5f39d0189ce 100644 --- a/include/asm-x86_64/linkage.h +++ b/include/asm-x86_64/linkage.h @@ -1,6 +1,6 @@ #ifndef __ASM_LINKAGE_H #define __ASM_LINKAGE_H -/* Nothing to see here... */ +#define __ALIGN .p2align 4,,15 #endif -- cgit v1.2.1 From 07c9819b31eda7954feddc83f2fae035f31c11e1 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:29 +0200 Subject: [PATCH] i386: add alternative-asm.h to allow LOCK_PREFIX replacement in .S files LOCK_PREFIX is replaced by nops on UP systems, so it has to be a special macro. Previously this was only possible from C. Allow it for pure assembly files too. Similar to earlier x86-64 patch. Signed-off-by: Andi Kleen --- include/asm-i386/alternative-asm.i | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 include/asm-i386/alternative-asm.i diff --git a/include/asm-i386/alternative-asm.i b/include/asm-i386/alternative-asm.i new file mode 100644 index 000000000000..6c47e3b9484b --- /dev/null +++ b/include/asm-i386/alternative-asm.i @@ -0,0 +1,14 @@ +#include + +#ifdef CONFIG_SMP + .macro LOCK_PREFIX +1: lock + .section .smp_locks,"a" + .align 4 + .long 1b + .previous + .endm +#else + .macro LOCK_PREFIX + .endm +#endif -- cgit v1.2.1 From ecaf45ee5ce60afe7cc46e91d82c1b0cbda09387 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:29 +0200 Subject: [PATCH] i386: Redo semaphore and rwlock assembly helpers - Move them to a pure assembly file. Previously they were in a C file that only consisted of inline assembly. Doing it in pure assembler is much nicer. - Add a frame.i include with FRAME/ENDFRAME macros to easily add frame pointers to assembly functions - Add dwarf2 annotation to them so that the new dwarf2 unwinder doesn't get stuck on them - Random cleanups Includes feedback from Jan Beulich and a UML build fix from Andrew Morton. Cc: jbeulich@novell.com Cc: jdike@addtoit.com Signed-off-by: Andi Kleen --- arch/i386/kernel/Makefile | 2 +- arch/i386/kernel/semaphore.c | 134 ---------------------------------- arch/i386/lib/Makefile | 2 +- arch/i386/lib/semaphore.S | 154 +++++++++++++++++++++++++++++++++++++++ arch/um/sys-i386/Makefile | 2 +- include/asm-i386/frame.i | 24 ++++++ include/asm-um/alternative-asm.i | 6 ++ include/asm-um/frame.i | 6 ++ 8 files changed, 193 insertions(+), 137 deletions(-) delete mode 100644 arch/i386/kernel/semaphore.c create mode 100644 arch/i386/lib/semaphore.S create mode 100644 include/asm-i386/frame.i create mode 100644 include/asm-um/alternative-asm.i create mode 100644 include/asm-um/frame.i diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index 5427a842e841..dab497472deb 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -4,7 +4,7 @@ extra-y := head.o init_task.o vmlinux.lds -obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \ +obj-y := process.o signal.o entry.o traps.o irq.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \ pci-dma.o i386_ksyms.o i387.o bootflag.o \ quirks.o i8237.o topology.o alternative.o i8253.o tsc.o diff --git a/arch/i386/kernel/semaphore.c b/arch/i386/kernel/semaphore.c deleted file mode 100644 index 98352c374c76..000000000000 --- a/arch/i386/kernel/semaphore.c +++ /dev/null @@ -1,134 +0,0 @@ -/* - * i386 semaphore implementation. - * - * (C) Copyright 1999 Linus Torvalds - * - * Portions Copyright 1999 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * rw semaphores implemented November 1999 by Benjamin LaHaise - */ -#include - -/* - * The semaphore operations have a special calling sequence that - * allow us to do a simpler in-line version of them. These routines - * need to convert that sequence back into the C sequence when - * there is contention on the semaphore. - * - * %eax contains the semaphore pointer on entry. Save the C-clobbered - * registers (%eax, %edx and %ecx) except %eax whish is either a return - * value or just clobbered.. - */ -asm( -".section .sched.text\n" -".align 4\n" -".globl __down_failed\n" -"__down_failed:\n\t" -#if defined(CONFIG_FRAME_POINTER) - "pushl %ebp\n\t" - "movl %esp,%ebp\n\t" -#endif - "pushl %edx\n\t" - "pushl %ecx\n\t" - "call __down\n\t" - "popl %ecx\n\t" - "popl %edx\n\t" -#if defined(CONFIG_FRAME_POINTER) - "movl %ebp,%esp\n\t" - "popl %ebp\n\t" -#endif - "ret" -); - -asm( -".section .sched.text\n" -".align 4\n" -".globl __down_failed_interruptible\n" -"__down_failed_interruptible:\n\t" -#if defined(CONFIG_FRAME_POINTER) - "pushl %ebp\n\t" - "movl %esp,%ebp\n\t" -#endif - "pushl %edx\n\t" - "pushl %ecx\n\t" - "call __down_interruptible\n\t" - "popl %ecx\n\t" - "popl %edx\n\t" -#if defined(CONFIG_FRAME_POINTER) - "movl %ebp,%esp\n\t" - "popl %ebp\n\t" -#endif - "ret" -); - -asm( -".section .sched.text\n" -".align 4\n" -".globl __down_failed_trylock\n" -"__down_failed_trylock:\n\t" -#if defined(CONFIG_FRAME_POINTER) - "pushl %ebp\n\t" - "movl %esp,%ebp\n\t" -#endif - "pushl %edx\n\t" - "pushl %ecx\n\t" - "call __down_trylock\n\t" - "popl %ecx\n\t" - "popl %edx\n\t" -#if defined(CONFIG_FRAME_POINTER) - "movl %ebp,%esp\n\t" - "popl %ebp\n\t" -#endif - "ret" -); - -asm( -".section .sched.text\n" -".align 4\n" -".globl __up_wakeup\n" -"__up_wakeup:\n\t" - "pushl %edx\n\t" - "pushl %ecx\n\t" - "call __up\n\t" - "popl %ecx\n\t" - "popl %edx\n\t" - "ret" -); - -/* - * rw spinlock fallbacks - */ -#if defined(CONFIG_SMP) -asm( -".section .sched.text\n" -".align 4\n" -".globl __write_lock_failed\n" -"__write_lock_failed:\n\t" - LOCK_PREFIX "addl $" RW_LOCK_BIAS_STR ",(%eax)\n" -"1: rep; nop\n\t" - "cmpl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" - "jne 1b\n\t" - LOCK_PREFIX "subl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" - "jnz __write_lock_failed\n\t" - "ret" -); - -asm( -".section .sched.text\n" -".align 4\n" -".globl __read_lock_failed\n" -"__read_lock_failed:\n\t" - LOCK_PREFIX "incl (%eax)\n" -"1: rep; nop\n\t" - "cmpl $1,(%eax)\n\t" - "js 1b\n\t" - LOCK_PREFIX "decl (%eax)\n\t" - "js __read_lock_failed\n\t" - "ret" -); -#endif diff --git a/arch/i386/lib/Makefile b/arch/i386/lib/Makefile index 914933e9ec3d..d86a548b8d54 100644 --- a/arch/i386/lib/Makefile +++ b/arch/i386/lib/Makefile @@ -4,6 +4,6 @@ lib-y = checksum.o delay.o usercopy.o getuser.o putuser.o memcpy.o strstr.o \ - bitops.o + bitops.o semaphore.o lib-$(CONFIG_X86_USE_3DNOW) += mmx.o diff --git a/arch/i386/lib/semaphore.S b/arch/i386/lib/semaphore.S new file mode 100644 index 000000000000..e16ff1696390 --- /dev/null +++ b/arch/i386/lib/semaphore.S @@ -0,0 +1,154 @@ +/* + * i386 semaphore implementation. + * + * (C) Copyright 1999 Linus Torvalds + * + * Portions Copyright 1999 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * rw semaphores implemented November 1999 by Benjamin LaHaise + */ + +#include +#include +#include +#include +#include +#include + +/* + * The semaphore operations have a special calling sequence that + * allow us to do a simpler in-line version of them. These routines + * need to convert that sequence back into the C sequence when + * there is contention on the semaphore. + * + * %eax contains the semaphore pointer on entry. Save the C-clobbered + * registers (%eax, %edx and %ecx) except %eax whish is either a return + * value or just clobbered.. + */ + .section .sched.text +ENTRY(__down_failed) + CFI_STARTPROC + FRAME + pushl %edx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edx,0 + pushl %ecx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ecx,0 + call __down + popl %ecx + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE ecx + popl %edx + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE edx + ENDFRAME + ret + CFI_ENDPROC + END(__down_failed) + +ENTRY(__down_failed_interruptible) + CFI_STARTPROC + FRAME + pushl %edx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edx,0 + pushl %ecx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ecx,0 + call __down_interruptible + popl %ecx + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE ecx + popl %edx + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE edx + ENDFRAME + ret + CFI_ENDPROC + END(__down_failed_interruptible) + +ENTRY(__down_failed_trylock) + CFI_STARTPROC + FRAME + pushl %edx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edx,0 + pushl %ecx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ecx,0 + call __down_trylock + popl %ecx + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE ecx + popl %edx + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE edx + ENDFRAME + ret + CFI_ENDPROC + END(__down_failed_trylock) + +ENTRY(__up_wakeup) + CFI_STARTPROC + FRAME + pushl %edx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edx,0 + pushl %ecx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ecx,0 + call __up + popl %ecx + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE ecx + popl %edx + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE edx + ENDFRAME + ret + CFI_ENDPROC + END(__up_wakeup) + +/* + * rw spinlock fallbacks + */ +#ifdef CONFIG_SMP +ENTRY(__write_lock_failed) + CFI_STARTPROC simple + FRAME +2: LOCK_PREFIX + addl $ RW_LOCK_BIAS,(%eax) +1: rep; nop + cmpl $ RW_LOCK_BIAS,(%eax) + jne 1b + LOCK_PREFIX + subl $ RW_LOCK_BIAS,(%eax) + jnz 2b + ENDFRAME + ret + CFI_ENDPROC + END(__write_lock_failed) + +ENTRY(__read_lock_failed) + CFI_STARTPROC + FRAME +2: LOCK_PREFIX + incl (%eax) +1: rep; nop + cmpl $1,(%eax) + js 1b + LOCK_PREFIX + decl (%eax) + js 2b + ENDFRAME + ret + CFI_ENDPROC + END(__read_lock_failed) + +#endif diff --git a/arch/um/sys-i386/Makefile b/arch/um/sys-i386/Makefile index 374d61a19439..6289779ca67c 100644 --- a/arch/um/sys-i386/Makefile +++ b/arch/um/sys-i386/Makefile @@ -4,7 +4,7 @@ obj-y = bugs.o checksum.o delay.o fault.o ksyms.o ldt.o ptrace.o \ obj-$(CONFIG_MODE_SKAS) += stub.o stub_segv.o -subarch-obj-y = lib/bitops.o kernel/semaphore.o +subarch-obj-y = lib/bitops.o lib/semaphore.o subarch-obj-$(CONFIG_HIGHMEM) += mm/highmem.o subarch-obj-$(CONFIG_MODULES) += kernel/module.o diff --git a/include/asm-i386/frame.i b/include/asm-i386/frame.i new file mode 100644 index 000000000000..4d68ddce18b6 --- /dev/null +++ b/include/asm-i386/frame.i @@ -0,0 +1,24 @@ +#include +#include + +/* The annotation hides the frame from the unwinder and makes it look + like a ordinary ebp save/restore. This avoids some special cases for + frame pointer later */ +#ifdef CONFIG_FRAME_POINTER + .macro FRAME + pushl %ebp + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebp,0 + movl %esp,%ebp + .endm + .macro ENDFRAME + popl %ebp + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE ebp + .endm +#else + .macro FRAME + .endm + .macro ENDFRAME + .endm +#endif diff --git a/include/asm-um/alternative-asm.i b/include/asm-um/alternative-asm.i new file mode 100644 index 000000000000..cae9faca132f --- /dev/null +++ b/include/asm-um/alternative-asm.i @@ -0,0 +1,6 @@ +#ifndef __UM_ALTERNATIVE_ASM_I +#define __UM_ALTERNATIVE_ASM_I + +#include "asm/arch/alternative-asm.i" + +#endif diff --git a/include/asm-um/frame.i b/include/asm-um/frame.i new file mode 100644 index 000000000000..09d5dca5d928 --- /dev/null +++ b/include/asm-um/frame.i @@ -0,0 +1,6 @@ +#ifndef __UM_FRAME_I +#define __UM_FRAME_I + +#include "asm/arch/frame.i" + +#endif -- cgit v1.2.1 From cc1e684a9f1cc069b2ac249aa11486613369dd69 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:29 +0200 Subject: [PATCH] Remove leftover CVS Id in thunk.S And move the comment to a proper place. Signed-off-by: Andi Kleen --- arch/x86_64/lib/thunk.S | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/x86_64/lib/thunk.S b/arch/x86_64/lib/thunk.S index 6cff27c775ae..0025535cac8d 100644 --- a/arch/x86_64/lib/thunk.S +++ b/arch/x86_64/lib/thunk.S @@ -1,10 +1,9 @@ - /* - * Save registers before calling assembly functions. This avoids - * disturbance of register allocation in some inline assembly constructs. - * Copyright 2001,2002 by Andi Kleen, SuSE Labs. - * Subject to the GNU public license, v.2. No warranty of any kind. - * $Id: thunk.S,v 1.2 2002/03/13 20:06:58 ak Exp $ - */ +/* + * Save registers before calling assembly functions. This avoids + * disturbance of register allocation in some inline assembly constructs. + * Copyright 2001,2002 by Andi Kleen, SuSE Labs. + * Subject to the GNU public license, v.2. No warranty of any kind. + */ #include #include -- cgit v1.2.1 From 5ba5891d44a6acade44887a0f3195489d46c12de Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:29 +0200 Subject: [PATCH] Add some comments what tce.c actually does Signed-off-by: Andi Kleen --- arch/x86_64/kernel/tce.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86_64/kernel/tce.c b/arch/x86_64/kernel/tce.c index 5530dda3f27a..0905ce6b9b32 100644 --- a/arch/x86_64/kernel/tce.c +++ b/arch/x86_64/kernel/tce.c @@ -1,4 +1,6 @@ /* + * This file manages the translation entries for the IBM Calgary IOMMU. + * * Derived from arch/powerpc/platforms/pseries/iommu.c * * Copyright (C) IBM Corporation, 2006 -- cgit v1.2.1 From 7f11d8a5efd625ffa41cde1d8472f85e885478ec Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:29 +0200 Subject: [PATCH] Remove all ifdefs for local/io apic IO-APIC or local APIC can only be disabled at runtime anyways and Kconfig has forced these options on for a long time now. The Kconfigs are kept only now for the benefit of the shared acpi boot.c code. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/Makefile | 4 ++-- arch/x86_64/kernel/apic.c | 6 +----- arch/x86_64/kernel/crash.c | 2 -- arch/x86_64/kernel/entry.S | 2 -- arch/x86_64/kernel/head64.c | 2 -- arch/x86_64/kernel/i8259.c | 9 --------- arch/x86_64/kernel/irq.c | 6 ------ arch/x86_64/kernel/mpparse.c | 9 --------- arch/x86_64/kernel/setup.c | 6 ------ arch/x86_64/kernel/smpboot.c | 5 ----- arch/x86_64/kernel/time.c | 8 -------- arch/x86_64/kernel/traps.c | 2 -- include/asm-x86_64/apic.h | 4 ---- include/asm-x86_64/fixmap.h | 4 ---- include/asm-x86_64/io_apic.h | 6 ------ include/asm-x86_64/irq.h | 2 -- include/asm-x86_64/mpspec.h | 2 -- include/asm-x86_64/smp.h | 4 ---- 18 files changed, 3 insertions(+), 80 deletions(-) diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index b5aaeafc1cd3..0ef39553c0f6 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -20,8 +20,8 @@ obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o -obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o -obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \ +obj-y += apic.o nmi.o +obj-y += io_apic.o mpparse.o \ genapic.o genapic_cluster.o genapic_flat.o obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 63d9b037afc6..c6ca100a7dea 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -661,7 +661,6 @@ void __init init_apic_mappings(void) */ boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); -#ifdef CONFIG_X86_IO_APIC { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; int i; @@ -679,7 +678,6 @@ void __init init_apic_mappings(void) idx++; } } -#endif } /* @@ -1129,12 +1127,10 @@ int __init APIC_init_uniprocessor (void) setup_local_APIC(); -#ifdef CONFIG_X86_IO_APIC if (smp_found_config && !skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); + setup_IO_APIC(); else nr_ioapics = 0; -#endif setup_boot_APIC_clock(); check_nmi_watchdog(); return 0; diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c index 7d7a9e75f70c..3525f884af82 100644 --- a/arch/x86_64/kernel/crash.c +++ b/arch/x86_64/kernel/crash.c @@ -194,9 +194,7 @@ void machine_crash_shutdown(struct pt_regs *regs) if(cpu_has_apic) disable_local_APIC(); -#if defined(CONFIG_X86_IO_APIC) disable_IO_APIC(); -#endif crash_save_self(regs); } diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 2dc5c01f754d..a0cf36ba7c56 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -676,7 +676,6 @@ ENTRY(call_function_interrupt) END(call_function_interrupt) #endif -#ifdef CONFIG_X86_LOCAL_APIC ENTRY(apic_timer_interrupt) apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt END(apic_timer_interrupt) @@ -688,7 +687,6 @@ END(error_interrupt) ENTRY(spurious_interrupt) apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt END(spurious_interrupt) -#endif /* * Exception entry points. diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c index 36647ce6aecb..bacbd75c63ad 100644 --- a/arch/x86_64/kernel/head64.c +++ b/arch/x86_64/kernel/head64.c @@ -111,10 +111,8 @@ void __init x86_64_start_kernel(char * real_mode_data) if (s != NULL) numa_setup(s+5); #endif -#ifdef CONFIG_X86_IO_APIC if (strstr(saved_command_line, "disableapic")) disable_apic = 1; -#endif /* You need early console to see that */ if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE) panic("Kernel too big for kernel mapping\n"); diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c index 0434b1f8e3dd..d2f5acde5749 100644 --- a/arch/x86_64/kernel/i8259.c +++ b/arch/x86_64/kernel/i8259.c @@ -55,7 +55,6 @@ */ BUILD_16_IRQS(0x0) -#ifdef CONFIG_X86_LOCAL_APIC /* * The IO-APIC gives us many more interrupt sources. Most of these * are unused but an SMP system is supposed to have enough memory ... @@ -75,8 +74,6 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_15_IRQS(0xe) #endif -#endif - #undef BUILD_16_IRQS #undef BUILD_15_IRQS #undef BI @@ -100,7 +97,6 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) void (*interrupt[NR_IRQS])(void) = { IRQLIST_16(0x0), -#ifdef CONFIG_X86_IO_APIC IRQLIST_16(0x1), IRQLIST_16(0x2), IRQLIST_16(0x3), IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), @@ -110,7 +106,6 @@ void (*interrupt[NR_IRQS])(void) = { , IRQLIST_15(0xe) #endif -#endif }; #undef IRQ @@ -453,9 +448,7 @@ void __init init_ISA_irqs (void) { int i; -#ifdef CONFIG_X86_LOCAL_APIC init_bsp_APIC(); -#endif init_8259A(0); for (i = 0; i < NR_IRQS; i++) { @@ -581,14 +574,12 @@ void __init init_IRQ(void) set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); -#ifdef CONFIG_X86_LOCAL_APIC /* self generated IPI for local APIC timer */ set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); /* IPI vectors for APIC spurious and error interrupts */ set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); -#endif /* * Set the clock to HZ Hz, we already have a valid diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c index 5221a53e90c1..bfbcf92cfa4a 100644 --- a/arch/x86_64/kernel/irq.c +++ b/arch/x86_64/kernel/irq.c @@ -20,11 +20,9 @@ #include atomic_t irq_err_count; -#ifdef CONFIG_X86_IO_APIC #ifdef APIC_MISMATCH_DEBUG atomic_t irq_mis_count; #endif -#endif #ifdef CONFIG_DEBUG_STACKOVERFLOW /* @@ -92,17 +90,13 @@ skip: for_each_online_cpu(j) seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count); seq_putc(p, '\n'); -#ifdef CONFIG_X86_LOCAL_APIC seq_printf(p, "LOC: "); for_each_online_cpu(j) seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs); seq_putc(p, '\n'); -#endif seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); -#ifdef CONFIG_X86_IO_APIC #ifdef APIC_MISMATCH_DEBUG seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); -#endif #endif } return 0; diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index a1ab4197f8a1..c73dd1f41e65 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -74,14 +74,10 @@ physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE; /* ACPI MADT entry parsing functions */ #ifdef CONFIG_ACPI extern struct acpi_boot_flags acpi_boot; -#ifdef CONFIG_X86_LOCAL_APIC extern int acpi_parse_lapic (acpi_table_entry_header *header); extern int acpi_parse_lapic_addr_ovr (acpi_table_entry_header *header); extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header); -#endif /*CONFIG_X86_LOCAL_APIC*/ -#ifdef CONFIG_X86_IO_APIC extern int acpi_parse_ioapic (acpi_table_entry_header *header); -#endif /*CONFIG_X86_IO_APIC*/ #endif /*CONFIG_ACPI*/ u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; @@ -661,9 +657,7 @@ void __init find_intel_smp (void) */ void __init find_smp_config (void) { -#ifdef CONFIG_X86_LOCAL_APIC find_intel_smp(); -#endif } @@ -717,8 +711,6 @@ void __cpuinit mp_register_lapic ( MP_processor_info(&processor); } -#ifdef CONFIG_X86_IO_APIC - #define MP_ISA_BUS 0 #define MP_MAX_IOAPIC_PIN 127 @@ -997,5 +989,4 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity) return gsi; } -#endif /*CONFIG_X86_IO_APIC*/ #endif /*CONFIG_ACPI*/ diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 34afad704824..fbe9f7faa2db 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -340,10 +340,8 @@ static __init void parse_cmdline_early (char ** cmdline_p) else if (fullarg(from, "acpi=strict")) { acpi_strict = 1; } -#ifdef CONFIG_X86_IO_APIC else if (fullarg(from, "acpi_skip_timer_override")) acpi_skip_timer_override = 1; -#endif #endif if (fullarg(from, "disable_timer_pin_1")) @@ -625,12 +623,10 @@ void __init setup_arch(char **cmdline_p) */ acpi_reserve_bootmem(); #endif -#ifdef CONFIG_X86_LOCAL_APIC /* * Find and reserve possible boot-time SMP configuration: */ find_smp_config(); -#endif #ifdef CONFIG_BLK_DEV_INITRD if (LOADER_TYPE && INITRD_START) { if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) { @@ -674,14 +670,12 @@ void __init setup_arch(char **cmdline_p) init_cpu_to_node(); -#ifdef CONFIG_X86_LOCAL_APIC /* * get boot-time SMP configuration: */ if (smp_found_config) get_smp_config(); init_apic_mappings(); -#endif /* * Request address space for all standard RAM and ROM resources diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 582896f7d420..d29571e24953 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -1175,13 +1175,8 @@ int __cpuinit __cpu_up(unsigned int cpu) void __init smp_cpus_done(unsigned int max_cpus) { smp_cleanup_boot(); - -#ifdef CONFIG_X86_IO_APIC setup_ioapic_dest(); -#endif - check_nmi_watchdog(); - time_init_gtod(); } diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index ea00915d393a..d66c7f750e75 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -41,9 +41,7 @@ #include #include #include -#ifdef CONFIG_X86_LOCAL_APIC #include -#endif #ifdef CONFIG_CPU_FREQ static void cpufreq_delayed_get(void); @@ -438,12 +436,8 @@ void main_timer_handler(struct pt_regs *regs) * have to call the local interrupt handler. */ -#ifndef CONFIG_X86_LOCAL_APIC - profile_tick(CPU_PROFILING, regs); -#else if (!using_apic_timer) smp_local_timer_interrupt(regs); -#endif /* * If we have an externally synchronized Linux clock, then update CMOS clock @@ -467,10 +461,8 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) if (apic_runs_main_timer > 1) return IRQ_HANDLED; main_timer_handler(regs); -#ifdef CONFIG_X86_LOCAL_APIC if (using_apic_timer) smp_send_timer_broadcast_ipi(); -#endif return IRQ_HANDLED; } diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 96f62a033242..56d7ff0c894c 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -788,7 +788,6 @@ asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs) if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) == NOTIFY_STOP) return; -#ifdef CONFIG_X86_LOCAL_APIC /* * Ok, so this is none of the documented NMI sources, * so it must be the NMI watchdog. @@ -796,7 +795,6 @@ asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs) if (nmi_watchdog_tick(regs,reason)) return; if (!do_nmi_callback(regs,cpu)) -#endif unknown_nmi_error(reason, regs); return; diff --git a/include/asm-x86_64/apic.h b/include/asm-x86_64/apic.h index 9c96a0a8d1bd..8ed0f4d67b8d 100644 --- a/include/asm-x86_64/apic.h +++ b/include/asm-x86_64/apic.h @@ -29,8 +29,6 @@ extern int apic_runs_main_timer; printk(s, ##a); \ } while (0) -#ifdef CONFIG_X86_LOCAL_APIC - struct pt_regs; /* @@ -104,8 +102,6 @@ void switch_ipi_to_APIC_timer(void *cpumask); #define ARCH_APICTIMER_STOPS_ON_C3 1 -#endif /* CONFIG_X86_LOCAL_APIC */ - extern unsigned boot_cpu_id; #endif /* __ASM_APIC_H */ diff --git a/include/asm-x86_64/fixmap.h b/include/asm-x86_64/fixmap.h index 0b4ffbd1a125..1b620db5b9e3 100644 --- a/include/asm-x86_64/fixmap.h +++ b/include/asm-x86_64/fixmap.h @@ -37,13 +37,9 @@ enum fixed_addresses { VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1, VSYSCALL_HPET, FIX_HPET_BASE, -#ifdef CONFIG_X86_LOCAL_APIC FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ -#endif -#ifdef CONFIG_X86_IO_APIC FIX_IO_APIC_BASE_0, FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1, -#endif __end_of_fixed_addresses }; diff --git a/include/asm-x86_64/io_apic.h b/include/asm-x86_64/io_apic.h index fb7a0909a174..5d1b5c68e36e 100644 --- a/include/asm-x86_64/io_apic.h +++ b/include/asm-x86_64/io_apic.h @@ -10,8 +10,6 @@ * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar */ -#ifdef CONFIG_X86_IO_APIC - #ifdef CONFIG_PCI_MSI static inline int use_pci_vector(void) {return 1;} static inline void disable_edge_ioapic_vector(unsigned int vector) { } @@ -209,10 +207,6 @@ extern int timer_uses_ioapic_pin_0; extern int sis_apic_bug; /* dummy */ -#else /* !CONFIG_X86_IO_APIC */ -#define io_apic_assign_pci_irqs 0 -#endif - extern int assign_irq_vector(int irq); void enable_NMI_through_LVT0 (void * dummy); diff --git a/include/asm-x86_64/irq.h b/include/asm-x86_64/irq.h index 9db5a1b4f7b1..43469d8ab71a 100644 --- a/include/asm-x86_64/irq.h +++ b/include/asm-x86_64/irq.h @@ -44,9 +44,7 @@ static __inline__ int irq_canonicalize(int irq) return ((irq == 2) ? 9 : irq); } -#ifdef CONFIG_X86_LOCAL_APIC #define ARCH_HAS_NMI_WATCHDOG /* See include/linux/nmi.h */ -#endif #ifdef CONFIG_HOTPLUG_CPU #include diff --git a/include/asm-x86_64/mpspec.h b/include/asm-x86_64/mpspec.h index 14fc3ddd9031..3c7aae9fd157 100644 --- a/include/asm-x86_64/mpspec.h +++ b/include/asm-x86_64/mpspec.h @@ -184,12 +184,10 @@ extern int pic_mode; extern void mp_register_lapic (u8 id, u8 enabled); extern void mp_register_lapic_address (u64 address); -#ifdef CONFIG_X86_IO_APIC extern void mp_register_ioapic (u8 id, u32 address, u32 gsi_base); extern void mp_override_legacy_irq (u8 bus_irq, u8 polarity, u8 trigger, u32 gsi); extern void mp_config_acpi_legacy_irqs (void); extern int mp_register_gsi (u32 gsi, int triggering, int polarity); -#endif /*CONFIG_X86_IO_APIC*/ #endif extern int using_apic_timer; diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h index 612d208961a2..6353fa41bebe 100644 --- a/include/asm-x86_64/smp.h +++ b/include/asm-x86_64/smp.h @@ -9,15 +9,11 @@ #include extern int disable_apic; -#ifdef CONFIG_X86_LOCAL_APIC #include #include -#ifdef CONFIG_X86_IO_APIC #include -#endif #include #include -#endif #ifdef CONFIG_SMP -- cgit v1.2.1 From 6c96a29f20762e8ce40b674f906055d009c302ee Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:29 +0200 Subject: [PATCH] Remove apic mismatch counter Nobody has been setting the mismatch counter and the ifdef was never set so remove it. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/irq.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c index bfbcf92cfa4a..b3677e6ccc6e 100644 --- a/arch/x86_64/kernel/irq.c +++ b/arch/x86_64/kernel/irq.c @@ -20,9 +20,6 @@ #include atomic_t irq_err_count; -#ifdef APIC_MISMATCH_DEBUG -atomic_t irq_mis_count; -#endif #ifdef CONFIG_DEBUG_STACKOVERFLOW /* @@ -95,9 +92,6 @@ skip: seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs); seq_putc(p, '\n'); seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); -#ifdef APIC_MISMATCH_DEBUG - seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); -#endif } return 0; } -- cgit v1.2.1 From 3f14c746a61ec932c204aca820c02c293118c5df Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:29 +0200 Subject: [PATCH] Remove old "focus disabled" chipset errata workaround The new systems already use focus disabled and the comment was completely outdated. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/apic.c | 28 ++-------------------------- 1 file changed, 2 insertions(+), 26 deletions(-) diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index c6ca100a7dea..78c43b1a3064 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -399,32 +399,8 @@ void __cpuinit setup_local_APIC (void) */ value |= APIC_SPIV_APIC_ENABLED; - /* - * Some unknown Intel IO/APIC (or APIC) errata are biting us with - * certain networking cards. If high frequency interrupts are - * happening on a particular IOAPIC pin, plus the IOAPIC routing - * entry is masked/unmasked at a high rate as well then sooner or - * later IOAPIC line gets 'stuck', no more interrupts are received - * from the device. If focus CPU is disabled then the hang goes - * away, oh well :-( - * - * [ This bug can be reproduced easily with a level-triggered - * PCI Ne2000 networking cards and PII/PIII processors, dual - * BX chipset. ] - */ - /* - * Actually disabling the focus CPU check just makes the hang less - * frequent as it makes the interrupt distributon model be more - * like LRU than MRU (the short-term load is more even across CPUs). - * See also the comment in end_level_ioapic_irq(). --macro - */ -#if 1 - /* Enable focus processor (bit==0) */ - value &= ~APIC_SPIV_FOCUS_DISABLED; -#else - /* Disable focus processor (bit==1) */ - value |= APIC_SPIV_FOCUS_DISABLED; -#endif + /* We always use processor focus */ + /* * Set spurious IRQ vector */ -- cgit v1.2.1 From b1c78c0fcc29097567e1afc39701012e6d89adb7 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:29 +0200 Subject: [PATCH] Clean up and minor fixes to TLB flush - Convert CR* accesses to dedicated inline functions and rewrite the rest as C inlines - Don't do a double flush for global flushes (pointed out by Zach Amsden) This was a bug workaround for old CPUs that don't do 64bit and is obsolete. - Add a proper memory clobber to invlpg - Remove an unused extern Signed-off-by: Andi Kleen --- include/asm-x86_64/pgtable.h | 2 -- include/asm-x86_64/tlbflush.h | 70 +++++++++++++++++++++---------------------- 2 files changed, 35 insertions(+), 37 deletions(-) diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h index a31ab4e68a9b..0c1e2422400a 100644 --- a/include/asm-x86_64/pgtable.h +++ b/include/asm-x86_64/pgtable.h @@ -25,8 +25,6 @@ extern int nonx_setup(char *str); extern void paging_init(void); extern void clear_kernel_mapping(unsigned long addr, unsigned long size); -extern unsigned long pgkern_mask; - /* * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. diff --git a/include/asm-x86_64/tlbflush.h b/include/asm-x86_64/tlbflush.h index d16d5b60f419..983bd296c81a 100644 --- a/include/asm-x86_64/tlbflush.h +++ b/include/asm-x86_64/tlbflush.h @@ -4,44 +4,44 @@ #include #include -#define __flush_tlb() \ - do { \ - unsigned long tmpreg; \ - \ - __asm__ __volatile__( \ - "movq %%cr3, %0; # flush TLB \n" \ - "movq %0, %%cr3; \n" \ - : "=r" (tmpreg) \ - :: "memory"); \ - } while (0) +static inline unsigned long get_cr3(void) +{ + unsigned long cr3; + asm volatile("mov %%cr3,%0" : "=r" (cr3)); + return cr3; +} -/* - * Global pages have to be flushed a bit differently. Not a real - * performance problem because this does not happen often. - */ -#define __flush_tlb_global() \ - do { \ - unsigned long tmpreg, cr4, cr4_orig; \ - \ - __asm__ __volatile__( \ - "movq %%cr4, %2; # turn off PGE \n" \ - "movq %2, %1; \n" \ - "andq %3, %1; \n" \ - "movq %1, %%cr4; \n" \ - "movq %%cr3, %0; # flush TLB \n" \ - "movq %0, %%cr3; \n" \ - "movq %2, %%cr4; # turn PGE back on \n" \ - : "=&r" (tmpreg), "=&r" (cr4), "=&r" (cr4_orig) \ - : "i" (~X86_CR4_PGE) \ - : "memory"); \ - } while (0) - -extern unsigned long pgkern_mask; - -#define __flush_tlb_all() __flush_tlb_global() +static inline void set_cr3(unsigned long cr3) +{ + asm volatile("mov %0,%%cr3" :: "r" (cr3) : "memory"); +} + +static inline void __flush_tlb(void) +{ + set_cr3(get_cr3()); +} + +static inline unsigned long get_cr4(void) +{ + unsigned long cr4; + asm volatile("mov %%cr4,%0" : "=r" (cr4)); + return cr4; +} + +static inline void set_cr4(unsigned long cr4) +{ + asm volatile("mov %0,%%cr4" :: "r" (cr4) : "memory"); +} + +static inline void __flush_tlb_all(void) +{ + unsigned long cr4 = get_cr4(); + set_cr4(cr4 & ~X86_CR4_PGE); /* clear PGE */ + set_cr4(cr4); /* write old PGE again and flush TLBs */ +} #define __flush_tlb_one(addr) \ - __asm__ __volatile__("invlpg %0": :"m" (*(char *) addr)) + __asm__ __volatile__("invlpg (%0)" :: "r" (addr) : "memory") /* -- cgit v1.2.1 From 107878bb14b1fb6bddc646f4d5e72e8beaa2f6a2 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:29 +0200 Subject: [PATCH] i386: Minor fixes & cleanup to tlb flush (based on x86-64 changes) - Add a proper memory clobber to invlpg - Remove an unused extern Signed-off-by: Andi Kleen --- include/asm-i386/tlbflush.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/include/asm-i386/tlbflush.h b/include/asm-i386/tlbflush.h index d57ca5c540b6..360648b0f2b3 100644 --- a/include/asm-i386/tlbflush.h +++ b/include/asm-i386/tlbflush.h @@ -36,8 +36,6 @@ : "memory"); \ } while (0) -extern unsigned long pgkern_mask; - # define __flush_tlb_all() \ do { \ if (cpu_has_pge) \ @@ -49,7 +47,7 @@ extern unsigned long pgkern_mask; #define cpu_has_invlpg (boot_cpu_data.x86 > 3) #define __flush_tlb_single(addr) \ - __asm__ __volatile__("invlpg %0": :"m" (*(char *) addr)) + __asm__ __volatile__("invlpg (%0)" ::"r" (addr) : "memory") #ifdef CONFIG_X86_INVLPG # define __flush_tlb_one(addr) __flush_tlb_single(addr) -- cgit v1.2.1 From 2e91a17b35116885373e04af142b1d08cf1b47bf Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:29 +0200 Subject: [PATCH] Add some comments to entry.S And remove some old obsolete ones. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/entry.S | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index a0cf36ba7c56..a04fc4108ff2 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -22,9 +22,21 @@ * at the top of the kernel process stack. * - partial stack frame: partially saved registers upto R11. * - full stack frame: Like partial stack frame, but all register saved. - * - * TODO: - * - schedule it carefully for the final hardware. + * + * Some macro usage: + * - CFI macros are used to generate dwarf2 unwind information for better + * backtraces. They don't change any code. + * - SAVE_ALL/RESTORE_ALL - Save/restore all registers + * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify. + * There are unfortunately lots of special cases where some registers + * not touched. The macro is a big mess that should be cleaned up. + * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS. + * Gives a full stack frame. + * - ENTRY/END Define functions in the symbol table. + * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack + * frame that is otherwise undefined after a SYSCALL + * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. + * - errorentry/paranoidentry/zeroentry - Define exception entry points. */ #include -- cgit v1.2.1 From 5cb6b99928a8f1a1bbdac7a16b88aef8cb64d432 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:29 +0200 Subject: [PATCH] Remove pirq overwrite support This was an old workaround for broken MP-BIOS. The user could specify overwrites on the command line. I've never seen it being used for anything on 64bit. So get rid of it for now. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/io_apic.c | 55 -------------------------------------------- 1 file changed, 55 deletions(-) diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 924a4a332954..86cc07caa0e5 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -225,14 +225,6 @@ static void clear_IO_APIC (void) clear_IO_APIC_pin(apic, pin); } -/* - * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to - * specific CPU-side IRQs. - */ - -#define MAX_PIRQS 8 -static int pirq_entries [MAX_PIRQS]; -static int pirqs_enabled; int skip_ioapic_setup; int ioapic_force; @@ -370,34 +362,6 @@ void __init check_ioapic(void) } } -static int __init ioapic_pirq_setup(char *str) -{ - int i, max; - int ints[MAX_PIRQS+1]; - - get_options(str, ARRAY_SIZE(ints), ints); - - for (i = 0; i < MAX_PIRQS; i++) - pirq_entries[i] = -1; - - pirqs_enabled = 1; - apic_printk(APIC_VERBOSE, "PIRQ redirection, working around broken MP-BIOS.\n"); - max = MAX_PIRQS; - if (ints[0] < MAX_PIRQS) - max = ints[0]; - - for (i = 0; i < max; i++) { - apic_printk(APIC_VERBOSE, "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); - /* - * PIRQs are mapped upside down, usually. - */ - pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; - } - return 1; -} - -__setup("pirq=", ioapic_pirq_setup); - /* * Find the IRQ entry number of a certain pin. */ @@ -793,22 +757,6 @@ static int pin_2_irq(int idx, int apic, int pin) } } BUG_ON(irq >= NR_IRQS); - - /* - * PCI IRQ command line redirection. Yes, limits are hardcoded. - */ - if ((pin >= 16) && (pin <= 23)) { - if (pirq_entries[pin-16] != -1) { - if (!pirq_entries[pin-16]) { - apic_printk(APIC_VERBOSE, "disabling PIRQ%d\n", pin-16); - } else { - irq = pirq_entries[pin-16]; - apic_printk(APIC_VERBOSE, "using PIRQ%d -> IRQ %d\n", - pin-16, irq); - } - } - } - BUG_ON(irq >= NR_IRQS); return irq; } @@ -1281,9 +1229,6 @@ static void __init enable_IO_APIC(void) irq_2_pin[i].pin = -1; irq_2_pin[i].next = 0; } - if (!pirqs_enabled) - for (i = 0; i < MAX_PIRQS; i++) - pirq_entries[i] = -1; /* * The number of IO-APIC IRQ registers (== #pins): -- cgit v1.2.1 From e50991343488edf25d58820be1684322808c763c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:29 +0200 Subject: [PATCH] Remove leftover MCE/EISA support No 64bit EISA or Microchannel systems ever. Remove the left over code in the IO-APIC driver and the mptable parser Signed-off-by: Andi Kleen --- arch/x86_64/kernel/io_apic.c | 57 ++------------------------------------------ arch/x86_64/kernel/mpparse.c | 16 +------------ 2 files changed, 3 insertions(+), 70 deletions(-) diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 86cc07caa0e5..97e90f6abcc4 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -389,9 +389,7 @@ static int __init find_isa_irq_pin(int irq, int type) for (i = 0; i < mp_irq_entries; i++) { int lbus = mp_irqs[i].mpc_srcbus; - if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || - mp_bus_id_to_type[lbus] == MP_BUS_EISA || - mp_bus_id_to_type[lbus] == MP_BUS_MCA) && + if (mp_bus_id_to_type[lbus] == MP_BUS_ISA && (mp_irqs[i].mpc_irqtype == type) && (mp_irqs[i].mpc_srcbusirq == irq)) @@ -407,9 +405,7 @@ static int __init find_isa_irq_apic(int irq, int type) for (i = 0; i < mp_irq_entries; i++) { int lbus = mp_irqs[i].mpc_srcbus; - if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || - mp_bus_id_to_type[lbus] == MP_BUS_EISA || - mp_bus_id_to_type[lbus] == MP_BUS_MCA) && + if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA) && (mp_irqs[i].mpc_irqtype == type) && (mp_irqs[i].mpc_srcbusirq == irq)) break; @@ -472,27 +468,6 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) return best_guess; } -/* - * EISA Edge/Level control register, ELCR - */ -static int EISA_ELCR(unsigned int irq) -{ - if (irq < 16) { - unsigned int port = 0x4d0 + (irq >> 3); - return (inb(port) >> (irq & 7)) & 1; - } - apic_printk(APIC_VERBOSE, "Broken MPtable reports ISA irq %d\n", irq); - return 0; -} - -/* EISA interrupts are always polarity zero and can be edge or level - * trigger depending on the ELCR value. If an interrupt is listed as - * EISA conforming in the MP table, that means its trigger type must - * be read in from the ELCR */ - -#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq)) -#define default_EISA_polarity(idx) (0) - /* ISA interrupts are always polarity zero edge triggered, * when listed as conforming in the MP table. */ @@ -505,12 +480,6 @@ static int EISA_ELCR(unsigned int irq) #define default_PCI_trigger(idx) (1) #define default_PCI_polarity(idx) (1) -/* MCA interrupts are always polarity zero level triggered, - * when listed as conforming in the MP table. */ - -#define default_MCA_trigger(idx) (1) -#define default_MCA_polarity(idx) (0) - static int __init MPBIOS_polarity(int idx) { int bus = mp_irqs[idx].mpc_srcbus; @@ -530,21 +499,11 @@ static int __init MPBIOS_polarity(int idx) polarity = default_ISA_polarity(idx); break; } - case MP_BUS_EISA: /* EISA pin */ - { - polarity = default_EISA_polarity(idx); - break; - } case MP_BUS_PCI: /* PCI pin */ { polarity = default_PCI_polarity(idx); break; } - case MP_BUS_MCA: /* MCA pin */ - { - polarity = default_MCA_polarity(idx); - break; - } default: { printk(KERN_WARNING "broken BIOS!!\n"); @@ -599,21 +558,11 @@ static int MPBIOS_trigger(int idx) trigger = default_ISA_trigger(idx); break; } - case MP_BUS_EISA: /* EISA pin */ - { - trigger = default_EISA_trigger(idx); - break; - } case MP_BUS_PCI: /* PCI pin */ { trigger = default_PCI_trigger(idx); break; } - case MP_BUS_MCA: /* MCA pin */ - { - trigger = default_MCA_trigger(idx); - break; - } default: { printk(KERN_WARNING "broken BIOS!!\n"); @@ -731,8 +680,6 @@ static int pin_2_irq(int idx, int apic, int pin) switch (mp_bus_id_to_type[bus]) { case MP_BUS_ISA: /* ISA pin */ - case MP_BUS_EISA: - case MP_BUS_MCA: { irq = mp_irqs[idx].mpc_srcbusirq; break; diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index c73dd1f41e65..90e99cf27aa5 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -175,14 +175,10 @@ static void __init MP_bus_info (struct mpc_config_bus *m) if (strncmp(str, "ISA", 3) == 0) { mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; - } else if (strncmp(str, "EISA", 4) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA; } else if (strncmp(str, "PCI", 3) == 0) { mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; mp_current_pci_id++; - } else if (strncmp(str, "MCA", 3) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA; } else { printk(KERN_ERR "Unknown bustype %s\n", str); } @@ -465,14 +461,6 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) case 5: memcpy(bus.mpc_bustype, "ISA ", 6); break; - case 2: - case 6: - case 3: - memcpy(bus.mpc_bustype, "EISA ", 6); - break; - case 4: - case 7: - memcpy(bus.mpc_bustype, "MCA ", 6); } MP_bus_info(&bus); if (mpc_default_type > 4) { @@ -629,9 +617,7 @@ void __init find_intel_smp (void) smp_scan_config(0xF0000,0x10000)) return; /* - * If it is an SMP machine we should know now, unless the - * configuration is in an EISA/MCA bus machine with an - * extended bios data area. + * If it is an SMP machine we should know now. * * there is a real-mode segmented pointer pointing to the * 4K EBDA area at 0x40E, calculate and scan it here. -- cgit v1.2.1 From a8fcf1a24a16e1c735c795d99773b5dcefb71518 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] Remove obsolete PIC mode PIC mode is an outdated way to drive the APICs that was used on some early MP boards. It is not supported in the ACPI model. It is unlikely to be ever configured by any x86-64 system Remove it thus. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/apic.c | 92 ++++++++++++++------------------------------ arch/x86_64/kernel/mpparse.c | 8 ---- arch/x86_64/kernel/smpboot.c | 1 - include/asm-x86_64/mpspec.h | 1 - include/asm-x86_64/smp.h | 1 - 5 files changed, 29 insertions(+), 74 deletions(-) diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 78c43b1a3064..20a76f209629 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -136,72 +136,40 @@ void clear_local_APIC(void) apic_read(APIC_ESR); } -void __init connect_bsp_APIC(void) -{ - if (pic_mode) { - /* - * Do not trust the local APIC being empty at bootup. - */ - clear_local_APIC(); - /* - * PIC mode, enable APIC mode in the IMCR, i.e. - * connect BSP's local APIC to INT and NMI lines. - */ - apic_printk(APIC_VERBOSE, "leaving PIC mode, enabling APIC mode.\n"); - outb(0x70, 0x22); - outb(0x01, 0x23); - } -} - void disconnect_bsp_APIC(int virt_wire_setup) { - if (pic_mode) { - /* - * Put the board back into PIC mode (has an effect - * only on certain older boards). Note that APIC - * interrupts, including IPIs, won't work beyond - * this point! The only exception are INIT IPIs. - */ - apic_printk(APIC_QUIET, "disabling APIC mode, entering PIC mode.\n"); - outb(0x70, 0x22); - outb(0x00, 0x23); - } - else { - /* Go back to Virtual Wire compatibility mode */ - unsigned long value; - - /* For the spurious interrupt use vector F, and enable it */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - value |= APIC_SPIV_APIC_ENABLED; - value |= 0xf; - apic_write(APIC_SPIV, value); - - if (!virt_wire_setup) { - /* For LVT0 make it edge triggered, active high, external and enabled */ - value = apic_read(APIC_LVT0); - value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); - apic_write(APIC_LVT0, value); - } - else { - /* Disable LVT0 */ - apic_write(APIC_LVT0, APIC_LVT_MASKED); - } + /* Go back to Virtual Wire compatibility mode */ + unsigned long value; - /* For LVT1 make it edge triggered, active high, nmi and enabled */ - value = apic_read(APIC_LVT1); - value &= ~( - APIC_MODE_MASK | APIC_SEND_PENDING | + /* For the spurious interrupt use vector F, and enable it */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + value |= 0xf; + apic_write(APIC_SPIV, value); + + if (!virt_wire_setup) { + /* For LVT0 make it edge triggered, active high, external and enabled */ + value = apic_read(APIC_LVT0); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); - apic_write(APIC_LVT1, value); + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); + apic_write(APIC_LVT0, value); + } else { + /* Disable LVT0 */ + apic_write(APIC_LVT0, APIC_LVT_MASKED); } + + /* For LVT1 make it edge triggered, active high, nmi and enabled */ + value = apic_read(APIC_LVT1); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); + apic_write(APIC_LVT1, value); } void disable_local_APIC(void) @@ -418,7 +386,7 @@ void __cpuinit setup_local_APIC (void) * TODO: set up through-local-APIC from through-I/O-APIC? --macro */ value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; - if (!smp_processor_id() && (pic_mode || !value)) { + if (!smp_processor_id() && !value) { value = APIC_DM_EXTINT; apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", smp_processor_id()); } else { @@ -1096,8 +1064,6 @@ int __init APIC_init_uniprocessor (void) verify_local_APIC(); - connect_bsp_APIC(); - phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id); apic_write(APIC_ID, SET_APIC_ID(boot_cpu_id)); diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 90e99cf27aa5..022624503552 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -56,7 +56,6 @@ struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; int mp_irq_entries; int nr_ioapics; -int pic_mode; unsigned long mp_lapic_addr = 0; @@ -514,13 +513,6 @@ void __init get_smp_config (void) printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n"); printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); - if (mpf->mpf_feature2 & (1<<7)) { - printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); - pic_mode = 1; - } else { - printk(KERN_INFO " Virtual Wire compatibility mode.\n"); - pic_mode = 0; - } /* * Now see if we need to read further. diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index d29571e24953..ae7d6d84e352 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -1090,7 +1090,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus) /* * Switch from PIC to APIC mode. */ - connect_bsp_APIC(); setup_local_APIC(); if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) { diff --git a/include/asm-x86_64/mpspec.h b/include/asm-x86_64/mpspec.h index 3c7aae9fd157..1dc83436cc91 100644 --- a/include/asm-x86_64/mpspec.h +++ b/include/asm-x86_64/mpspec.h @@ -178,7 +178,6 @@ extern int mp_irq_entries; extern struct mpc_config_intsrc mp_irqs [MAX_IRQ_SOURCES]; extern int mpc_default_type; extern unsigned long mp_lapic_addr; -extern int pic_mode; #ifdef CONFIG_ACPI extern void mp_register_lapic (u8 id, u8 enabled); diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h index 6353fa41bebe..498fbc1fc179 100644 --- a/include/asm-x86_64/smp.h +++ b/include/asm-x86_64/smp.h @@ -33,7 +33,6 @@ extern cpumask_t cpu_initialized; extern void smp_alloc_memory(void); extern volatile unsigned long smp_invalidate_needed; -extern int pic_mode; extern void lock_ipi_call_lock(void); extern void unlock_ipi_call_lock(void); extern int smp_num_siblings; -- cgit v1.2.1 From c1a58b42b428e717afbbb298356e041cea54ad17 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] i386/x86-64: Remove obsolete sanity check in mptable parsing It apparently has never triggered in many years. Signed-off-by: Andi Kleen --- arch/i386/kernel/mpparse.c | 13 ------------- arch/x86_64/kernel/mpparse.c | 13 ------------- 2 files changed, 26 deletions(-) diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c index 827569688562..a2b126fe9a54 100644 --- a/arch/i386/kernel/mpparse.c +++ b/arch/i386/kernel/mpparse.c @@ -294,19 +294,6 @@ static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m) m->mpc_irqtype, m->mpc_irqflag & 3, (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid, m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); - /* - * Well it seems all SMP boards in existence - * use ExtINT/LVT1 == LINT0 and - * NMI/LVT2 == LINT1 - the following check - * will show us if this assumptions is false. - * Until then we do not have to add baggage. - */ - if ((m->mpc_irqtype == mp_ExtINT) && - (m->mpc_destapiclint != 0)) - BUG(); - if ((m->mpc_irqtype == mp_NMI) && - (m->mpc_destapiclint != 1)) - BUG(); } #ifdef CONFIG_X86_NUMAQ diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 022624503552..c12acc3b5552 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -223,19 +223,6 @@ static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m) m->mpc_irqtype, m->mpc_irqflag & 3, (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid, m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); - /* - * Well it seems all SMP boards in existence - * use ExtINT/LVT1 == LINT0 and - * NMI/LVT2 == LINT1 - the following check - * will show us if this assumptions is false. - * Until then we do not have to add baggage. - */ - if ((m->mpc_irqtype == mp_ExtINT) && - (m->mpc_destapiclint != 0)) - BUG(); - if ((m->mpc_irqtype == mp_NMI) && - (m->mpc_destapiclint != 1)) - BUG(); } /* -- cgit v1.2.1 From eea0e11c1f0d6ef89e64182b2f1223a4ca2b74a2 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] Factor out common io apic routing entry access The IO APIC code had lots of duplicated code to read/write 64bit routing entries into the IO-APIC. Factor this out int common read/write functions In a few cases the IO APIC lock is taken more often now, but this isn't a problem because it's all initialization/shutdown only slow path code. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/io_apic.c | 82 ++++++++++++++++++++++---------------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 97e90f6abcc4..1ccf1b41c3ed 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -111,6 +111,33 @@ int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1}; FINAL; \ } +union entry_union { + struct { u32 w1, w2; }; + struct IO_APIC_route_entry entry; +}; + +static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) +{ + union entry_union eu; + unsigned long flags; + spin_lock_irqsave(&ioapic_lock, flags); + eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); + eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); + spin_unlock_irqrestore(&ioapic_lock, flags); + return eu.entry; +} + +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +{ + unsigned long flags; + union entry_union eu; + eu.entry = e; + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x10 + 2*pin, eu.w1); + io_apic_write(apic, 0x11 + 2*pin, eu.w2); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + #ifdef CONFIG_SMP static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) { @@ -196,13 +223,9 @@ static void unmask_IO_APIC_irq (unsigned int irq) static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) { struct IO_APIC_route_entry entry; - unsigned long flags; /* Check delivery_mode to be sure we're not clearing an SMI pin */ - spin_lock_irqsave(&ioapic_lock, flags); - *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); - *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); + entry = ioapic_read_entry(apic, pin); if (entry.delivery_mode == dest_SMI) return; /* @@ -210,10 +233,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) */ memset(&entry, 0, sizeof(entry)); entry.mask = 1; - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0)); - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1)); - spin_unlock_irqrestore(&ioapic_lock, flags); + ioapic_write_entry(apic, pin, entry); } static void clear_IO_APIC (void) @@ -838,9 +858,9 @@ static void __init setup_IO_APIC_irqs(void) if (!apic && (irq < 16)) disable_8259A_irq(irq); } + ioapic_write_entry(apic, pin, entry); + spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); set_native_irq_info(irq, TARGET_CPUS); spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -978,10 +998,7 @@ void __apicdebuginit print_IO_APIC(void) for (i = 0; i <= reg_01.bits.entries; i++) { struct IO_APIC_route_entry entry; - spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2); - *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2); - spin_unlock_irqrestore(&ioapic_lock, flags); + entry = ioapic_read_entry(apic, i); printk(KERN_DEBUG " %02x %03X %02X ", i, @@ -1191,11 +1208,7 @@ static void __init enable_IO_APIC(void) /* See if any of the pins is in ExtINT mode */ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { struct IO_APIC_route_entry entry; - spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); - *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); - + entry = ioapic_read_entry(apic, pin); /* If the interrupt line is enabled and in ExtInt mode * I have found the pin where the i8259 is connected. @@ -1247,7 +1260,6 @@ void disable_IO_APIC(void) */ if (ioapic_i8259.pin != -1) { struct IO_APIC_route_entry entry; - unsigned long flags; memset(&entry, 0, sizeof(entry)); entry.mask = 0; /* Enabled */ @@ -1264,12 +1276,7 @@ void disable_IO_APIC(void) /* * Add it to the IO-APIC irq-routing table: */ - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin, - *(((int *)&entry)+1)); - io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin, - *(((int *)&entry)+0)); - spin_unlock_irqrestore(&ioapic_lock, flags); + ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); } disconnect_bsp_APIC(ioapic_i8259.pin != -1); @@ -1879,17 +1886,12 @@ static int ioapic_suspend(struct sys_device *dev, pm_message_t state) { struct IO_APIC_route_entry *entry; struct sysfs_ioapic_data *data; - unsigned long flags; int i; data = container_of(dev, struct sysfs_ioapic_data, dev); entry = data->entry; - spin_lock_irqsave(&ioapic_lock, flags); - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { - *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i); - *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i); - } - spin_unlock_irqrestore(&ioapic_lock, flags); + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) + *entry = ioapic_read_entry(dev->id, i); return 0; } @@ -1911,11 +1913,9 @@ static int ioapic_resume(struct sys_device *dev) reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; io_apic_write(dev->id, 0, reg_00.raw); } - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { - io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1)); - io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0)); - } spin_unlock_irqrestore(&ioapic_lock, flags); + for (i = 0; i < nr_ioapic_registers[dev->id]; i++) + ioapic_write_entry(dev->id, i, entry[i]); return 0; } @@ -2040,10 +2040,10 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p if (!ioapic && (irq < 16)) disable_8259A_irq(irq); + ioapic_write_entry(ioapic, pin, entry); + spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0)); - set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS); + set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS); spin_unlock_irqrestore(&ioapic_lock, flags); return 0; -- cgit v1.2.1 From cf4c6a2f27f5db810b69dcb1da7f194489e8ff88 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] i386: Factor out common io apic routing entry access The IO APIC code had lots of duplicated code to read/write 64bit routing entries into the IO-APIC. Factor this out int common read/write functions In a few cases the IO APIC lock is taken more often now, but this isn't a problem because it's all initialization/shutdown only slow path code. Similar to earlier x86-64 patch. Includes a fix by Jiri Slaby for a mistake that broke resume Signed-off-by: Andi Kleen --- arch/i386/kernel/io_apic.c | 100 +++++++++++++++++++-------------------------- 1 file changed, 43 insertions(+), 57 deletions(-) diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 4eacd52eeb74..b713f27d7e86 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -94,6 +94,34 @@ int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1}; #define vector_to_irq(vector) (vector) #endif + +union entry_union { + struct { u32 w1, w2; }; + struct IO_APIC_route_entry entry; +}; + +static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) +{ + union entry_union eu; + unsigned long flags; + spin_lock_irqsave(&ioapic_lock, flags); + eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); + eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); + spin_unlock_irqrestore(&ioapic_lock, flags); + return eu.entry; +} + +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +{ + unsigned long flags; + union entry_union eu; + eu.entry = e; + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x10 + 2*pin, eu.w1); + io_apic_write(apic, 0x11 + 2*pin, eu.w2); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + /* * The common case is 1:1 IRQ<->pin mappings. Sometimes there are * shared ISA-space IRQs, so we have to support them. We are super @@ -201,13 +229,9 @@ static void unmask_IO_APIC_irq (unsigned int irq) static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) { struct IO_APIC_route_entry entry; - unsigned long flags; /* Check delivery_mode to be sure we're not clearing an SMI pin */ - spin_lock_irqsave(&ioapic_lock, flags); - *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); - *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); + entry = ioapic_read_entry(apic, pin); if (entry.delivery_mode == dest_SMI) return; @@ -216,10 +240,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) */ memset(&entry, 0, sizeof(entry)); entry.mask = 1; - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0)); - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1)); - spin_unlock_irqrestore(&ioapic_lock, flags); + ioapic_write_entry(apic, pin, entry); } static void clear_IO_APIC (void) @@ -1284,9 +1305,8 @@ static void __init setup_IO_APIC_irqs(void) if (!apic && (irq < 16)) disable_8259A_irq(irq); } + ioapic_write_entry(apic, pin, entry); spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); set_native_irq_info(irq, TARGET_CPUS); spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -1302,7 +1322,6 @@ static void __init setup_IO_APIC_irqs(void) static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector) { struct IO_APIC_route_entry entry; - unsigned long flags; memset(&entry,0,sizeof(entry)); @@ -1332,10 +1351,7 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in /* * Add it to the IO-APIC irq-routing table: */ - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); - spin_unlock_irqrestore(&ioapic_lock, flags); + ioapic_write_entry(apic, pin, entry); enable_8259A_irq(0); } @@ -1445,10 +1461,7 @@ void __init print_IO_APIC(void) for (i = 0; i <= reg_01.bits.entries; i++) { struct IO_APIC_route_entry entry; - spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2); - *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2); - spin_unlock_irqrestore(&ioapic_lock, flags); + entry = ioapic_read_entry(apic, i); printk(KERN_DEBUG " %02x %03X %02X ", i, @@ -1667,10 +1680,7 @@ static void __init enable_IO_APIC(void) /* See if any of the pins is in ExtINT mode */ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { struct IO_APIC_route_entry entry; - spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); - *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); + entry = ioapic_read_entry(apic, pin); /* If the interrupt line is enabled and in ExtInt mode @@ -1727,7 +1737,6 @@ void disable_IO_APIC(void) */ if (ioapic_i8259.pin != -1) { struct IO_APIC_route_entry entry; - unsigned long flags; memset(&entry, 0, sizeof(entry)); entry.mask = 0; /* Enabled */ @@ -1744,12 +1753,7 @@ void disable_IO_APIC(void) /* * Add it to the IO-APIC irq-routing table: */ - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin, - *(((int *)&entry)+1)); - io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin, - *(((int *)&entry)+0)); - spin_unlock_irqrestore(&ioapic_lock, flags); + ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); } disconnect_bsp_APIC(ioapic_i8259.pin != -1); } @@ -2214,17 +2218,13 @@ static inline void unlock_ExtINT_logic(void) int apic, pin, i; struct IO_APIC_route_entry entry0, entry1; unsigned char save_control, save_freq_select; - unsigned long flags; pin = find_isa_irq_pin(8, mp_INT); apic = find_isa_irq_apic(8, mp_INT); if (pin == -1) return; - spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin); - *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); + entry0 = ioapic_read_entry(apic, pin); clear_IO_APIC_pin(apic, pin); memset(&entry1, 0, sizeof(entry1)); @@ -2237,10 +2237,7 @@ static inline void unlock_ExtINT_logic(void) entry1.trigger = 0; entry1.vector = 0; - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1)); - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0)); - spin_unlock_irqrestore(&ioapic_lock, flags); + ioapic_write_entry(apic, pin, entry1); save_control = CMOS_READ(RTC_CONTROL); save_freq_select = CMOS_READ(RTC_FREQ_SELECT); @@ -2259,10 +2256,7 @@ static inline void unlock_ExtINT_logic(void) CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); clear_IO_APIC_pin(apic, pin); - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1)); - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0)); - spin_unlock_irqrestore(&ioapic_lock, flags); + ioapic_write_entry(apic, pin, entry0); } int timer_uses_ioapic_pin_0; @@ -2462,17 +2456,12 @@ static int ioapic_suspend(struct sys_device *dev, pm_message_t state) { struct IO_APIC_route_entry *entry; struct sysfs_ioapic_data *data; - unsigned long flags; int i; data = container_of(dev, struct sysfs_ioapic_data, dev); entry = data->entry; - spin_lock_irqsave(&ioapic_lock, flags); - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { - *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i); - *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i); - } - spin_unlock_irqrestore(&ioapic_lock, flags); + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++) + entry[i] = ioapic_read_entry(dev->id, i); return 0; } @@ -2494,11 +2483,9 @@ static int ioapic_resume(struct sys_device *dev) reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; io_apic_write(dev->id, 0, reg_00.raw); } - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { - io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1)); - io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0)); - } spin_unlock_irqrestore(&ioapic_lock, flags); + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++) + ioapic_write_entry(dev->id, i, entry[i]); return 0; } @@ -2695,9 +2682,8 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a if (!ioapic && (irq < 16)) disable_8259A_irq(irq); + ioapic_write_entry(ioapic, pin, entry); spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0)); set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS); spin_unlock_irqrestore(&ioapic_lock, flags); -- cgit v1.2.1 From 606bd58de6542e847c51b1b6d83a4cd70a632fe7 Mon Sep 17 00:00:00 2001 From: Diego Calleja Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] x86: AUX_DEVICE_INFO is one byte long, use 'movb' Bugzilla #6552 says: "In arch/i386/boot/setup.S, movw is used instead of movb for PS/2 mouse information, although it is unsigned char. This does not harm, because the jmp instruction overwritten by movw is used before executing movw, and never be used again" I've no idea if this is a real bug or how it gets fixed, so I'm submitting it for review instead of letting it die of boredom in bugzilla. Aditionally to i386, I've changed x86-64, which mirrors the same code. Credits to Yoshinori K. Okuji, who found the problem and suggested a fix. Signed-off-by: Diego Calleja Signed-off-by: Andi Kleen --- arch/i386/boot/setup.S | 4 ++-- arch/x86_64/boot/setup.S | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/i386/boot/setup.S b/arch/i386/boot/setup.S index d2b684cd620a..3aec4538a113 100644 --- a/arch/i386/boot/setup.S +++ b/arch/i386/boot/setup.S @@ -494,12 +494,12 @@ no_voyager: movw %cs, %ax # aka SETUPSEG subw $DELTA_INITSEG, %ax # aka INITSEG movw %ax, %ds - movw $0, (0x1ff) # default is no pointing device + movb $0, (0x1ff) # default is no pointing device int $0x11 # int 0x11: equipment list testb $0x04, %al # check if mouse installed jz no_psmouse - movw $0xAA, (0x1ff) # device present + movb $0xAA, (0x1ff) # device present no_psmouse: #if defined(CONFIG_X86_SPEEDSTEP_SMI) || defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE) diff --git a/arch/x86_64/boot/setup.S b/arch/x86_64/boot/setup.S index a50b631f4d2b..c3bfd223ab49 100644 --- a/arch/x86_64/boot/setup.S +++ b/arch/x86_64/boot/setup.S @@ -526,12 +526,12 @@ is_disk1: movw %cs, %ax # aka SETUPSEG subw $DELTA_INITSEG, %ax # aka INITSEG movw %ax, %ds - movw $0, (0x1ff) # default is no pointing device + movb $0, (0x1ff) # default is no pointing device int $0x11 # int 0x11: equipment list testb $0x04, %al # check if mouse installed jz no_psmouse - movw $0xAA, (0x1ff) # device present + movb $0xAA, (0x1ff) # device present no_psmouse: #include "../../i386/boot/edd.S" -- cgit v1.2.1 From edd965229669f8f8dfddec8c863250440fb65ab3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] Remove MPS table APIC renumbering The MPS table specification says that the operating system should renumber the IO-APICs following the table as needed. However in ACPI this is not allowed or neeeded and all x86-64 systems are ACPI compliant. The code was already disabled on some systems because it caused problems there. Remove it completely now. CC: mdomsch@dell.com Signed-off-by: Andi Kleen --- arch/x86_64/kernel/io_apic.c | 71 -------------------------------------------- 1 file changed, 71 deletions(-) diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 1ccf1b41c3ed..a02ed17d05dc 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -1282,72 +1282,6 @@ void disable_IO_APIC(void) disconnect_bsp_APIC(ioapic_i8259.pin != -1); } -/* - * function to set the IO-APIC physical IDs based on the - * values stored in the MPC table. - * - * by Matt Domsch Tue Dec 21 12:25:05 CST 1999 - */ - -static void __init setup_ioapic_ids_from_mpc (void) -{ - union IO_APIC_reg_00 reg_00; - int apic; - int i; - unsigned char old_id; - unsigned long flags; - - /* - * Set the IOAPIC ID to the value stored in the MPC table. - */ - for (apic = 0; apic < nr_ioapics; apic++) { - - /* Read the register 0 value */ - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - old_id = mp_ioapics[apic].mpc_apicid; - - - printk(KERN_INFO "Using IO-APIC %d\n", mp_ioapics[apic].mpc_apicid); - - - /* - * We need to adjust the IRQ routing table - * if the ID changed. - */ - if (old_id != mp_ioapics[apic].mpc_apicid) - for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mpc_dstapic == old_id) - mp_irqs[i].mpc_dstapic - = mp_ioapics[apic].mpc_apicid; - - /* - * Read the right value from the MPC table and - * write it into the ID register. - */ - apic_printk(APIC_VERBOSE,KERN_INFO "...changing IO-APIC physical APIC ID to %d ...", - mp_ioapics[apic].mpc_apicid); - - reg_00.bits.ID = mp_ioapics[apic].mpc_apicid; - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0, reg_00.raw); - spin_unlock_irqrestore(&ioapic_lock, flags); - - /* - * Sanity check - */ - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid) - printk("could not set ID!\n"); - else - apic_printk(APIC_VERBOSE," ok.\n"); - } -} - /* * There is a nasty bug in some older SMP boards, their mptable lies * about the timer IRQ. We do the following to work around the situation: @@ -1863,11 +1797,6 @@ void __init setup_IO_APIC(void) apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); - /* - * Set up the IO-APIC IRQ routing table. - */ - if (!acpi_ioapic) - setup_ioapic_ids_from_mpc(); sync_Arb_IDs(); setup_IO_APIC_irqs(); init_IO_APIC_traps(); -- cgit v1.2.1 From dfa4698c50bf85b7927214b0e4a3dc4bc3b3c4a9 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] Move early chipset quirks out to new file They did not really belong into io_apic.c. Move them into a new file and clean it up a bit. Also remove outdated ATI quirk that was obsolete, Signed-off-by: Andi Kleen --- arch/x86_64/kernel/Makefile | 2 +- arch/x86_64/kernel/early-quirks.c | 118 ++++++++++++++++++++++++++++++++++++++ arch/x86_64/kernel/io_apic.c | 101 -------------------------------- arch/x86_64/kernel/setup.c | 2 +- include/asm-x86_64/proto.h | 2 +- 5 files changed, 121 insertions(+), 104 deletions(-) create mode 100644 arch/x86_64/kernel/early-quirks.c diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index 0ef39553c0f6..000e67e8f028 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -8,7 +8,7 @@ obj-y := process.o signal.o entry.o traps.o irq.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \ x8664_ksyms.o i387.o syscall.o vsyscall.o \ setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \ - pci-dma.o pci-nommu.o alternative.o + pci-dma.o pci-nommu.o alternative.o early-quirks.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_X86_MCE) += mce.o diff --git a/arch/x86_64/kernel/early-quirks.c b/arch/x86_64/kernel/early-quirks.c new file mode 100644 index 000000000000..d637cff1c4b1 --- /dev/null +++ b/arch/x86_64/kernel/early-quirks.c @@ -0,0 +1,118 @@ +/* Various workarounds for chipset bugs. + This code runs very early and can't use the regular PCI subsystem + The entries are keyed to PCI bridges which usually identify chipsets + uniquely. + This is only for whole classes of chipsets with specific problems which + need early invasive action (e.g. before the timers are initialized). + Most PCI device specific workarounds can be done later and should be + in standard PCI quirks + Mainboard specific bugs should be handled by DMI entries. + CPU specific bugs in setup.c */ + +#include +#include +#include +#include +#include +#include + +static void via_bugs(void) +{ +#ifdef CONFIG_IOMMU + if ((end_pfn > MAX_DMA32_PFN || force_iommu) && + !iommu_aperture_allowed) { + printk(KERN_INFO + "Looks like a VIA chipset. Disabling IOMMU. Override with iommu=allowed\n"); + iommu_aperture_disabled = 1; + } +#endif +} + +#ifdef CONFIG_ACPI + +static int nvidia_hpet_detected __initdata; + +static int __init nvidia_hpet_check(unsigned long phys, unsigned long size) +{ + nvidia_hpet_detected = 1; + return 0; +} +#endif + +static void nvidia_bugs(void) +{ +#ifdef CONFIG_ACPI + /* + * All timer overrides on Nvidia are + * wrong unless HPET is enabled. + */ + nvidia_hpet_detected = 0; + acpi_table_parse(ACPI_HPET, nvidia_hpet_check); + if (nvidia_hpet_detected == 0) { + acpi_skip_timer_override = 1; + printk(KERN_INFO "Nvidia board " + "detected. Ignoring ACPI " + "timer override.\n"); + } +#endif + /* RED-PEN skip them on mptables too? */ + +} + +static void ati_bugs(void) +{ +#if 1 /* for testing */ + printk("ATI board detected\n"); +#endif + /* No bugs right now */ +} + +struct chipset { + u16 vendor; + void (*f)(void); +}; + +static struct chipset early_qrk[] = { + { PCI_VENDOR_ID_NVIDIA, nvidia_bugs }, + { PCI_VENDOR_ID_VIA, via_bugs }, + { PCI_VENDOR_ID_ATI, ati_bugs }, + {} +}; + +void __init early_quirks(void) +{ + int num, slot, func; + /* Poor man's PCI discovery */ + for (num = 0; num < 32; num++) { + for (slot = 0; slot < 32; slot++) { + for (func = 0; func < 8; func++) { + u32 class; + u32 vendor; + u8 type; + int i; + class = read_pci_config(num,slot,func, + PCI_CLASS_REVISION); + if (class == 0xffffffff) + break; + + if ((class >> 16) != PCI_CLASS_BRIDGE_PCI) + continue; + + vendor = read_pci_config(num, slot, func, + PCI_VENDOR_ID); + vendor &= 0xffff; + + for (i = 0; early_qrk[i].f; i++) + if (early_qrk[i].vendor == vendor) { + early_qrk[i].f(); + return; + } + + type = read_pci_config_byte(num, slot, func, + PCI_HEADER_TYPE); + if (!(type & 0x80)) + break; + } + } + } +} diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index a02ed17d05dc..8bbff64c20d7 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -280,107 +280,6 @@ static int __init setup_enable_8254_timer(char *s) __setup("disable_8254_timer", setup_disable_8254_timer); __setup("enable_8254_timer", setup_enable_8254_timer); -#include -#include -#include - - -#ifdef CONFIG_ACPI - -static int nvidia_hpet_detected __initdata; - -static int __init nvidia_hpet_check(unsigned long phys, unsigned long size) -{ - nvidia_hpet_detected = 1; - return 0; -} -#endif - -/* Temporary Hack. Nvidia and VIA boards currently only work with IO-APIC - off. Check for an Nvidia or VIA PCI bridge and turn it off. - Use pci direct infrastructure because this runs before the PCI subsystem. - - Can be overwritten with "apic" - - And another hack to disable the IOMMU on VIA chipsets. - - ... and others. Really should move this somewhere else. - - Kludge-O-Rama. */ -void __init check_ioapic(void) -{ - int num,slot,func; - /* Poor man's PCI discovery */ - for (num = 0; num < 32; num++) { - for (slot = 0; slot < 32; slot++) { - for (func = 0; func < 8; func++) { - u32 class; - u32 vendor; - u8 type; - class = read_pci_config(num,slot,func, - PCI_CLASS_REVISION); - if (class == 0xffffffff) - break; - - if ((class >> 16) != PCI_CLASS_BRIDGE_PCI) - continue; - - vendor = read_pci_config(num, slot, func, - PCI_VENDOR_ID); - vendor &= 0xffff; - switch (vendor) { - case PCI_VENDOR_ID_VIA: -#ifdef CONFIG_IOMMU - if ((end_pfn > MAX_DMA32_PFN || - force_iommu) && - !iommu_aperture_allowed) { - printk(KERN_INFO - "Looks like a VIA chipset. Disabling IOMMU. Override with \"iommu=allowed\"\n"); - iommu_aperture_disabled = 1; - } -#endif - return; - case PCI_VENDOR_ID_NVIDIA: -#ifdef CONFIG_ACPI - /* - * All timer overrides on Nvidia are - * wrong unless HPET is enabled. - */ - nvidia_hpet_detected = 0; - acpi_table_parse(ACPI_HPET, - nvidia_hpet_check); - if (nvidia_hpet_detected == 0) { - acpi_skip_timer_override = 1; - printk(KERN_INFO "Nvidia board " - "detected. Ignoring ACPI " - "timer override.\n"); - } -#endif - /* RED-PEN skip them on mptables too? */ - return; - - /* This should be actually default, but - for 2.6.16 let's do it for ATI only where - it's really needed. */ - case PCI_VENDOR_ID_ATI: - if (timer_over_8254 == 1) { - timer_over_8254 = 0; - printk(KERN_INFO - "ATI board detected. Disabling timer routing over 8254.\n"); - } - return; - } - - - /* No multi-function device? */ - type = read_pci_config_byte(num,slot,func, - PCI_HEADER_TYPE); - if (!(type & 0x80)) - break; - } - } - } -} /* * Find the IRQ entry number of a certain pin. diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index fbe9f7faa2db..aab4bd66aa0d 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -653,7 +653,7 @@ void __init setup_arch(char **cmdline_p) paging_init(); - check_ioapic(); + early_quirks(); /* * set this early, so we dont allocate cpu0 diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h index 3b1c60247902..58fec91318e4 100644 --- a/include/asm-x86_64/proto.h +++ b/include/asm-x86_64/proto.h @@ -92,7 +92,7 @@ extern void syscall32_cpu_init(void); extern void setup_node_bootmem(int nodeid, unsigned long start, unsigned long end); -extern void check_ioapic(void); +extern void early_quirks(void); extern void check_efer(void); extern int unhandled_signal(struct task_struct *tsk, int sig); -- cgit v1.2.1 From 55f05ffaa788e039df2f1ebe0d7bfbcb6f39d0b4 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] Replace mp bus array with bitmap for bus not pci Since we only support PCI and ISA legacy busses now there is no need to have an full array with checking. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/io_apic.c | 92 +++++++++++--------------------------------- arch/x86_64/kernel/mpparse.c | 9 ++--- include/asm-x86_64/mpspec.h | 8 +--- 3 files changed, 27 insertions(+), 82 deletions(-) diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 8bbff64c20d7..a1412042b918 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -308,7 +308,7 @@ static int __init find_isa_irq_pin(int irq, int type) for (i = 0; i < mp_irq_entries; i++) { int lbus = mp_irqs[i].mpc_srcbus; - if (mp_bus_id_to_type[lbus] == MP_BUS_ISA && + if (test_bit(lbus, mp_bus_not_pci) && (mp_irqs[i].mpc_irqtype == type) && (mp_irqs[i].mpc_srcbusirq == irq)) @@ -324,7 +324,7 @@ static int __init find_isa_irq_apic(int irq, int type) for (i = 0; i < mp_irq_entries; i++) { int lbus = mp_irqs[i].mpc_srcbus; - if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA) && + if (test_bit(lbus, mp_bus_not_pci) && (mp_irqs[i].mpc_irqtype == type) && (mp_irqs[i].mpc_srcbusirq == irq)) break; @@ -364,7 +364,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) mp_irqs[i].mpc_dstapic == MP_APIC_ALL) break; - if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) && + if (!test_bit(lbus, mp_bus_not_pci) && !mp_irqs[i].mpc_irqtype && (bus == lbus) && (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { @@ -410,28 +410,11 @@ static int __init MPBIOS_polarity(int idx) switch (mp_irqs[idx].mpc_irqflag & 3) { case 0: /* conforms, ie. bus-type dependent polarity */ - { - switch (mp_bus_id_to_type[bus]) - { - case MP_BUS_ISA: /* ISA pin */ - { - polarity = default_ISA_polarity(idx); - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - polarity = default_PCI_polarity(idx); - break; - } - default: - { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } - } + if (test_bit(bus, mp_bus_not_pci)) + polarity = default_ISA_polarity(idx); + else + polarity = default_PCI_polarity(idx); break; - } case 1: /* high active */ { polarity = 0; @@ -469,28 +452,11 @@ static int MPBIOS_trigger(int idx) switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) { case 0: /* conforms, ie. bus-type dependent */ - { - switch (mp_bus_id_to_type[bus]) - { - case MP_BUS_ISA: /* ISA pin */ - { - trigger = default_ISA_trigger(idx); - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - trigger = default_PCI_trigger(idx); - break; - } - default: - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 1; - break; - } - } + if (test_bit(bus, mp_bus_not_pci)) + trigger = default_ISA_trigger(idx); + else + trigger = default_PCI_trigger(idx); break; - } case 1: /* edge */ { trigger = 0; @@ -596,31 +562,17 @@ static int pin_2_irq(int idx, int apic, int pin) if (mp_irqs[idx].mpc_dstirq != pin) printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); - switch (mp_bus_id_to_type[bus]) - { - case MP_BUS_ISA: /* ISA pin */ - { - irq = mp_irqs[idx].mpc_srcbusirq; - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - /* - * PCI IRQs are mapped in order - */ - i = irq = 0; - while (i < apic) - irq += nr_ioapic_registers[i++]; - irq += pin; - irq = gsi_irq_sharing(irq); - break; - } - default: - { - printk(KERN_ERR "unknown bus type %d.\n",bus); - irq = 0; - break; - } + if (test_bit(bus, mp_bus_not_pci)) { + irq = mp_irqs[idx].mpc_srcbusirq; + } else { + /* + * PCI IRQs are mapped in order + */ + i = irq = 0; + while (i < apic) + irq += nr_ioapic_registers[i++]; + irq += pin; + irq = gsi_irq_sharing(irq); } BUG_ON(irq >= NR_IRQS); return irq; diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index c12acc3b5552..a8d38c0bb449 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -42,7 +42,7 @@ int acpi_found_madt; * MP-table. */ unsigned char apic_version [MAX_APICS]; -unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; static int mp_current_pci_id = 0; @@ -173,9 +173,9 @@ static void __init MP_bus_info (struct mpc_config_bus *m) Dprintk("Bus #%d is %s\n", m->mpc_busid, str); if (strncmp(str, "ISA", 3) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; + set_bit(m->mpc_busid, mp_bus_not_pci); } else if (strncmp(str, "PCI", 3) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; + clear_bit(m->mpc_busid, mp_bus_not_pci); mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; mp_current_pci_id++; } else { @@ -808,8 +808,7 @@ void __init mp_config_acpi_legacy_irqs (void) /* * Fabricate the legacy ISA bus (bus #31). */ - mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; - Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); + set_bit(MP_ISA_BUS, mp_bus_not_pci); /* * Locate the IOAPIC that manages the ISA IRQs (0-15). diff --git a/include/asm-x86_64/mpspec.h b/include/asm-x86_64/mpspec.h index 1dc83436cc91..017fddb61dc5 100644 --- a/include/asm-x86_64/mpspec.h +++ b/include/asm-x86_64/mpspec.h @@ -159,13 +159,7 @@ struct mpc_config_lintsrc #define MAX_MP_BUSSES 256 /* Each PCI slot may be a combo card with its own bus. 4 IRQ pins per slot. */ #define MAX_IRQ_SOURCES (MAX_MP_BUSSES * 4) -enum mp_bustype { - MP_BUS_ISA = 1, - MP_BUS_EISA, - MP_BUS_PCI, - MP_BUS_MCA -}; -extern unsigned char mp_bus_id_to_type [MAX_MP_BUSSES]; +extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); extern int mp_bus_id_to_pci_bus [MAX_MP_BUSSES]; extern unsigned int boot_cpu_physical_apicid; -- cgit v1.2.1 From a01fd3baff5e2e744f9d1af0e5d3e6b6082e4dcf Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] Remove useless wrapper in mpparse.c code It used to contain support code for NUMAQ, but that is long gone already on 64bit. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/mpparse.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index a8d38c0bb449..79423d1bfcac 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -579,7 +579,7 @@ static int __init smp_scan_config (unsigned long base, unsigned long length) return 0; } -void __init find_intel_smp (void) +void __init find_smp_config(void) { unsigned int address; @@ -617,15 +617,6 @@ void __init find_intel_smp (void) printk(KERN_INFO "No mptable found.\n"); } -/* - * - Intel MP Configuration Table - */ -void __init find_smp_config (void) -{ - find_intel_smp(); -} - - /* -------------------------------------------------------------------------- ACPI-based MP Configuration -------------------------------------------------------------------------- */ -- cgit v1.2.1 From 276ec1a76ad204d47947894a914e10e798400ffb Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] Remove some unneeded ACPI externs in mpparse.c They are not used in this file so remove them. i386 didn't have them either. Cc: len.brown@intel.com Signed-off-by: Andi Kleen --- arch/x86_64/kernel/mpparse.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 79423d1bfcac..0cc88f7f288b 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -70,15 +70,6 @@ unsigned disabled_cpus __initdata; /* Bitmask of physically existing CPUs */ physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE; -/* ACPI MADT entry parsing functions */ -#ifdef CONFIG_ACPI -extern struct acpi_boot_flags acpi_boot; -extern int acpi_parse_lapic (acpi_table_entry_header *header); -extern int acpi_parse_lapic_addr_ovr (acpi_table_entry_header *header); -extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header); -extern int acpi_parse_ioapic (acpi_table_entry_header *header); -#endif /*CONFIG_ACPI*/ - u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; -- cgit v1.2.1 From efec3b9a3282714c8441b9bf476f8358bed9ecaa Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] Fix up some non linuxy style in ACPI functions in mpparse.c No functional changes. Cc: len.brown@intel.com Signed-off-by: Andi Kleen --- arch/x86_64/kernel/mpparse.c | 57 +++++++++++++------------------------------- 1 file changed, 16 insertions(+), 41 deletions(-) diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 0cc88f7f288b..d63f849aea2c 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -614,23 +614,17 @@ void __init find_smp_config(void) #ifdef CONFIG_ACPI -void __init mp_register_lapic_address ( - u64 address) +void __init mp_register_lapic_address(u64 address) { mp_lapic_addr = (unsigned long) address; - set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); - if (boot_cpu_id == -1U) boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid); } - -void __cpuinit mp_register_lapic ( - u8 id, - u8 enabled) +void __cpuinit mp_register_lapic (u8 id, u8 enabled) { struct mpc_config_processor processor; int boot_cpu = 0; @@ -668,11 +662,9 @@ static struct mp_ioapic_routing { u32 pin_programmed[4]; } mp_ioapic_routing[MAX_IO_APICS]; - -static int mp_find_ioapic ( - int gsi) +static int mp_find_ioapic(int gsi) { - int i = 0; + int i = 0; /* Find the IOAPIC that manages this GSI. */ for (i = 0; i < nr_ioapics; i++) { @@ -682,17 +674,12 @@ static int mp_find_ioapic ( } printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); - return -1; } - -void __init mp_register_ioapic ( - u8 id, - u32 address, - u32 gsi_base) +void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base) { - int idx = 0; + int idx = 0; if (nr_ioapics >= MAX_IO_APICS) { printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " @@ -729,16 +716,10 @@ void __init mp_register_ioapic ( mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, mp_ioapic_routing[idx].gsi_start, mp_ioapic_routing[idx].gsi_end); - - return; } - -void __init mp_override_legacy_irq ( - u8 bus_irq, - u8 polarity, - u8 trigger, - u32 gsi) +void __init +mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) { struct mpc_config_intsrc intsrc; int ioapic = -1; @@ -776,16 +757,13 @@ void __init mp_override_legacy_irq ( mp_irqs[mp_irq_entries] = intsrc; if (++mp_irq_entries == MAX_IRQ_SOURCES) panic("Max # of irq sources exceeded!\n"); - - return; } - -void __init mp_config_acpi_legacy_irqs (void) +void __init mp_config_acpi_legacy_irqs(void) { struct mpc_config_intsrc intsrc; - int i = 0; - int ioapic = -1; + int i = 0; + int ioapic = -1; /* * Fabricate the legacy ISA bus (bus #31). @@ -843,24 +821,22 @@ void __init mp_config_acpi_legacy_irqs (void) if (++mp_irq_entries == MAX_IRQ_SOURCES) panic("Max # of irq sources exceeded!\n"); } - - return; } #define MAX_GSI_NUM 4096 int mp_register_gsi(u32 gsi, int triggering, int polarity) { - int ioapic = -1; - int ioapic_pin = 0; - int idx, bit = 0; - static int pci_irq = 16; + int ioapic = -1; + int ioapic_pin = 0; + int idx, bit = 0; + static int pci_irq = 16; /* * Mapping between Global System Interrupts, which * represent all possible interrupts, to the IRQs * assigned to actual devices. */ - static int gsi_to_irq[MAX_GSI_NUM]; + static int gsi_to_irq[MAX_GSI_NUM]; if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) return gsi; @@ -934,5 +910,4 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity) polarity == ACPI_ACTIVE_HIGH ? 0 : 1); return gsi; } - #endif /*CONFIG_ACPI*/ -- cgit v1.2.1 From 19f03ffecdb599c1f64113b6dda0a1f143d2bab9 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] i386: Clean up code style in mpparse.c ACPI code Remove some unlinuxy ways to write function parameter definitions. Remove some stray "return;"s No functional change. Cc: len.brown@intel.com Signed-off-by: Andi Kleen --- arch/i386/kernel/mpparse.c | 52 +++++++++++++++------------------------------- 1 file changed, 17 insertions(+), 35 deletions(-) diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c index a2b126fe9a54..bdaa75a5bc1c 100644 --- a/arch/i386/kernel/mpparse.c +++ b/arch/i386/kernel/mpparse.c @@ -810,8 +810,7 @@ int es7000_plat; #ifdef CONFIG_ACPI -void __init mp_register_lapic_address ( - u64 address) +void __init mp_register_lapic_address(u64 address) { mp_lapic_addr = (unsigned long) address; @@ -823,13 +822,10 @@ void __init mp_register_lapic_address ( Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid); } - -void __devinit mp_register_lapic ( - u8 id, - u8 enabled) +void __devinit mp_register_lapic (u8 id, u8 enabled) { struct mpc_config_processor processor; - int boot_cpu = 0; + int boot_cpu = 0; if (MAX_APICS - id <= 0) { printk(KERN_WARNING "Processor #%d invalid (max %d)\n", @@ -866,11 +862,9 @@ static struct mp_ioapic_routing { u32 pin_programmed[4]; } mp_ioapic_routing[MAX_IO_APICS]; - -static int mp_find_ioapic ( - int gsi) +static int mp_find_ioapic (int gsi) { - int i = 0; + int i = 0; /* Find the IOAPIC that manages this GSI. */ for (i = 0; i < nr_ioapics; i++) { @@ -883,15 +877,11 @@ static int mp_find_ioapic ( return -1; } - -void __init mp_register_ioapic ( - u8 id, - u32 address, - u32 gsi_base) +void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base) { - int idx = 0; - int tmpid; + int idx = 0; + int tmpid; if (nr_ioapics >= MAX_IO_APICS) { printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " @@ -937,16 +927,10 @@ void __init mp_register_ioapic ( mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end); - - return; } - -void __init mp_override_legacy_irq ( - u8 bus_irq, - u8 polarity, - u8 trigger, - u32 gsi) +void __init +mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) { struct mpc_config_intsrc intsrc; int ioapic = -1; @@ -984,15 +968,13 @@ void __init mp_override_legacy_irq ( mp_irqs[mp_irq_entries] = intsrc; if (++mp_irq_entries == MAX_IRQ_SOURCES) panic("Max # of irq sources exceeded!\n"); - - return; } void __init mp_config_acpi_legacy_irqs (void) { struct mpc_config_intsrc intsrc; - int i = 0; - int ioapic = -1; + int i = 0; + int ioapic = -1; /* * Fabricate the legacy ISA bus (bus #31). @@ -1061,12 +1043,12 @@ void __init mp_config_acpi_legacy_irqs (void) #define MAX_GSI_NUM 4096 -int mp_register_gsi (u32 gsi, int triggering, int polarity) +int mp_register_gsi(u32 gsi, int triggering, int polarity) { - int ioapic = -1; - int ioapic_pin = 0; - int idx, bit = 0; - static int pci_irq = 16; + int ioapic = -1; + int ioapic_pin = 0; + int idx, bit = 0; + static int pci_irq = 16; /* * Mapping between Global System Interrups, which * represent all possible interrupts, and IRQs -- cgit v1.2.1 From fe7414a2882c953788af13a7f2c9d570ed8f71c5 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] Use BUILD_BUG_ON in apic.c build sanity checking Makes code a little shorter. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/apic.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 20a76f209629..496cd1bd2ae5 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -265,8 +265,6 @@ void __init sync_Arb_IDs(void) | APIC_DM_INIT); } -extern void __error_in_apic_c (void); - /* * An initial setup of the virtual wire mode. */ @@ -313,8 +311,7 @@ void __cpuinit setup_local_APIC (void) value = apic_read(APIC_LVR); - if ((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f) - __error_in_apic_c(); + BUILD_BUG_ON((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f); /* * Double-check whether this APIC is really registered. -- cgit v1.2.1 From e2414910f212c52d9d7c64c99a22863488ac5b48 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] x86: Detect CFI support in the assembler at runtime ... instead of using a CONFIG option. The config option still controls if the resulting executable actually has unwind information. This is useful to prevent compilation errors when users select CONFIG_STACK_UNWIND on old binutils and also allows to use CFI in the future for non kernel debugging applications. Cc: jbeulich@novell.com Cc: sam@ravnborg.org Signed-off-by: Andi Kleen --- Documentation/kbuild/makefiles.txt | 5 +++++ arch/i386/Makefile | 4 ++++ arch/x86_64/Makefile | 3 +++ include/asm-x86_64/dwarf2.h | 2 +- scripts/Kbuild.include | 7 +++++++ 5 files changed, 20 insertions(+), 1 deletion(-) diff --git a/Documentation/kbuild/makefiles.txt b/Documentation/kbuild/makefiles.txt index b7d6abb501a6..e2cbd59cf2d0 100644 --- a/Documentation/kbuild/makefiles.txt +++ b/Documentation/kbuild/makefiles.txt @@ -421,6 +421,11 @@ more details, with real examples. The second argument is optional, and if supplied will be used if first argument is not supported. + as-instr + as-instr checks if the assembler reports a specific instruction + and then outputs either option1 or option2 + C escapes are supported in the test instruction + cc-option cc-option is used to check if $(CC) supports a given option, and not supported to use an optional second option. diff --git a/arch/i386/Makefile b/arch/i386/Makefile index 3e4adb1e2244..508cdbeb3a09 100644 --- a/arch/i386/Makefile +++ b/arch/i386/Makefile @@ -46,6 +46,10 @@ cflags-y += -ffreestanding # a lot more stack due to the lack of sharing of stacklots: CFLAGS += $(shell if [ $(call cc-version) -lt 0400 ] ; then echo $(call cc-option,-fno-unit-at-a-time); fi ;) +# do binutils support CFI? +cflags-y += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,) +AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,) + CFLAGS += $(cflags-y) # Default subarch .c files diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile index 431bb4bc36cd..d6472ddf5f6e 100644 --- a/arch/x86_64/Makefile +++ b/arch/x86_64/Makefile @@ -54,6 +54,9 @@ endif cflags-y += $(call cc-option,-funit-at-a-time) # prevent gcc from generating any FP code by mistake cflags-y += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,) +# do binutils support CFI? +cflags-y += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,) +AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,) CFLAGS += $(cflags-y) CFLAGS_KERNEL += $(cflags-kernel-y) diff --git a/include/asm-x86_64/dwarf2.h b/include/asm-x86_64/dwarf2.h index 0744db777676..2b9368365fad 100644 --- a/include/asm-x86_64/dwarf2.h +++ b/include/asm-x86_64/dwarf2.h @@ -13,7 +13,7 @@ away for older version. */ -#ifdef CONFIG_UNWIND_INFO +#ifdef CONFIG_AS_CFI #define CFI_STARTPROC .cfi_startproc #define CFI_ENDPROC .cfi_endproc diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include index 3d523899fdc0..7adef12a0c26 100644 --- a/scripts/Kbuild.include +++ b/scripts/Kbuild.include @@ -63,6 +63,13 @@ as-option = $(shell if $(CC) $(CFLAGS) $(1) -Wa,-Z -c -o /dev/null \ -xassembler /dev/null > /dev/null 2>&1; then echo "$(1)"; \ else echo "$(2)"; fi ;) +# as-instr +# Usage: cflags-y += $(call as-instr, instr, option1, option2) + +as-instr = $(shell if echo -e "$(1)" | $(AS) -Z -o astest$$$$.out \ + 2>&1 >/dev/null ; then echo "$(2)"; else echo "$(3)"; fi; \ + rm -f astest$$$$.out) + # cc-option # Usage: cflags-y += $(call cc-option, -march=winchip-c6, -march=i586) -- cgit v1.2.1 From 44cc45267bbe7c64f7d85b074bd670b48b5abdfb Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:31 +0200 Subject: [PATCH] Remove obsolete CVS $Id$ from assembler files in arch/x86_64/kernel/* CVS hasn't been used for a long time for them. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/entry.S | 2 -- arch/x86_64/kernel/head.S | 2 -- 2 files changed, 4 deletions(-) diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index a04fc4108ff2..2092f565aa87 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -4,8 +4,6 @@ * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs * Copyright (C) 2000 Pavel Machek - * - * $Id$ */ /* diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 505ec4a57506..4a3326ce709c 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -5,8 +5,6 @@ * Copyright (C) 2000 Pavel Machek * Copyright (C) 2000 Karsten Keil * Copyright (C) 2001,2002 Andi Kleen - * - * $Id: head.S,v 1.49 2002/03/19 17:39:25 ak Exp $ */ -- cgit v1.2.1 From caff0710ebf6f2c44cbd2b8b31fd6329148bed2e Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 26 Sep 2006 10:52:31 +0200 Subject: [PATCH] initialize end of memory variables as early as possible While an earlier patch already did a small step into that direction, this patch moves initialization of all memory end variables to as early as possible, so that dependent code doesn't need to check whether these variables have already been set. Also, remove a misleading (perhaps just outdated) comment, and make static a variable only used in a single file. Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/x86_64/kernel/e820.c | 2 +- arch/x86_64/kernel/setup.c | 7 ++++++- arch/x86_64/mm/init.c | 6 ------ 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index d6d7f731f6f0..1362aad4f3aa 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -40,7 +40,7 @@ unsigned long end_pfn_map; /* * Last pfn which the user wants to use. */ -unsigned long end_user_pfn = MAXMEM>>PAGE_SHIFT; +static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT; extern struct resource code_resource, data_resource; diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index aab4bd66aa0d..ddc3b2d9a165 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -554,7 +554,7 @@ void __init setup_arch(char **cmdline_p) * we are rounding upwards: */ end_pfn = e820_end_of_ram(); - num_physpages = end_pfn; /* for pfn_valid */ + num_physpages = end_pfn; check_efer(); @@ -574,6 +574,11 @@ void __init setup_arch(char **cmdline_p) acpi_boot_table_init(); #endif + /* How many end-of-memory variables you have, grandma! */ + max_low_pfn = end_pfn; + max_pfn = end_pfn; + high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1; + #ifdef CONFIG_ACPI_NUMA /* * Parse SRAT to discover nodes. diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index d14fb2dfbfc4..d40134bd6399 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -597,12 +597,6 @@ void __init mem_init(void) pci_iommu_alloc(); - /* How many end-of-memory variables you have, grandma! */ - max_low_pfn = end_pfn; - max_pfn = end_pfn; - num_physpages = end_pfn; - high_memory = (void *) __va(end_pfn * PAGE_SIZE); - /* clear the zero-page */ memset(empty_zero_page, 0, PAGE_SIZE); -- cgit v1.2.1 From 3b94355c47e2b025a7ececa0a14180e8fce6b0f1 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 26 Sep 2006 10:52:31 +0200 Subject: [PATCH] remove int_delivery_dest The genapic field and the accessor macro weren't used anywhere. Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/x86_64/kernel/genapic_cluster.c | 1 - arch/x86_64/kernel/genapic_flat.c | 2 -- include/asm-x86_64/genapic.h | 1 - include/asm-x86_64/mach_apic.h | 1 - 4 files changed, 5 deletions(-) diff --git a/arch/x86_64/kernel/genapic_cluster.c b/arch/x86_64/kernel/genapic_cluster.c index 3020917546de..cdb90e671b88 100644 --- a/arch/x86_64/kernel/genapic_cluster.c +++ b/arch/x86_64/kernel/genapic_cluster.c @@ -118,7 +118,6 @@ struct genapic apic_cluster = { .name = "clustered", .int_delivery_mode = dest_Fixed, .int_dest_mode = (APIC_DEST_PHYSICAL != 0), - .int_delivery_dest = APIC_DEST_PHYSICAL | APIC_DM_FIXED, .target_cpus = cluster_target_cpus, .apic_id_registered = cluster_apic_id_registered, .init_apic_ldr = cluster_init_apic_ldr, diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c index eb86d374813a..2ceb8b872b57 100644 --- a/arch/x86_64/kernel/genapic_flat.c +++ b/arch/x86_64/kernel/genapic_flat.c @@ -121,7 +121,6 @@ struct genapic apic_flat = { .name = "flat", .int_delivery_mode = dest_LowestPrio, .int_dest_mode = (APIC_DEST_LOGICAL != 0), - .int_delivery_dest = APIC_DEST_LOGICAL | APIC_DM_LOWEST, .target_cpus = flat_target_cpus, .apic_id_registered = flat_apic_id_registered, .init_apic_ldr = flat_init_apic_ldr, @@ -180,7 +179,6 @@ struct genapic apic_physflat = { .name = "physical flat", .int_delivery_mode = dest_Fixed, .int_dest_mode = (APIC_DEST_PHYSICAL != 0), - .int_delivery_dest = APIC_DEST_PHYSICAL | APIC_DM_FIXED, .target_cpus = physflat_target_cpus, .apic_id_registered = flat_apic_id_registered, .init_apic_ldr = flat_init_apic_ldr,/*not needed, but shouldn't hurt*/ diff --git a/include/asm-x86_64/genapic.h b/include/asm-x86_64/genapic.h index 50b38e7c58e4..81e714665344 100644 --- a/include/asm-x86_64/genapic.h +++ b/include/asm-x86_64/genapic.h @@ -16,7 +16,6 @@ struct genapic { char *name; u32 int_delivery_mode; u32 int_dest_mode; - u32 int_delivery_dest; /* for quick IPIs */ int (*apic_id_registered)(void); cpumask_t (*target_cpus)(void); void (*init_apic_ldr)(void); diff --git a/include/asm-x86_64/mach_apic.h b/include/asm-x86_64/mach_apic.h index 0acea44c9377..d33422450c00 100644 --- a/include/asm-x86_64/mach_apic.h +++ b/include/asm-x86_64/mach_apic.h @@ -16,7 +16,6 @@ #define INT_DELIVERY_MODE (genapic->int_delivery_mode) #define INT_DEST_MODE (genapic->int_dest_mode) -#define INT_DELIVERY_DEST (genapic->int_delivery_dest) #define TARGET_CPUS (genapic->target_cpus()) #define apic_id_registered (genapic->apic_id_registered) #define init_apic_ldr (genapic->init_apic_ldr) -- cgit v1.2.1 From ba9c231f7499ff6918c069c72ff5fd836c76b963 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 26 Sep 2006 10:52:31 +0200 Subject: [PATCH] i386: initialize end-of-memory variables as early as possible Move initialization of all memory end variables to as early as possible, so that dependent code doesn't need to check whether these variables have already been set. Change the range check in kunmap_atomic to actually make use of this so that the no-mapping-estabished path (under CONFIG_DEBUG_HIGHMEM) gets used only when the address is inside the lowmem area (and BUG() otherwise). Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/i386/kernel/setup.c | 8 ++++++++ arch/i386/mm/discontig.c | 5 +++++ arch/i386/mm/highmem.c | 2 +- arch/i386/mm/init.c | 20 -------------------- 4 files changed, 14 insertions(+), 21 deletions(-) diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index f1682206d304..71a540362b78 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -1170,6 +1170,14 @@ static unsigned long __init setup_memory(void) } printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", pages_to_mb(highend_pfn - highstart_pfn)); + num_physpages = highend_pfn; + high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; +#else + num_physpages = max_low_pfn; + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; +#endif +#ifdef CONFIG_FLATMEM + max_mapnr = num_physpages; #endif printk(KERN_NOTICE "%ldMB LOWMEM available.\n", pages_to_mb(max_low_pfn)); diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c index 7c392dc553b8..f0c10b3cd158 100644 --- a/arch/i386/mm/discontig.c +++ b/arch/i386/mm/discontig.c @@ -313,6 +313,11 @@ unsigned long __init setup_memory(void) highstart_pfn = system_max_low_pfn; printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", pages_to_mb(highend_pfn - highstart_pfn)); + num_physpages = highend_pfn; + high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; +#else + num_physpages = system_max_low_pfn; + high_memory = (void *) __va(system_max_low_pfn * PAGE_SIZE - 1) + 1; #endif printk(KERN_NOTICE "%ldMB LOWMEM available.\n", pages_to_mb(system_max_low_pfn)); diff --git a/arch/i386/mm/highmem.c b/arch/i386/mm/highmem.c index b6eb4dcb8777..ba44000b9069 100644 --- a/arch/i386/mm/highmem.c +++ b/arch/i386/mm/highmem.c @@ -54,7 +54,7 @@ void kunmap_atomic(void *kvaddr, enum km_type type) unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); - if (vaddr < FIXADDR_START) { // FIXME + if (vaddr >= PAGE_OFFSET && vaddr < (unsigned long)high_memory) { dec_preempt_count(); preempt_check_resched(); return; diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 89e8486aac34..d0c2fdf6a4d7 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -552,18 +552,6 @@ static void __init test_wp_bit(void) } } -static void __init set_max_mapnr_init(void) -{ -#ifdef CONFIG_HIGHMEM - num_physpages = highend_pfn; -#else - num_physpages = max_low_pfn; -#endif -#ifdef CONFIG_FLATMEM - max_mapnr = num_physpages; -#endif -} - static struct kcore_list kcore_mem, kcore_vmalloc; void __init mem_init(void) @@ -590,14 +578,6 @@ void __init mem_init(void) } #endif - set_max_mapnr_init(); - -#ifdef CONFIG_HIGHMEM - high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; -#else - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; -#endif - /* this will put all low memory onto the freelists */ totalram_pages += free_all_bootmem(); -- cgit v1.2.1 From 352f7bae81faa2befa2a3c02b84478dce16b8fd6 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:31 +0200 Subject: [PATCH] Add stack documentation document from Keith Owens Describes the stack organization on x86-64. I changed it a bit and removed some obsolete information and the questions. Cc: kaos@sgi.com Signed-off-by: Andi Kleen --- Documentation/x86_64/kernel-stacks | 99 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 Documentation/x86_64/kernel-stacks diff --git a/Documentation/x86_64/kernel-stacks b/Documentation/x86_64/kernel-stacks new file mode 100644 index 000000000000..bddfddd466ab --- /dev/null +++ b/Documentation/x86_64/kernel-stacks @@ -0,0 +1,99 @@ +Most of the text from Keith Owens, hacked by AK + +x86_64 page size (PAGE_SIZE) is 4K. + +Like all other architectures, x86_64 has a kernel stack for every +active thread. These thread stacks are THREAD_SIZE (2*PAGE_SIZE) big. +These stacks contain useful data as long as a thread is alive or a +zombie. While the thread is in user space the kernel stack is empty +except for the thread_info structure at the bottom. + +In addition to the per thread stacks, there are specialized stacks +associated with each cpu. These stacks are only used while the kernel +is in control on that cpu, when a cpu returns to user space the +specialized stacks contain no useful data. The main cpu stacks is + +* Interrupt stack. IRQSTACKSIZE + + Used for external hardware interrupts. If this is the first external + hardware interrupt (i.e. not a nested hardware interrupt) then the + kernel switches from the current task to the interrupt stack. Like + the split thread and interrupt stacks on i386 (with CONFIG_4KSTACKS), + this gives more room for kernel interrupt processing without having + to increase the size of every per thread stack. + + The interrupt stack is also used when processing a softirq. + +Switching to the kernel interrupt stack is done by software based on a +per CPU interrupt nest counter. This is needed because x86-64 "IST" +hardware stacks cannot nest without races. + +x86_64 also has a feature which is not available on i386, the ability +to automatically switch to a new stack for designated events such as +double fault or NMI, which makes it easier to handle these unusual +events on x86_64. This feature is called the Interrupt Stack Table +(IST). There can be up to 7 IST entries per cpu. The IST code is an +index into the Task State Segment (TSS), the IST entries in the TSS +point to dedicated stacks, each stack can be a different size. + +An IST is selected by an non-zero value in the IST field of an +interrupt-gate descriptor. When an interrupt occurs and the hardware +loads such a descriptor, the hardware automatically sets the new stack +pointer based on the IST value, then invokes the interrupt handler. If +software wants to allow nested IST interrupts then the handler must +adjust the IST values on entry to and exit from the interrupt handler. +(this is occasionally done, e.g. for debug exceptions) + +Events with different IST codes (i.e. with different stacks) can be +nested. For example, a debug interrupt can safely be interrupted by an +NMI. arch/x86_64/kernel/entry.S::paranoidentry adjusts the stack +pointers on entry to and exit from all IST events, in theory allowing +IST events with the same code to be nested. However in most cases, the +stack size allocated to an IST assumes no nesting for the same code. +If that assumption is ever broken then the stacks will become corrupt. + +The currently assigned IST stacks are :- + +* STACKFAULT_STACK. EXCEPTION_STKSZ (PAGE_SIZE). + + Used for interrupt 12 - Stack Fault Exception (#SS). + + This allows to recover from invalid stack segments. Rarely + happens. + +* DOUBLEFAULT_STACK. EXCEPTION_STKSZ (PAGE_SIZE). + + Used for interrupt 8 - Double Fault Exception (#DF). + + Invoked when handling a exception causes another exception. Happens + when the kernel is very confused (e.g. kernel stack pointer corrupt) + Using a separate stack allows to recover from it well enough in many + cases to still output an oops. + +* NMI_STACK. EXCEPTION_STKSZ (PAGE_SIZE). + + Used for non-maskable interrupts (NMI). + + NMI can be delivered at any time, including when the kernel is in the + middle of switching stacks. Using IST for NMI events avoids making + assumptions about the previous state of the kernel stack. + +* DEBUG_STACK. DEBUG_STKSZ + + Used for hardware debug interrupts (interrupt 1) and for software + debug interrupts (INT3). + + When debugging a kernel, debug interrupts (both hardware and + software) can occur at any time. Using IST for these interrupts + avoids making assumptions about the previous state of the kernel + stack. + +* MCE_STACK. EXCEPTION_STKSZ (PAGE_SIZE). + + Used for interrupt 18 - Machine Check Exception (#MC). + + MCE can be delivered at any time, including when the kernel is in the + middle of switching stacks. Using IST for MCE events avoids making + assumptions about the previous state of the kernel stack. + +For more details see the Intel IA32 or AMD AMD64 architecture manuals. -- cgit v1.2.1 From 5f4a7a93886ce1a4327f6028cc05d423f39eebf0 Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda Date: Tue, 26 Sep 2006 10:52:31 +0200 Subject: [PATCH] Calgary IOMMU: rearrange 'struct iommu_table' members Rearrange struct members loosely based on size for improved alignment and to save a few bytes. Signed-off-by: Muli Ben-Yehuda Signed-off-by: Jon Mason Signed-off-by: Andi Kleen --- include/asm-x86_64/calgary.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/asm-x86_64/calgary.h b/include/asm-x86_64/calgary.h index 4e3919524240..0a03bda94d03 100644 --- a/include/asm-x86_64/calgary.h +++ b/include/asm-x86_64/calgary.h @@ -34,12 +34,12 @@ struct iommu_table { unsigned long it_base; /* mapped address of tce table */ unsigned long it_hint; /* Hint for next alloc */ unsigned long *it_map; /* A simple allocation bitmap for now */ + void __iomem *bbar; /* Bridge BAR */ + u64 tar_val; /* Table Address Register */ + struct timer_list watchdog_timer; spinlock_t it_lock; /* Protects it_map */ unsigned int it_size; /* Size of iommu table in entries */ unsigned char it_busno; /* Bus number this table belongs to */ - void __iomem *bbar; - u64 tar_val; - struct timer_list watchdog_timer; }; #define TCE_TABLE_SIZE_UNSPECIFIED ~0 -- cgit v1.2.1 From f38db651d5da5e10235fd7dd31095969fb7ef6fb Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda Date: Tue, 26 Sep 2006 10:52:31 +0200 Subject: [PATCH] Calgary IOMMU: consolidate per bus data structures Move the tce_table_kva array, disabled bitmap and bus_to_phb array into a new per bus 'struct calgary_bus_info'. Also slightly reorganize build_tce_table and tce_table_setparms to avoid exporting bus_info to tce.c. Signed-off-by: Muli Ben-Yehuda Signed-off-by: Jon Mason Signed-off-by: Andi Kleen --- arch/x86_64/kernel/pci-calgary.c | 53 +++++++++++++++++++++------------------- arch/x86_64/kernel/tce.c | 10 -------- include/asm-x86_64/tce.h | 1 - 3 files changed, 28 insertions(+), 36 deletions(-) diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index 146924ba5df5..96f6a866afad 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -111,17 +111,17 @@ static const unsigned long phb_offsets[] = { 0xB000 /* PHB3 */ }; -static char bus_to_phb[MAX_PHB_BUS_NUM]; -void* tce_table_kva[MAX_PHB_BUS_NUM]; unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED; static int translate_empty_slots __read_mostly = 0; static int calgary_detected __read_mostly = 0; -/* - * the bitmap of PHBs the user requested that we disable - * translation on. - */ -static DECLARE_BITMAP(translation_disabled, MAX_PHB_BUS_NUM); +struct calgary_bus_info { + void *tce_space; + int translation_disabled; + signed char phbid; +}; + +static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; static void tce_cache_blast(struct iommu_table *tbl); @@ -149,7 +149,7 @@ static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen) static inline int translate_phb(struct pci_dev* dev) { - int disabled = test_bit(dev->bus->number, translation_disabled); + int disabled = bus_info[dev->bus->number].translation_disabled; return !disabled; } @@ -454,7 +454,7 @@ static struct dma_mapping_ops calgary_dma_ops = { static inline int busno_to_phbid(unsigned char num) { - return bus_to_phb[num]; + return bus_info[num].phbid; } static inline unsigned long split_queue_offset(unsigned char num) @@ -631,6 +631,10 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar) if (ret) return ret; + tbl = dev->sysdata; + tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space; + tce_free(tbl, 0, tbl->it_size); + calgary_reserve_regions(dev); /* set TARs for each PHB */ @@ -824,7 +828,7 @@ static int __init calgary_init(void) calgary_init_one_nontraslated(dev); continue; } - if (!tce_table_kva[dev->bus->number] && !translate_empty_slots) { + if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots) { pci_dev_put(dev); continue; } @@ -844,7 +848,7 @@ error: pci_dev_put(dev); continue; } - if (!tce_table_kva[dev->bus->number] && !translate_empty_slots) + if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots) continue; calgary_disable_translation(dev); calgary_free_tar(dev); @@ -894,9 +898,8 @@ void __init detect_calgary(void) for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { int dev; - - tce_table_kva[bus] = NULL; - bus_to_phb[bus] = -1; + struct calgary_bus_info *info = &bus_info[bus]; + info->phbid = -1; if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY) continue; @@ -907,12 +910,9 @@ void __init detect_calgary(void) */ phb = (phb + 1) % PHBS_PER_CALGARY; - if (test_bit(bus, translation_disabled)) { - printk(KERN_INFO "Calgary: translation is disabled for " - "PHB 0x%x\n", bus); - /* skip this phb, don't allocate a tbl for it */ + if (info->translation_disabled) continue; - } + /* * Scan the slots of the PCI bus to see if there is a device present. * The parent bus will be the zero-ith device, so start at 1. @@ -923,8 +923,8 @@ void __init detect_calgary(void) tbl = alloc_tce_table(); if (!tbl) goto cleanup; - tce_table_kva[bus] = tbl; - bus_to_phb[bus] = phb; + info->tce_space = tbl; + info->phbid = phb; calgary_found = 1; break; } @@ -940,9 +940,12 @@ void __init detect_calgary(void) return; cleanup: - for (--bus; bus >= 0; --bus) - if (tce_table_kva[bus]) - free_tce_table(tce_table_kva[bus]); + for (--bus; bus >= 0; --bus) { + struct calgary_bus_info *info = &bus_info[bus]; + + if (info->tce_space) + free_tce_table(info->tce_space); + } } int __init calgary_iommu_init(void) @@ -1016,7 +1019,7 @@ static int __init calgary_parse_options(char *p) if (bridge < MAX_PHB_BUS_NUM) { printk(KERN_INFO "Calgary: disabling " "translation for PHB 0x%x\n", bridge); - set_bit(bridge, translation_disabled); + bus_info[bridge].translation_disabled = 1; } } diff --git a/arch/x86_64/kernel/tce.c b/arch/x86_64/kernel/tce.c index 0905ce6b9b32..cbabfdf78e06 100644 --- a/arch/x86_64/kernel/tce.c +++ b/arch/x86_64/kernel/tce.c @@ -106,14 +106,6 @@ static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl) /* set the tce table size - measured in entries */ tbl->it_size = table_size_to_number_of_entries(specified_table_size); - tbl->it_base = (unsigned long)tce_table_kva[dev->bus->number]; - if (!tbl->it_base) { - printk(KERN_ERR "Calgary: iommu_table_setparms: " - "no table allocated?!\n"); - ret = -ENOMEM; - goto done; - } - /* * number of bytes needed for the bitmap size in number of * entries; we need one bit per entry @@ -162,8 +154,6 @@ int build_tce_table(struct pci_dev *dev, void __iomem *bbar) if (ret) goto free_tbl; - tce_free(tbl, 0, tbl->it_size); - tbl->bbar = bbar; /* diff --git a/include/asm-x86_64/tce.h b/include/asm-x86_64/tce.h index 53e9a68b3336..dbb047febc5e 100644 --- a/include/asm-x86_64/tce.h +++ b/include/asm-x86_64/tce.h @@ -24,7 +24,6 @@ #ifndef _ASM_X86_64_TCE_H #define _ASM_X86_64_TCE_H -extern void* tce_table_kva[]; extern unsigned int specified_table_size; struct iommu_table; -- cgit v1.2.1 From 9f2dc46d5ec6fd7787182d2232a1003af11879f1 Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda Date: Tue, 26 Sep 2006 10:52:31 +0200 Subject: [PATCH] Calgary IOMMU: break out of pci_find_device_reverse if dev not found Signed-off-by: Muli Ben-Yehuda Signed-off-by: Jon Mason Signed-off-by: Andi Kleen --- arch/x86_64/kernel/pci-calgary.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index 96f6a866afad..b2182c936305 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -844,6 +844,8 @@ error: dev = pci_find_device_reverse(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_IBM_CALGARY, dev); + if (!dev) + break; if (!translate_phb(dev)) { pci_dev_put(dev); continue; -- cgit v1.2.1 From b8f4fe66a560b5ccbb4d4d0d2df356a5d9b98b83 Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda Date: Tue, 26 Sep 2006 10:52:31 +0200 Subject: [PATCH] Calgary IOMMU: fix error path memleak in calgary_free_tar We were freeing the iommu_table and leaking the bitmap pages. Also rename it to calgary_free_bus, which is more accurate. Signed-off-by: Muli Ben-Yehuda Signed-off-by: Jon Mason Signed-off-by: Andi Kleen --- arch/x86_64/kernel/pci-calgary.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index b2182c936305..6c5da1d813f3 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -658,11 +658,12 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar) return 0; } -static void __init calgary_free_tar(struct pci_dev *dev) +static void __init calgary_free_bus(struct pci_dev *dev) { u64 val64; struct iommu_table *tbl = dev->sysdata; void __iomem *target; + unsigned int bitmapsz; target = calgary_reg(tbl->bbar, tar_offset(dev->bus->number)); val64 = be64_to_cpu(readq(target)); @@ -670,8 +671,15 @@ static void __init calgary_free_tar(struct pci_dev *dev) writeq(cpu_to_be64(val64), target); readq(target); /* flush */ + bitmapsz = tbl->it_size / BITS_PER_BYTE; + free_pages((unsigned long)tbl->it_map, get_order(bitmapsz)); + tbl->it_map = NULL; + kfree(tbl); dev->sysdata = NULL; + + /* Can't free bootmem allocated memory after system is up :-( */ + bus_info[dev->bus->number].tce_space = NULL; } static void calgary_watchdog(unsigned long data) @@ -853,7 +861,7 @@ error: if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots) continue; calgary_disable_translation(dev); - calgary_free_tar(dev); + calgary_free_bus(dev); pci_dev_put(dev); } -- cgit v1.2.1 From 871b17008e93d7e96f96829ce5f0393c9902d25b Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda Date: Tue, 26 Sep 2006 10:52:31 +0200 Subject: [PATCH] Calgary IOMMU: fix reference counting of Calgary PCI devices The pci_get_device() API decrements the reference count on the 'from' parameter when it continues searching. Therefore, take a ref count on Calgary bus when we initialize them in either translated or non-translated mode. Signed-off-by: Muli Ben-Yehuda Signed-off-by: Jon Mason Signed-off-by: Andi Kleen --- arch/x86_64/kernel/pci-calgary.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index 6c5da1d813f3..7302834718c8 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -786,6 +786,7 @@ static inline unsigned int __init locate_register_space(struct pci_dev *dev) static int __init calgary_init_one_nontraslated(struct pci_dev *dev) { + pci_dev_get(dev); dev->sysdata = NULL; dev->bus->self = dev; @@ -810,6 +811,7 @@ static int __init calgary_init_one(struct pci_dev *dev) if (ret) goto iounmap; + pci_dev_get(dev); dev->bus->self = dev; calgary_enable_translation(dev); @@ -836,10 +838,9 @@ static int __init calgary_init(void) calgary_init_one_nontraslated(dev); continue; } - if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots) { - pci_dev_put(dev); + if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots) continue; - } + ret = calgary_init_one(dev); if (ret) goto error; @@ -860,9 +861,10 @@ error: } if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots) continue; + calgary_disable_translation(dev); calgary_free_bus(dev); - pci_dev_put(dev); + pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */ } return ret; -- cgit v1.2.1 From a4fc520a0ff92810eea46d74bf742ef73849302e Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda Date: Tue, 26 Sep 2006 10:52:31 +0200 Subject: [PATCH] Calgary IOMMU: calgary_init_one_nontraslated() can return void Signed-off-by: Muli Ben-Yehuda Signed-off-by: Jon Mason Signed-off-by: Andi Kleen --- arch/x86_64/kernel/pci-calgary.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index 7302834718c8..f541fb564d92 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -784,13 +784,11 @@ static inline unsigned int __init locate_register_space(struct pci_dev *dev) return address; } -static int __init calgary_init_one_nontraslated(struct pci_dev *dev) +static void __init calgary_init_one_nontraslated(struct pci_dev *dev) { pci_dev_get(dev); dev->sysdata = NULL; dev->bus->self = dev; - - return 0; } static int __init calgary_init_one(struct pci_dev *dev) -- cgit v1.2.1 From 0577f148b5e9a773020e3da1e6332a7c6df9d601 Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda Date: Tue, 26 Sep 2006 10:52:31 +0200 Subject: [PATCH] Calgary IOMMU: save a bit of space in bus_info Make translation_disabled a uchar rather than an int Signed-off-by: Muli Ben-Yehuda Signed-off-by: Jon Mason Signed-off-by: Andi Kleen --- arch/x86_64/kernel/pci-calgary.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index f541fb564d92..caf84e4e4ca9 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -117,7 +117,7 @@ static int calgary_detected __read_mostly = 0; struct calgary_bus_info { void *tce_space; - int translation_disabled; + unsigned char translation_disabled; signed char phbid; }; -- cgit v1.2.1 From 01215ad8d83e18321d99e9b5750a6f21cac243a2 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:31 +0200 Subject: [PATCH] i386: Remove lock section support in mutex.h Lock sections don't work the new dwarf2 unwinder This generates slightly smaller code. It adds one more taken jump to the fast path. Cc: jbeulich@novell.com Signed-off-by: Andi Kleen --- include/asm-i386/mutex.h | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/include/asm-i386/mutex.h b/include/asm-i386/mutex.h index 05a538531229..7a17d9e58ad6 100644 --- a/include/asm-i386/mutex.h +++ b/include/asm-i386/mutex.h @@ -30,14 +30,10 @@ do { \ \ __asm__ __volatile__( \ LOCK_PREFIX " decl (%%eax) \n" \ - " js 2f \n" \ + " jns 1f \n" \ + " call "#fail_fn" \n" \ "1: \n" \ \ - LOCK_SECTION_START("") \ - "2: call "#fail_fn" \n" \ - " jmp 1b \n" \ - LOCK_SECTION_END \ - \ :"=a" (dummy) \ : "a" (count) \ : "memory", "ecx", "edx"); \ @@ -86,14 +82,10 @@ do { \ \ __asm__ __volatile__( \ LOCK_PREFIX " incl (%%eax) \n" \ - " jle 2f \n" \ + " jg 1f \n" \ + " call "#fail_fn" \n" \ "1: \n" \ \ - LOCK_SECTION_START("") \ - "2: call "#fail_fn" \n" \ - " jmp 1b \n" \ - LOCK_SECTION_END \ - \ :"=a" (dummy) \ : "a" (count) \ : "memory", "ecx", "edx"); \ -- cgit v1.2.1 From add659bf8aa92f8b3f01a8c0220557c959507fb1 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:31 +0200 Subject: [PATCH] i386: Remove lock section support in rwsem.h Lock sections don't work the new dwarf2 unwinder This generates slightly smaller code. It adds one more taken jump to the fast path. Also move the trampolines into semaphore.S and add proper CFI annotations. Cc: jbeulich@novell.com Signed-off-by: Andi Kleen --- arch/i386/lib/semaphore.S | 63 +++++++++++++++++++++++++++++++++++++++++++++++ include/asm-i386/rwsem.h | 62 +++++++++------------------------------------- 2 files changed, 75 insertions(+), 50 deletions(-) diff --git a/arch/i386/lib/semaphore.S b/arch/i386/lib/semaphore.S index e16ff1696390..01f80b5c45d2 100644 --- a/arch/i386/lib/semaphore.S +++ b/arch/i386/lib/semaphore.S @@ -152,3 +152,66 @@ ENTRY(__read_lock_failed) END(__read_lock_failed) #endif + +/* Fix up special calling conventions */ +ENTRY(call_rwsem_down_read_failed) + CFI_STARTPROC + push %ecx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ecx,0 + push %edx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edx,0 + call rwsem_down_read_failed + pop %edx + CFI_ADJUST_CFA_OFFSET -4 + pop %ecx + CFI_ADJUST_CFA_OFFSET -4 + ret + CFI_ENDPROC + END(call_rwsem_down_read_failed) + +ENTRY(call_rwsem_down_write_failed) + CFI_STARTPROC + push %ecx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ecx,0 + calll rwsem_down_write_failed + pop %ecx + CFI_ADJUST_CFA_OFFSET -4 + ret + CFI_ENDPROC + END(call_rwsem_down_write_failed) + +ENTRY(call_rwsem_wake) + CFI_STARTPROC + decw %dx /* do nothing if still outstanding active readers */ + jnz 1f + push %ecx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ecx,0 + call rwsem_wake + pop %ecx + CFI_ADJUST_CFA_OFFSET -4 +1: ret + CFI_ENDPROC + END(call_rwsem_wake) + +/* Fix up special calling conventions */ +ENTRY(call_rwsem_downgrade_wake) + CFI_STARTPROC + push %ecx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ecx,0 + push %edx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edx,0 + call rwsem_downgrade_wake + pop %edx + CFI_ADJUST_CFA_OFFSET -4 + pop %ecx + CFI_ADJUST_CFA_OFFSET -4 + ret + CFI_ENDPROC + END(call_rwsem_downgrade_wake) + diff --git a/include/asm-i386/rwsem.h b/include/asm-i386/rwsem.h index 43113f5608eb..bc598d6388e3 100644 --- a/include/asm-i386/rwsem.h +++ b/include/asm-i386/rwsem.h @@ -99,17 +99,9 @@ static inline void __down_read(struct rw_semaphore *sem) __asm__ __volatile__( "# beginning down_read\n\t" LOCK_PREFIX " incl (%%eax)\n\t" /* adds 0x00000001, returns the old value */ - " js 2f\n\t" /* jump if we weren't granted the lock */ + " jns 1f\n" + " call call_rwsem_down_read_failed\n" "1:\n\t" - LOCK_SECTION_START("") - "2:\n\t" - " pushl %%ecx\n\t" - " pushl %%edx\n\t" - " call rwsem_down_read_failed\n\t" - " popl %%edx\n\t" - " popl %%ecx\n\t" - " jmp 1b\n" - LOCK_SECTION_END "# ending down_read\n\t" : "+m" (sem->count) : "a" (sem) @@ -151,15 +143,9 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) "# beginning down_write\n\t" LOCK_PREFIX " xadd %%edx,(%%eax)\n\t" /* subtract 0x0000ffff, returns the old value */ " testl %%edx,%%edx\n\t" /* was the count 0 before? */ - " jnz 2f\n\t" /* jump if we weren't granted the lock */ - "1:\n\t" - LOCK_SECTION_START("") - "2:\n\t" - " pushl %%ecx\n\t" - " call rwsem_down_write_failed\n\t" - " popl %%ecx\n\t" - " jmp 1b\n" - LOCK_SECTION_END + " jz 1f\n" + " call call_rwsem_down_write_failed\n" + "1:\n" "# ending down_write" : "+m" (sem->count), "=d" (tmp) : "a" (sem), "1" (tmp) @@ -193,17 +179,9 @@ static inline void __up_read(struct rw_semaphore *sem) __asm__ __volatile__( "# beginning __up_read\n\t" LOCK_PREFIX " xadd %%edx,(%%eax)\n\t" /* subtracts 1, returns the old value */ - " js 2f\n\t" /* jump if the lock is being waited upon */ - "1:\n\t" - LOCK_SECTION_START("") - "2:\n\t" - " decw %%dx\n\t" /* do nothing if still outstanding active readers */ - " jnz 1b\n\t" - " pushl %%ecx\n\t" - " call rwsem_wake\n\t" - " popl %%ecx\n\t" - " jmp 1b\n" - LOCK_SECTION_END + " jns 1f\n\t" + " call call_rwsem_wake\n" + "1:\n" "# ending __up_read\n" : "+m" (sem->count), "=d" (tmp) : "a" (sem), "1" (tmp) @@ -219,17 +197,9 @@ static inline void __up_write(struct rw_semaphore *sem) "# beginning __up_write\n\t" " movl %2,%%edx\n\t" LOCK_PREFIX " xaddl %%edx,(%%eax)\n\t" /* tries to transition 0xffff0001 -> 0x00000000 */ - " jnz 2f\n\t" /* jump if the lock is being waited upon */ + " jz 1f\n" + " call call_rwsem_wake\n" "1:\n\t" - LOCK_SECTION_START("") - "2:\n\t" - " decw %%dx\n\t" /* did the active count reduce to 0? */ - " jnz 1b\n\t" /* jump back if not */ - " pushl %%ecx\n\t" - " call rwsem_wake\n\t" - " popl %%ecx\n\t" - " jmp 1b\n" - LOCK_SECTION_END "# ending __up_write\n" : "+m" (sem->count) : "a" (sem), "i" (-RWSEM_ACTIVE_WRITE_BIAS) @@ -244,17 +214,9 @@ static inline void __downgrade_write(struct rw_semaphore *sem) __asm__ __volatile__( "# beginning __downgrade_write\n\t" LOCK_PREFIX " addl %2,(%%eax)\n\t" /* transitions 0xZZZZ0001 -> 0xYYYY0001 */ - " js 2f\n\t" /* jump if the lock is being waited upon */ + " jns 1f\n\t" + " call call_rwsem_downgrade_wake\n" "1:\n\t" - LOCK_SECTION_START("") - "2:\n\t" - " pushl %%ecx\n\t" - " pushl %%edx\n\t" - " call rwsem_downgrade_wake\n\t" - " popl %%edx\n\t" - " popl %%ecx\n\t" - " jmp 1b\n" - LOCK_SECTION_END "# ending __downgrade_write\n" : "+m" (sem->count) : "a" (sem), "i" (-RWSEM_WAITING_BIAS) -- cgit v1.2.1 From 7ca2b49b06a6d26e89e3535653889f1d7892b085 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:32 +0200 Subject: [PATCH] i386: Remove lock section support in semaphore.h Lock sections don't work the new dwarf2 unwinder This generates slightly smaller code. It adds one more taken jump to the fast path. Cc: jbeulich@novell.com Signed-off-by: Andi Kleen --- include/asm-i386/semaphore.h | 49 +++++++++++++++----------------------------- 1 file changed, 17 insertions(+), 32 deletions(-) diff --git a/include/asm-i386/semaphore.h b/include/asm-i386/semaphore.h index d51e800acf29..e63b6a68f04c 100644 --- a/include/asm-i386/semaphore.h +++ b/include/asm-i386/semaphore.h @@ -100,13 +100,10 @@ static inline void down(struct semaphore * sem) __asm__ __volatile__( "# atomic down operation\n\t" LOCK_PREFIX "decl %0\n\t" /* --sem->count */ - "js 2f\n" - "1:\n" - LOCK_SECTION_START("") - "2:\tlea %0,%%eax\n\t" - "call __down_failed\n\t" - "jmp 1b\n" - LOCK_SECTION_END + "jns 2f\n" + "\tlea %0,%%eax\n\t" + "call __down_failed\n" + "2:" :"+m" (sem->count) : :"memory","ax"); @@ -123,15 +120,12 @@ static inline int down_interruptible(struct semaphore * sem) might_sleep(); __asm__ __volatile__( "# atomic interruptible down operation\n\t" + "xorl %0,%0\n\t" LOCK_PREFIX "decl %1\n\t" /* --sem->count */ - "js 2f\n\t" - "xorl %0,%0\n" - "1:\n" - LOCK_SECTION_START("") - "2:\tlea %1,%%eax\n\t" - "call __down_failed_interruptible\n\t" - "jmp 1b\n" - LOCK_SECTION_END + "jns 2f\n\t" + "lea %1,%%eax\n\t" + "call __down_failed_interruptible\n" + "2:" :"=a" (result), "+m" (sem->count) : :"memory"); @@ -148,15 +142,12 @@ static inline int down_trylock(struct semaphore * sem) __asm__ __volatile__( "# atomic interruptible down operation\n\t" + "xorl %0,%0\n\t" LOCK_PREFIX "decl %1\n\t" /* --sem->count */ - "js 2f\n\t" - "xorl %0,%0\n" - "1:\n" - LOCK_SECTION_START("") - "2:\tlea %1,%%eax\n\t" + "jns 2f\n\t" + "lea %1,%%eax\n\t" "call __down_failed_trylock\n\t" - "jmp 1b\n" - LOCK_SECTION_END + "2:\n" :"=a" (result), "+m" (sem->count) : :"memory"); @@ -166,22 +157,16 @@ static inline int down_trylock(struct semaphore * sem) /* * Note! This is subtle. We jump to wake people up only if * the semaphore was negative (== somebody was waiting on it). - * The default case (no contention) will result in NO - * jumps for both down() and up(). */ static inline void up(struct semaphore * sem) { __asm__ __volatile__( "# atomic up operation\n\t" LOCK_PREFIX "incl %0\n\t" /* ++sem->count */ - "jle 2f\n" - "1:\n" - LOCK_SECTION_START("") - "2:\tlea %0,%%eax\n\t" - "call __up_wakeup\n\t" - "jmp 1b\n" - LOCK_SECTION_END - ".subsection 0\n" + "jg 1f\n\t" + "lea %0,%%eax\n\t" + "call __up_wakeup\n" + "1:" :"+m" (sem->count) : :"memory","ax"); -- cgit v1.2.1 From 8578bdf835d6d6ece6718858d351456367b8a5cf Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:32 +0200 Subject: [PATCH] Don't use lock section for mutexes and semaphores Lock sections cannot be handled by the dwarf2 unwinder. Disadvantage is a taken branch in the hot path. Signed-off-by: Andi Kleen --- include/asm-x86_64/mutex.h | 20 ++++++-------------- include/asm-x86_64/semaphore.h | 40 ++++++++++++++-------------------------- include/asm-x86_64/spinlock.h | 9 ++++----- 3 files changed, 24 insertions(+), 45 deletions(-) diff --git a/include/asm-x86_64/mutex.h b/include/asm-x86_64/mutex.h index 06fab6de2a88..16396b1de3e4 100644 --- a/include/asm-x86_64/mutex.h +++ b/include/asm-x86_64/mutex.h @@ -25,13 +25,9 @@ do { \ \ __asm__ __volatile__( \ LOCK_PREFIX " decl (%%rdi) \n" \ - " js 2f \n" \ - "1: \n" \ - \ - LOCK_SECTION_START("") \ - "2: call "#fail_fn" \n" \ - " jmp 1b \n" \ - LOCK_SECTION_END \ + " jns 1f \n" \ + " call "#fail_fn" \n" \ + "1:" \ \ :"=D" (dummy) \ : "D" (v) \ @@ -75,13 +71,9 @@ do { \ \ __asm__ __volatile__( \ LOCK_PREFIX " incl (%%rdi) \n" \ - " jle 2f \n" \ - "1: \n" \ - \ - LOCK_SECTION_START("") \ - "2: call "#fail_fn" \n" \ - " jmp 1b \n" \ - LOCK_SECTION_END \ + " jg 1f \n" \ + " call "#fail_fn" \n" \ + "1: " \ \ :"=D" (dummy) \ : "D" (v) \ diff --git a/include/asm-x86_64/semaphore.h b/include/asm-x86_64/semaphore.h index 064df08b9a0f..107bd90429e8 100644 --- a/include/asm-x86_64/semaphore.h +++ b/include/asm-x86_64/semaphore.h @@ -107,12 +107,9 @@ static inline void down(struct semaphore * sem) __asm__ __volatile__( "# atomic down operation\n\t" LOCK_PREFIX "decl %0\n\t" /* --sem->count */ - "js 2f\n" - "1:\n" - LOCK_SECTION_START("") - "2:\tcall __down_failed\n\t" - "jmp 1b\n" - LOCK_SECTION_END + "jns 1f\n\t" + "call __down_failed\n" + "1:" :"=m" (sem->count) :"D" (sem) :"memory"); @@ -130,14 +127,11 @@ static inline int down_interruptible(struct semaphore * sem) __asm__ __volatile__( "# atomic interruptible down operation\n\t" + "xorl %0,%0\n\t" LOCK_PREFIX "decl %1\n\t" /* --sem->count */ - "js 2f\n\t" - "xorl %0,%0\n" - "1:\n" - LOCK_SECTION_START("") - "2:\tcall __down_failed_interruptible\n\t" - "jmp 1b\n" - LOCK_SECTION_END + "jns 2f\n\t" + "call __down_failed_interruptible\n" + "2:\n" :"=a" (result), "=m" (sem->count) :"D" (sem) :"memory"); @@ -154,14 +148,11 @@ static inline int down_trylock(struct semaphore * sem) __asm__ __volatile__( "# atomic interruptible down operation\n\t" + "xorl %0,%0\n\t" LOCK_PREFIX "decl %1\n\t" /* --sem->count */ - "js 2f\n\t" - "xorl %0,%0\n" - "1:\n" - LOCK_SECTION_START("") - "2:\tcall __down_failed_trylock\n\t" - "jmp 1b\n" - LOCK_SECTION_END + "jns 2f\n\t" + "call __down_failed_trylock\n\t" + "2:\n" :"=a" (result), "=m" (sem->count) :"D" (sem) :"memory","cc"); @@ -179,12 +170,9 @@ static inline void up(struct semaphore * sem) __asm__ __volatile__( "# atomic up operation\n\t" LOCK_PREFIX "incl %0\n\t" /* ++sem->count */ - "jle 2f\n" - "1:\n" - LOCK_SECTION_START("") - "2:\tcall __up_wakeup\n\t" - "jmp 1b\n" - LOCK_SECTION_END + "jg 1f\n\t" + "call __up_wakeup\n" + "1:" :"=m" (sem->count) :"D" (sem) :"memory"); diff --git a/include/asm-x86_64/spinlock.h b/include/asm-x86_64/spinlock.h index a8e3d89f591f..1d0733b38a4b 100644 --- a/include/asm-x86_64/spinlock.h +++ b/include/asm-x86_64/spinlock.h @@ -22,14 +22,13 @@ #define __raw_spin_lock_string \ "\n1:\t" \ LOCK_PREFIX " ; decl %0\n\t" \ - "js 2f\n" \ - LOCK_SECTION_START("") \ - "2:\t" \ + "jns 2f\n" \ + "3:\n" \ "rep;nop\n\t" \ "cmpl $0,%0\n\t" \ - "jle 2b\n\t" \ + "jle 3b\n\t" \ "jmp 1b\n" \ - LOCK_SECTION_END + "2:\t" \ #define __raw_spin_lock_string_up \ "\n\tdecl %0" -- cgit v1.2.1 From a752d7194c4fb5a3e767c95542d04fc5decb1d52 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert <76306.1226@compuserve.com> Date: Tue, 26 Sep 2006 10:52:32 +0200 Subject: [PATCH] fix is_at_popf() for compat tasks When testing for the REX instruction prefix, first check for 32-bit mode because in compat mode the REX prefix is an increment instruction. Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Signed-off-by: Andi Kleen --- arch/x86_64/kernel/ptrace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c index d35ec1bc696a..3a52c7bcfd12 100644 --- a/arch/x86_64/kernel/ptrace.c +++ b/arch/x86_64/kernel/ptrace.c @@ -141,8 +141,11 @@ static int is_at_popf(struct task_struct *child, struct pt_regs *regs) case 0xf0: case 0xf2: case 0xf3: continue; - /* REX prefixes */ case 0x40 ... 0x4f: + if (regs->cs != __USER_CS) + /* 32-bit mode: register increment */ + return 0; + /* 64-bit mode: REX prefix */ continue; /* CHECKME: f0, f2, f3 */ -- cgit v1.2.1 From 8b059d2373c16b6d32787a49daf8ccf72dc61b71 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:32 +0200 Subject: [PATCH] Clean up spin/rwlocks - Inline spinlock strings into their inline functions - Convert macros to typesafe inlines - Replace some leftover __asm__ __volatile__s with asm volatile Signed-off-by: Andi Kleen --- include/asm-x86_64/rwlock.h | 14 +-------- include/asm-x86_64/spinlock.h | 71 ++++++++++++++++++++++++------------------- 2 files changed, 41 insertions(+), 44 deletions(-) diff --git a/include/asm-x86_64/rwlock.h b/include/asm-x86_64/rwlock.h index 28a080d23119..72aeebed920b 100644 --- a/include/asm-x86_64/rwlock.h +++ b/include/asm-x86_64/rwlock.h @@ -21,18 +21,6 @@ #define RW_LOCK_BIAS 0x01000000 #define RW_LOCK_BIAS_STR "0x01000000" -#define __build_read_lock(rw) \ - asm volatile(LOCK_PREFIX "subl $1,(%0)\n\t" \ - "jns 1f\n" \ - "call __read_lock_failed\n" \ - "1:\n" \ - ::"D" (rw), "i" (RW_LOCK_BIAS) : "memory") - -#define __build_write_lock(rw) \ - asm volatile(LOCK_PREFIX "subl %1,(%0)\n\t" \ - "jz 1f\n" \ - "\tcall __write_lock_failed\n\t" \ - "1:\n" \ - ::"D" (rw), "i" (RW_LOCK_BIAS) : "memory") +/* Actual code is in asm/spinlock.h or in arch/x86_64/lib/rwlock.S */ #endif diff --git a/include/asm-x86_64/spinlock.h b/include/asm-x86_64/spinlock.h index 1d0733b38a4b..be7a9e629fb2 100644 --- a/include/asm-x86_64/spinlock.h +++ b/include/asm-x86_64/spinlock.h @@ -16,30 +16,23 @@ * (the type definitions are in asm/spinlock_types.h) */ -#define __raw_spin_is_locked(x) \ - (*(volatile signed int *)(&(x)->slock) <= 0) - -#define __raw_spin_lock_string \ - "\n1:\t" \ - LOCK_PREFIX " ; decl %0\n\t" \ - "jns 2f\n" \ - "3:\n" \ - "rep;nop\n\t" \ - "cmpl $0,%0\n\t" \ - "jle 3b\n\t" \ - "jmp 1b\n" \ - "2:\t" \ - -#define __raw_spin_lock_string_up \ - "\n\tdecl %0" - -#define __raw_spin_unlock_string \ - "movl $1,%0" \ - :"=m" (lock->slock) : : "memory" +static inline int __raw_spin_is_locked(raw_spinlock_t *lock) +{ + return *(volatile signed int *)(&(lock)->slock) <= 0; +} static inline void __raw_spin_lock(raw_spinlock_t *lock) { - asm volatile(__raw_spin_lock_string : "=m" (lock->slock) : : "memory"); + asm volatile( + "\n1:\t" + LOCK_PREFIX " ; decl %0\n\t" + "jns 2f\n" + "3:\n" + "rep;nop\n\t" + "cmpl $0,%0\n\t" + "jle 3b\n\t" + "jmp 1b\n" + "2:\t" : "=m" (lock->slock) : : "memory"); } #define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock) @@ -48,7 +41,7 @@ static inline int __raw_spin_trylock(raw_spinlock_t *lock) { int oldval; - __asm__ __volatile__( + asm volatile( "xchgl %0,%1" :"=q" (oldval), "=m" (lock->slock) :"0" (0) : "memory"); @@ -58,13 +51,14 @@ static inline int __raw_spin_trylock(raw_spinlock_t *lock) static inline void __raw_spin_unlock(raw_spinlock_t *lock) { - __asm__ __volatile__( - __raw_spin_unlock_string - ); + asm volatile("movl $1,%0" :"=m" (lock->slock) :: "memory"); } -#define __raw_spin_unlock_wait(lock) \ - do { while (__raw_spin_is_locked(lock)) cpu_relax(); } while (0) +static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock) +{ + while (__raw_spin_is_locked(lock)) + cpu_relax(); +} /* * Read-write spinlocks, allowing multiple readers @@ -80,17 +74,32 @@ static inline void __raw_spin_unlock(raw_spinlock_t *lock) * with the high bit (sign) being the "contended" bit. */ -#define __raw_read_can_lock(x) ((int)(x)->lock > 0) -#define __raw_write_can_lock(x) ((x)->lock == RW_LOCK_BIAS) +static inline int __raw_read_can_lock(raw_rwlock_t *lock) +{ + return (int)(lock)->lock > 0; +} + +static inline int __raw_write_can_lock(raw_rwlock_t *lock) +{ + return (lock)->lock == RW_LOCK_BIAS; +} static inline void __raw_read_lock(raw_rwlock_t *rw) { - __build_read_lock(rw); + asm volatile(LOCK_PREFIX "subl $1,(%0)\n\t" + "jns 1f\n" + "call __read_lock_failed\n" + "1:\n" + ::"D" (rw), "i" (RW_LOCK_BIAS) : "memory"); } static inline void __raw_write_lock(raw_rwlock_t *rw) { - __build_write_lock(rw); + asm volatile(LOCK_PREFIX "subl %1,(%0)\n\t" + "jz 1f\n" + "\tcall __write_lock_failed\n\t" + "1:\n" + ::"D" (rw), "i" (RW_LOCK_BIAS) : "memory"); } static inline int __raw_read_trylock(raw_rwlock_t *lock) -- cgit v1.2.1 From fb2e28485679418e459583605f9b19807a72ceca Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:32 +0200 Subject: [PATCH] i386: Clean up spin/rwlocks - Inline spinlock strings into their inline functions - Convert macros to typesafe inlines - Replace some leftover __asm__ __volatile__s with asm volatile Signed-off-by: Andi Kleen --- include/asm-i386/rwlock.h | 14 +---- include/asm-i386/spinlock.h | 131 ++++++++++++++++++++++---------------------- 2 files changed, 68 insertions(+), 77 deletions(-) diff --git a/include/asm-i386/rwlock.h b/include/asm-i386/rwlock.h index f40ccbd8cb7f..c3e5db32fa48 100644 --- a/include/asm-i386/rwlock.h +++ b/include/asm-i386/rwlock.h @@ -20,18 +20,6 @@ #define RW_LOCK_BIAS 0x01000000 #define RW_LOCK_BIAS_STR "0x01000000" -#define __build_read_lock(rw, helper) \ - asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t" \ - "jns 1f\n" \ - "call " helper "\n\t" \ - "1:\n" \ - ::"a" (rw) : "memory") - -#define __build_write_lock(rw, helper) \ - asm volatile(LOCK_PREFIX " subl $" RW_LOCK_BIAS_STR ",(%0)\n\t" \ - "jz 1f\n" \ - "call " helper "\n\t" \ - "1:\n" \ - ::"a" (rw) : "memory") +/* Code is in asm-i386/spinlock.h */ #endif diff --git a/include/asm-i386/spinlock.h b/include/asm-i386/spinlock.h index d1020363c41a..324329313af8 100644 --- a/include/asm-i386/spinlock.h +++ b/include/asm-i386/spinlock.h @@ -4,6 +4,7 @@ #include #include #include +#include #include /* @@ -17,67 +18,64 @@ * (the type definitions are in asm/spinlock_types.h) */ -#define __raw_spin_is_locked(x) \ - (*(volatile signed char *)(&(x)->slock) <= 0) - -#define __raw_spin_lock_string \ - "\n1:\t" \ - LOCK_PREFIX " ; decb %0\n\t" \ - "jns 3f\n" \ - "2:\t" \ - "rep;nop\n\t" \ - "cmpb $0,%0\n\t" \ - "jle 2b\n\t" \ - "jmp 1b\n" \ - "3:\n\t" - -/* - * NOTE: there's an irqs-on section here, which normally would have to be - * irq-traced, but on CONFIG_TRACE_IRQFLAGS we never use - * __raw_spin_lock_string_flags(). - */ -#define __raw_spin_lock_string_flags \ - "\n1:\t" \ - LOCK_PREFIX " ; decb %0\n\t" \ - "jns 5f\n" \ - "2:\t" \ - "testl $0x200, %1\n\t" \ - "jz 4f\n\t" \ - "sti\n" \ - "3:\t" \ - "rep;nop\n\t" \ - "cmpb $0, %0\n\t" \ - "jle 3b\n\t" \ - "cli\n\t" \ - "jmp 1b\n" \ - "4:\t" \ - "rep;nop\n\t" \ - "cmpb $0, %0\n\t" \ - "jg 1b\n\t" \ - "jmp 4b\n" \ - "5:\n\t" +static inline int __raw_spin_is_locked(raw_spinlock_t *x) +{ + return *(volatile signed char *)(&(x)->slock) <= 0; +} static inline void __raw_spin_lock(raw_spinlock_t *lock) { - asm(__raw_spin_lock_string : "+m" (lock->slock) : : "memory"); + asm volatile("\n1:\t" + LOCK_PREFIX " ; decb %0\n\t" + "jns 3f\n" + "2:\t" + "rep;nop\n\t" + "cmpb $0,%0\n\t" + "jle 2b\n\t" + "jmp 1b\n" + "3:\n\t" + : "+m" (lock->slock) : : "memory"); } /* * It is easier for the lock validator if interrupts are not re-enabled * in the middle of a lock-acquire. This is a performance feature anyway * so we turn it off: + * + * NOTE: there's an irqs-on section here, which normally would have to be + * irq-traced, but on CONFIG_TRACE_IRQFLAGS we never use this variant. */ #ifndef CONFIG_PROVE_LOCKING static inline void __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags) { - asm(__raw_spin_lock_string_flags : "+m" (lock->slock) : "r" (flags) : "memory"); + asm volatile( + "\n1:\t" + LOCK_PREFIX " ; decb %0\n\t" + "jns 5f\n" + "2:\t" + "testl $0x200, %1\n\t" + "jz 4f\n\t" + "sti\n" + "3:\t" + "rep;nop\n\t" + "cmpb $0, %0\n\t" + "jle 3b\n\t" + "cli\n\t" + "jmp 1b\n" + "4:\t" + "rep;nop\n\t" + "cmpb $0, %0\n\t" + "jg 1b\n\t" + "jmp 4b\n" + "5:\n\t" + : "+m" (lock->slock) : "r" (flags) : "memory"); } #endif static inline int __raw_spin_trylock(raw_spinlock_t *lock) { char oldval; - __asm__ __volatile__( + asm volatile( "xchgb %b0,%1" :"=q" (oldval), "+m" (lock->slock) :"0" (0) : "memory"); @@ -93,38 +91,29 @@ static inline int __raw_spin_trylock(raw_spinlock_t *lock) #if !defined(CONFIG_X86_OOSTORE) && !defined(CONFIG_X86_PPRO_FENCE) -#define __raw_spin_unlock_string \ - "movb $1,%0" \ - :"+m" (lock->slock) : : "memory" - - static inline void __raw_spin_unlock(raw_spinlock_t *lock) { - __asm__ __volatile__( - __raw_spin_unlock_string - ); + asm volatile("movb $1,%0" : "+m" (lock->slock) :: "memory"); } #else -#define __raw_spin_unlock_string \ - "xchgb %b0, %1" \ - :"=q" (oldval), "+m" (lock->slock) \ - :"0" (oldval) : "memory" - static inline void __raw_spin_unlock(raw_spinlock_t *lock) { char oldval = 1; - __asm__ __volatile__( - __raw_spin_unlock_string - ); + asm volatile("xchgb %b0, %1" + : "=q" (oldval), "+m" (lock->slock) + : "0" (oldval) : "memory"); } #endif -#define __raw_spin_unlock_wait(lock) \ - do { while (__raw_spin_is_locked(lock)) cpu_relax(); } while (0) +static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock) +{ + while (__raw_spin_is_locked(lock)) + cpu_relax(); +} /* * Read-write spinlocks, allowing multiple readers @@ -151,22 +140,36 @@ static inline void __raw_spin_unlock(raw_spinlock_t *lock) * read_can_lock - would read_trylock() succeed? * @lock: the rwlock in question. */ -#define __raw_read_can_lock(x) ((int)(x)->lock > 0) +static inline int __raw_read_can_lock(raw_rwlock_t *x) +{ + return (int)(x)->lock > 0; +} /** * write_can_lock - would write_trylock() succeed? * @lock: the rwlock in question. */ -#define __raw_write_can_lock(x) ((x)->lock == RW_LOCK_BIAS) +static inline int __raw_write_can_lock(raw_rwlock_t *x) +{ + return (x)->lock == RW_LOCK_BIAS; +} static inline void __raw_read_lock(raw_rwlock_t *rw) { - __build_read_lock(rw, "__read_lock_failed"); + asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t" + "jns 1f\n" + "call __read_lock_failed\n\t" + "1:\n" + ::"a" (rw) : "memory"); } static inline void __raw_write_lock(raw_rwlock_t *rw) { - __build_write_lock(rw, "__write_lock_failed"); + asm volatile(LOCK_PREFIX " subl $" RW_LOCK_BIAS_STR ",(%0)\n\t" + "jz 1f\n" + "call __write_lock_failed\n\t" + "1:\n" + ::"a" (rw) : "memory"); } static inline int __raw_read_trylock(raw_rwlock_t *lock) -- cgit v1.2.1 From 8d379dad8f1670d233ac67b76b1c5a42ad3714a3 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 26 Sep 2006 10:52:32 +0200 Subject: [PATCH] annotate arch/x86_64/lib/*.S Add unwind annotations to arch/x86_64/lib/*.S, and also use the macros provided by linux/linkage.h where-ever possible. Some of the alternative instructions handling needed to be adjusted so that the replacement code would also have valid unwind information. Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/x86_64/lib/clear_page.S | 47 +++++++++++++++----------- arch/x86_64/lib/copy_page.S | 53 +++++++++++++++++++---------- arch/x86_64/lib/copy_user.S | 39 ++++++++++++++++------ arch/x86_64/lib/csum-copy.S | 26 ++++++++++++--- arch/x86_64/lib/getuser.S | 32 +++++++++++------- arch/x86_64/lib/iomap_copy.S | 10 ++++-- arch/x86_64/lib/memcpy.S | 69 +++++++++++++++++++++----------------- arch/x86_64/lib/memset.S | 79 ++++++++++++++++++++++++-------------------- arch/x86_64/lib/putuser.S | 32 +++++++++++------- 9 files changed, 244 insertions(+), 143 deletions(-) diff --git a/arch/x86_64/lib/clear_page.S b/arch/x86_64/lib/clear_page.S index 1f81b79b796c..9a10a78bb4a4 100644 --- a/arch/x86_64/lib/clear_page.S +++ b/arch/x86_64/lib/clear_page.S @@ -1,10 +1,22 @@ +#include +#include + /* * Zero a page. * rdi page */ - .globl clear_page - .p2align 4 -clear_page: + ALIGN +clear_page_c: + CFI_STARTPROC + movl $4096/8,%ecx + xorl %eax,%eax + rep stosq + ret + CFI_ENDPROC +ENDPROC(clear_page) + +ENTRY(clear_page) + CFI_STARTPROC xorl %eax,%eax movl $4096/64,%ecx .p2align 4 @@ -23,28 +35,25 @@ clear_page: jnz .Lloop nop ret -clear_page_end: + CFI_ENDPROC +.Lclear_page_end: +ENDPROC(clear_page) /* Some CPUs run faster using the string instructions. It is also a lot simpler. Use this when possible */ #include + .section .altinstr_replacement,"ax" +1: .byte 0xeb /* jmp */ + .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */ +2: + .previous .section .altinstructions,"a" .align 8 - .quad clear_page - .quad clear_page_c - .byte X86_FEATURE_REP_GOOD - .byte clear_page_end-clear_page - .byte clear_page_c_end-clear_page_c - .previous - - .section .altinstr_replacement,"ax" -clear_page_c: - movl $4096/8,%ecx - xorl %eax,%eax - rep - stosq - ret -clear_page_c_end: + .quad clear_page + .quad 1b + .byte X86_FEATURE_REP_GOOD + .byte .Lclear_page_end - clear_page + .byte 2b - 1b .previous diff --git a/arch/x86_64/lib/copy_page.S b/arch/x86_64/lib/copy_page.S index 8fa19d96a7ee..0ebb03b60e79 100644 --- a/arch/x86_64/lib/copy_page.S +++ b/arch/x86_64/lib/copy_page.S @@ -1,17 +1,33 @@ /* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */ +#include +#include +#include + + ALIGN +copy_page_c: + CFI_STARTPROC + movl $4096/8,%ecx + rep movsq + ret + CFI_ENDPROC +ENDPROC(copy_page_c) + /* Don't use streaming store because it's better when the target ends up in cache. */ /* Could vary the prefetch distance based on SMP/UP */ - .globl copy_page - .p2align 4 -copy_page: +ENTRY(copy_page) + CFI_STARTPROC subq $3*8,%rsp + CFI_ADJUST_CFA_OFFSET 3*8 movq %rbx,(%rsp) + CFI_REL_OFFSET rbx, 0 movq %r12,1*8(%rsp) + CFI_REL_OFFSET r12, 1*8 movq %r13,2*8(%rsp) + CFI_REL_OFFSET r13, 2*8 movl $(4096/64)-5,%ecx .p2align 4 @@ -72,30 +88,33 @@ copy_page: jnz .Loop2 movq (%rsp),%rbx + CFI_RESTORE rbx movq 1*8(%rsp),%r12 + CFI_RESTORE r12 movq 2*8(%rsp),%r13 + CFI_RESTORE r13 addq $3*8,%rsp + CFI_ADJUST_CFA_OFFSET -3*8 ret +.Lcopy_page_end: + CFI_ENDPROC +ENDPROC(copy_page) /* Some CPUs run faster using the string copy instructions. It is also a lot simpler. Use this when possible */ #include + .section .altinstr_replacement,"ax" +1: .byte 0xeb /* jmp */ + .byte (copy_page_c - copy_page) - (2f - 1b) /* offset */ +2: + .previous .section .altinstructions,"a" .align 8 - .quad copy_page - .quad copy_page_c - .byte X86_FEATURE_REP_GOOD - .byte copy_page_c_end-copy_page_c - .byte copy_page_c_end-copy_page_c - .previous - - .section .altinstr_replacement,"ax" -copy_page_c: - movl $4096/8,%ecx - rep - movsq - ret -copy_page_c_end: + .quad copy_page + .quad 1b + .byte X86_FEATURE_REP_GOOD + .byte .Lcopy_page_end - copy_page + .byte 2b - 1b .previous diff --git a/arch/x86_64/lib/copy_user.S b/arch/x86_64/lib/copy_user.S index f64569b83b54..962f3a693c5e 100644 --- a/arch/x86_64/lib/copy_user.S +++ b/arch/x86_64/lib/copy_user.S @@ -4,6 +4,9 @@ * Functions to copy from and to user space. */ +#include +#include + #define FIX_ALIGNMENT 1 #include @@ -12,9 +15,8 @@ #include /* Standard copy_to_user with segment limit checking */ - .globl copy_to_user - .p2align 4 -copy_to_user: +ENTRY(copy_to_user) + CFI_STARTPROC GET_THREAD_INFO(%rax) movq %rdi,%rcx addq %rdx,%rcx @@ -25,9 +27,11 @@ copy_to_user: .byte 0xe9 /* 32bit jump */ .long .Lcug-1f 1: + CFI_ENDPROC +ENDPROC(copy_to_user) .section .altinstr_replacement,"ax" -3: .byte 0xe9 /* replacement jmp with 8 bit immediate */ +3: .byte 0xe9 /* replacement jmp with 32 bit immediate */ .long copy_user_generic_c-1b /* offset */ .previous .section .altinstructions,"a" @@ -40,9 +44,8 @@ copy_to_user: .previous /* Standard copy_from_user with segment limit checking */ - .globl copy_from_user - .p2align 4 -copy_from_user: +ENTRY(copy_from_user) + CFI_STARTPROC GET_THREAD_INFO(%rax) movq %rsi,%rcx addq %rdx,%rcx @@ -50,10 +53,13 @@ copy_from_user: cmpq threadinfo_addr_limit(%rax),%rcx jae bad_from_user /* FALL THROUGH to copy_user_generic */ + CFI_ENDPROC +ENDPROC(copy_from_user) .section .fixup,"ax" /* must zero dest */ bad_from_user: + CFI_STARTPROC movl %edx,%ecx xorl %eax,%eax rep @@ -61,6 +67,8 @@ bad_from_user: bad_to_user: movl %edx,%eax ret + CFI_ENDPROC +END(bad_from_user) .previous @@ -75,9 +83,8 @@ bad_to_user: * Output: * eax uncopied bytes or 0 if successful. */ - .globl copy_user_generic - .p2align 4 -copy_user_generic: +ENTRY(copy_user_generic) + CFI_STARTPROC .byte 0x66,0x66,0x90 /* 5 byte nop for replacement jump */ .byte 0x66,0x90 1: @@ -95,6 +102,8 @@ copy_user_generic: .previous .Lcug: pushq %rbx + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rbx, 0 xorl %eax,%eax /*zero for the exception handler */ #ifdef FIX_ALIGNMENT @@ -168,9 +177,13 @@ copy_user_generic: decl %ecx jnz .Lloop_1 + CFI_REMEMBER_STATE .Lende: popq %rbx + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE rbx ret + CFI_RESTORE_STATE #ifdef FIX_ALIGNMENT /* align destination */ @@ -261,6 +274,9 @@ copy_user_generic: .Le_zero: movq %rdx,%rax jmp .Lende + CFI_ENDPROC +ENDPROC(copy_user_generic) + /* Some CPUs run faster using the string copy instructions. This is also a lot simpler. Use them when possible. @@ -282,6 +298,7 @@ copy_user_generic: * this please consider this. */ copy_user_generic_c: + CFI_STARTPROC movl %edx,%ecx shrl $3,%ecx andl $7,%edx @@ -294,6 +311,8 @@ copy_user_generic_c: ret 3: lea (%rdx,%rcx,8),%rax ret + CFI_ENDPROC +END(copy_user_generic_c) .section __ex_table,"a" .quad 1b,3b diff --git a/arch/x86_64/lib/csum-copy.S b/arch/x86_64/lib/csum-copy.S index 72fd55ee896e..f0dba36578ea 100644 --- a/arch/x86_64/lib/csum-copy.S +++ b/arch/x86_64/lib/csum-copy.S @@ -5,8 +5,9 @@ * License. See the file COPYING in the main directory of this archive * for more details. No warranty for anything given at all. */ - #include - #include +#include +#include +#include /* * Checksum copy with exception handling. @@ -53,19 +54,24 @@ .endm - .globl csum_partial_copy_generic - .p2align 4 -csum_partial_copy_generic: +ENTRY(csum_partial_copy_generic) + CFI_STARTPROC cmpl $3*64,%edx jle .Lignore .Lignore: subq $7*8,%rsp + CFI_ADJUST_CFA_OFFSET 7*8 movq %rbx,2*8(%rsp) + CFI_REL_OFFSET rbx, 2*8 movq %r12,3*8(%rsp) + CFI_REL_OFFSET r12, 3*8 movq %r14,4*8(%rsp) + CFI_REL_OFFSET r14, 4*8 movq %r13,5*8(%rsp) + CFI_REL_OFFSET r13, 5*8 movq %rbp,6*8(%rsp) + CFI_REL_OFFSET rbp, 6*8 movq %r8,(%rsp) movq %r9,1*8(%rsp) @@ -208,14 +214,22 @@ csum_partial_copy_generic: addl %ebx,%eax adcl %r9d,%eax /* carry */ + CFI_REMEMBER_STATE .Lende: movq 2*8(%rsp),%rbx + CFI_RESTORE rbx movq 3*8(%rsp),%r12 + CFI_RESTORE r12 movq 4*8(%rsp),%r14 + CFI_RESTORE r14 movq 5*8(%rsp),%r13 + CFI_RESTORE r13 movq 6*8(%rsp),%rbp + CFI_RESTORE rbp addq $7*8,%rsp + CFI_ADJUST_CFA_OFFSET -7*8 ret + CFI_RESTORE_STATE /* Exception handlers. Very simple, zeroing is done in the wrappers */ .Lbad_source: @@ -231,3 +245,5 @@ csum_partial_copy_generic: jz .Lende movl $-EFAULT,(%rax) jmp .Lende + CFI_ENDPROC +ENDPROC(csum_partial_copy_generic) diff --git a/arch/x86_64/lib/getuser.S b/arch/x86_64/lib/getuser.S index 3844d5e885a4..5448876261f8 100644 --- a/arch/x86_64/lib/getuser.S +++ b/arch/x86_64/lib/getuser.S @@ -27,25 +27,26 @@ */ #include +#include #include #include #include #include .text - .p2align 4 -.globl __get_user_1 -__get_user_1: +ENTRY(__get_user_1) + CFI_STARTPROC GET_THREAD_INFO(%r8) cmpq threadinfo_addr_limit(%r8),%rcx jae bad_get_user 1: movzb (%rcx),%edx xorl %eax,%eax ret + CFI_ENDPROC +ENDPROC(__get_user_1) - .p2align 4 -.globl __get_user_2 -__get_user_2: +ENTRY(__get_user_2) + CFI_STARTPROC GET_THREAD_INFO(%r8) addq $1,%rcx jc 20f @@ -57,10 +58,11 @@ __get_user_2: ret 20: decq %rcx jmp bad_get_user + CFI_ENDPROC +ENDPROC(__get_user_2) - .p2align 4 -.globl __get_user_4 -__get_user_4: +ENTRY(__get_user_4) + CFI_STARTPROC GET_THREAD_INFO(%r8) addq $3,%rcx jc 30f @@ -72,10 +74,11 @@ __get_user_4: ret 30: subq $3,%rcx jmp bad_get_user + CFI_ENDPROC +ENDPROC(__get_user_4) - .p2align 4 -.globl __get_user_8 -__get_user_8: +ENTRY(__get_user_8) + CFI_STARTPROC GET_THREAD_INFO(%r8) addq $7,%rcx jc 40f @@ -87,11 +90,16 @@ __get_user_8: ret 40: subq $7,%rcx jmp bad_get_user + CFI_ENDPROC +ENDPROC(__get_user_8) bad_get_user: + CFI_STARTPROC xorl %edx,%edx movq $(-EFAULT),%rax ret + CFI_ENDPROC +END(bad_get_user) .section __ex_table,"a" .quad 1b,bad_get_user diff --git a/arch/x86_64/lib/iomap_copy.S b/arch/x86_64/lib/iomap_copy.S index 8bbade5fea05..05a95e713da8 100644 --- a/arch/x86_64/lib/iomap_copy.S +++ b/arch/x86_64/lib/iomap_copy.S @@ -15,12 +15,16 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ +#include +#include + /* * override generic version in lib/iomap_copy.c */ - .globl __iowrite32_copy - .p2align 4 -__iowrite32_copy: +ENTRY(__iowrite32_copy) + CFI_STARTPROC movl %edx,%ecx rep movsd ret + CFI_ENDPROC +ENDPROC(__iowrite32_copy) diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S index 5554948b5554..967b22fa7d07 100644 --- a/arch/x86_64/lib/memcpy.S +++ b/arch/x86_64/lib/memcpy.S @@ -1,6 +1,10 @@ /* Copyright 2002 Andi Kleen */ - #include +#include +#include +#include +#include + /* * memcpy - Copy a memory block. * @@ -13,12 +17,26 @@ * rax original destination */ - .globl __memcpy - .globl memcpy - .p2align 4 -__memcpy: -memcpy: + ALIGN +memcpy_c: + CFI_STARTPROC + movq %rdi,%rax + movl %edx,%ecx + shrl $3,%ecx + andl $7,%edx + rep movsq + movl %edx,%ecx + rep movsb + ret + CFI_ENDPROC +ENDPROC(memcpy_c) + +ENTRY(__memcpy) +ENTRY(memcpy) + CFI_STARTPROC pushq %rbx + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rbx, 0 movq %rdi,%rax movl %edx,%ecx @@ -86,36 +104,27 @@ memcpy: .Lende: popq %rbx + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE rbx ret .Lfinal: + CFI_ENDPROC +ENDPROC(memcpy) +ENDPROC(__memcpy) /* Some CPUs run faster using the string copy instructions. It is also a lot simpler. Use this when possible */ + .section .altinstr_replacement,"ax" +1: .byte 0xeb /* jmp */ + .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */ +2: + .previous .section .altinstructions,"a" .align 8 - .quad memcpy - .quad memcpy_c - .byte X86_FEATURE_REP_GOOD - .byte .Lfinal-memcpy - .byte memcpy_c_end-memcpy_c - .previous - - .section .altinstr_replacement,"ax" - /* rdi destination - * rsi source - * rdx count - */ -memcpy_c: - movq %rdi,%rax - movl %edx,%ecx - shrl $3,%ecx - andl $7,%edx - rep - movsq - movl %edx,%ecx - rep - movsb - ret -memcpy_c_end: + .quad memcpy + .quad 1b + .byte X86_FEATURE_REP_GOOD + .byte .Lfinal - memcpy + .byte 2b - 1b .previous diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S index ad397f2c7de8..09ed1f6b0eaa 100644 --- a/arch/x86_64/lib/memset.S +++ b/arch/x86_64/lib/memset.S @@ -1,4 +1,9 @@ /* Copyright 2002 Andi Kleen, SuSE Labs */ + +#include +#include +#include + /* * ISO C memset - set a memory block to a byte value. * @@ -8,11 +13,29 @@ * * rax original destination */ - .globl __memset - .globl memset - .p2align 4 -memset: -__memset: + ALIGN +memset_c: + CFI_STARTPROC + movq %rdi,%r9 + movl %edx,%r8d + andl $7,%r8d + movl %edx,%ecx + shrl $3,%ecx + /* expand byte value */ + movzbl %sil,%esi + movabs $0x0101010101010101,%rax + mulq %rsi /* with rax, clobbers rdx */ + rep stosq + movl %r8d,%ecx + rep stosb + movq %r9,%rax + ret + CFI_ENDPROC +ENDPROC(memset_c) + +ENTRY(memset) +ENTRY(__memset) + CFI_STARTPROC movq %rdi,%r10 movq %rdx,%r11 @@ -25,6 +48,7 @@ __memset: movl %edi,%r9d andl $7,%r9d jnz .Lbad_alignment + CFI_REMEMBER_STATE .Lafter_bad_alignment: movl %r11d,%ecx @@ -75,6 +99,7 @@ __memset: movq %r10,%rax ret + CFI_RESTORE_STATE .Lbad_alignment: cmpq $7,%r11 jbe .Lhandle_7 @@ -84,42 +109,26 @@ __memset: addq %r8,%rdi subq %r8,%r11 jmp .Lafter_bad_alignment +.Lfinal: + CFI_ENDPROC +ENDPROC(memset) +ENDPROC(__memset) /* Some CPUs run faster using the string instructions. It is also a lot simpler. Use this when possible */ #include + .section .altinstr_replacement,"ax" +1: .byte 0xeb /* jmp */ + .byte (memset_c - memset) - (2f - 1b) /* offset */ +2: + .previous .section .altinstructions,"a" .align 8 - .quad memset - .quad memset_c - .byte X86_FEATURE_REP_GOOD - .byte memset_c_end-memset_c - .byte memset_c_end-memset_c - .previous - - .section .altinstr_replacement,"ax" - /* rdi destination - * rsi value - * rdx count - */ -memset_c: - movq %rdi,%r9 - movl %edx,%r8d - andl $7,%r8d - movl %edx,%ecx - shrl $3,%ecx - /* expand byte value */ - movzbl %sil,%esi - movabs $0x0101010101010101,%rax - mulq %rsi /* with rax, clobbers rdx */ - rep - stosq - movl %r8d,%ecx - rep - stosb - movq %r9,%rax - ret -memset_c_end: + .quad memset + .quad 1b + .byte X86_FEATURE_REP_GOOD + .byte .Lfinal - memset + .byte 2b - 1b .previous diff --git a/arch/x86_64/lib/putuser.S b/arch/x86_64/lib/putuser.S index 7f5593974e2d..4989f5a8fa9b 100644 --- a/arch/x86_64/lib/putuser.S +++ b/arch/x86_64/lib/putuser.S @@ -25,25 +25,26 @@ */ #include +#include #include #include #include #include .text - .p2align 4 -.globl __put_user_1 -__put_user_1: +ENTRY(__put_user_1) + CFI_STARTPROC GET_THREAD_INFO(%r8) cmpq threadinfo_addr_limit(%r8),%rcx jae bad_put_user 1: movb %dl,(%rcx) xorl %eax,%eax ret + CFI_ENDPROC +ENDPROC(__put_user_1) - .p2align 4 -.globl __put_user_2 -__put_user_2: +ENTRY(__put_user_2) + CFI_STARTPROC GET_THREAD_INFO(%r8) addq $1,%rcx jc 20f @@ -55,10 +56,11 @@ __put_user_2: ret 20: decq %rcx jmp bad_put_user + CFI_ENDPROC +ENDPROC(__put_user_2) - .p2align 4 -.globl __put_user_4 -__put_user_4: +ENTRY(__put_user_4) + CFI_STARTPROC GET_THREAD_INFO(%r8) addq $3,%rcx jc 30f @@ -70,10 +72,11 @@ __put_user_4: ret 30: subq $3,%rcx jmp bad_put_user + CFI_ENDPROC +ENDPROC(__put_user_4) - .p2align 4 -.globl __put_user_8 -__put_user_8: +ENTRY(__put_user_8) + CFI_STARTPROC GET_THREAD_INFO(%r8) addq $7,%rcx jc 40f @@ -85,10 +88,15 @@ __put_user_8: ret 40: subq $7,%rcx jmp bad_put_user + CFI_ENDPROC +ENDPROC(__put_user_8) bad_put_user: + CFI_STARTPROC movq $(-EFAULT),%rax ret + CFI_ENDPROC +END(bad_put_user) .section __ex_table,"a" .quad 1b,bad_put_user -- cgit v1.2.1 From 29a6c25bd63dbd52e73d25268402d065ad84c0fe Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 26 Sep 2006 10:52:32 +0200 Subject: [PATCH] Fix gdt table size in trampoline.S Allows easier extension of the GDT by using the proper C symbol for the size in the descriptor. Signed-off-by: Eric W. Biederman Signed-off-by: Andi Kleen --- arch/x86_64/kernel/trampoline.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86_64/kernel/trampoline.S b/arch/x86_64/kernel/trampoline.S index 23a03eb91fc7..c79b99a9e2f6 100644 --- a/arch/x86_64/kernel/trampoline.S +++ b/arch/x86_64/kernel/trampoline.S @@ -64,7 +64,7 @@ idt_48: .word 0, 0 # idt base = 0L gdt_48: - .short __KERNEL32_CS + 7 # gdt limit + .short GDT_ENTRIES*8 - 1 # gdt limit .long cpu_gdt_table-__START_KERNEL_map .globl trampoline_end -- cgit v1.2.1 From 145106e81090fcdfb57b7f2e90b98c6445b6f33d Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda Date: Tue, 26 Sep 2006 10:52:32 +0200 Subject: [PATCH] remove superflous BUG_ON's in nommu and gart There's no need to check for invalid DMA data direction in nommu and gart since we do it in dma-mapping.h anyway before calling the individual dma-ops. Signed-off-by: Muli Ben-Yehuda Signed-off-by: Andi Kleen --- arch/x86_64/kernel/pci-gart.c | 3 --- arch/x86_64/kernel/pci-nommu.c | 1 - 2 files changed, 4 deletions(-) diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c index 6d3e61baf7a0..16261a8a3303 100644 --- a/arch/x86_64/kernel/pci-gart.c +++ b/arch/x86_64/kernel/pci-gart.c @@ -239,8 +239,6 @@ dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir) { unsigned long phys_mem, bus; - BUG_ON(dir == DMA_NONE); - if (!dev) dev = &fallback_dev; @@ -383,7 +381,6 @@ int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) unsigned long pages = 0; int need = 0, nextneed; - BUG_ON(dir == DMA_NONE); if (nents == 0) return 0; diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c index aad7609d8e92..df09ab05a1bd 100644 --- a/arch/x86_64/kernel/pci-nommu.c +++ b/arch/x86_64/kernel/pci-nommu.c @@ -59,7 +59,6 @@ int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, { int i; - BUG_ON(direction == DMA_NONE); for (i = 0; i < nents; i++ ) { struct scatterlist *s = &sg[i]; BUG_ON(!s->page); -- cgit v1.2.1 From d4d35854a1cf7fb170c8e133a5e40a9d19e7fc17 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert <76306.1226@compuserve.com> Date: Tue, 26 Sep 2006 10:52:32 +0200 Subject: [PATCH] remove lock prefix from is_at_popf() tests The lock prefix will cause an exception when used with the popf instruction, so no need to continue searching after it's found. Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Signed-off-by: Andi Kleen --- arch/x86_64/kernel/ptrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c index 3a52c7bcfd12..412c6a8bf906 100644 --- a/arch/x86_64/kernel/ptrace.c +++ b/arch/x86_64/kernel/ptrace.c @@ -138,7 +138,7 @@ static int is_at_popf(struct task_struct *child, struct pt_regs *regs) case 0x26: case 0x2e: case 0x36: case 0x3e: case 0x64: case 0x65: - case 0xf0: case 0xf2: case 0xf3: + case 0xf2: case 0xf3: continue; case 0x40 ... 0x4f: @@ -148,7 +148,7 @@ static int is_at_popf(struct task_struct *child, struct pt_regs *regs) /* 64-bit mode: REX prefix */ continue; - /* CHECKME: f0, f2, f3 */ + /* CHECKME: f2, f3 */ /* * pushf: NOTE! We should probably not let -- cgit v1.2.1 From 9ca33eb6981549c0d1b7aea7f99f1ba602161356 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:32 +0200 Subject: [PATCH] Use early CPU identify before early command line parsing This makes it possible to modify CPU flags in command line options without hacks. And remove another copy in head64.c Signed-off-by: Andi Kleen --- arch/x86_64/kernel/head64.c | 19 ------------------- arch/x86_64/kernel/setup.c | 4 ++-- 2 files changed, 2 insertions(+), 21 deletions(-) diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c index bacbd75c63ad..f2461fde9f8f 100644 --- a/arch/x86_64/kernel/head64.c +++ b/arch/x86_64/kernel/head64.c @@ -56,24 +56,6 @@ static void __init copy_bootdata(char *real_mode_data) printk("Bootdata ok (command line is %s)\n", saved_command_line); } -static void __init setup_boot_cpu_data(void) -{ - unsigned int dummy, eax; - - /* get vendor info */ - cpuid(0, (unsigned int *)&boot_cpu_data.cpuid_level, - (unsigned int *)&boot_cpu_data.x86_vendor_id[0], - (unsigned int *)&boot_cpu_data.x86_vendor_id[8], - (unsigned int *)&boot_cpu_data.x86_vendor_id[4]); - - /* get cpu type */ - cpuid(1, &eax, &dummy, &dummy, - (unsigned int *) &boot_cpu_data.x86_capability); - boot_cpu_data.x86 = (eax >> 8) & 0xf; - boot_cpu_data.x86_model = (eax >> 4) & 0xf; - boot_cpu_data.x86_mask = eax & 0xf; -} - void __init x86_64_start_kernel(char * real_mode_data) { char *s; @@ -117,6 +99,5 @@ void __init x86_64_start_kernel(char * real_mode_data) if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE) panic("Kernel too big for kernel mapping\n"); - setup_boot_cpu_data(); start_kernel(); } diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index ddc3b2d9a165..73f1cdd140fe 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -545,10 +545,10 @@ void __init setup_arch(char **cmdline_p) data_resource.start = virt_to_phys(&_etext); data_resource.end = virt_to_phys(&_edata)-1; - parse_cmdline_early(cmdline_p); - early_identify_cpu(&boot_cpu_data); + parse_cmdline_early(cmdline_p); + /* * partially used pages are not usable - thus * we are rounding upwards: -- cgit v1.2.1 From 33df0d19ea425d28bd5afb48898af32237fe81af Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 26 Sep 2006 10:52:32 +0200 Subject: [PATCH] Allow early_param and identical __setup to exist We currently assume that boot parameters which are handled by early_param() will not overlap boot parameters handled by __setup: if they do, behaviour is dependent on link order, usually meaning __setup will not get called. ACPI wants to use early_param("pci"), and pci uses __setup("pci="), so we modify the core to let them coexist: "pci=noacpi" will now get passed to both. Signed-off-by: Rusty Russell Signed-off-by: Andi Kleen --- init/main.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/init/main.c b/init/main.c index 8651a720a092..6b69564738c1 100644 --- a/init/main.c +++ b/init/main.c @@ -162,16 +162,19 @@ extern struct obs_kernel_param __setup_start[], __setup_end[]; static int __init obsolete_checksetup(char *line) { struct obs_kernel_param *p; + int had_early_param = 0; p = __setup_start; do { int n = strlen(p->str); if (!strncmp(line, p->str, n)) { if (p->early) { - /* Already done in parse_early_param? (Needs - * exact match on param part) */ + /* Already done in parse_early_param? + * (Needs exact match on param part). + * Keep iterating, as we can have early + * params and __setups of same names 8( */ if (line[n] == '\0' || line[n] == '=') - return 1; + had_early_param = 1; } else if (!p->setup_func) { printk(KERN_WARNING "Parameter %s is obsolete," " ignored\n", p->str); @@ -181,7 +184,8 @@ static int __init obsolete_checksetup(char *line) } p++; } while (p < __setup_end); - return 0; + + return had_early_param; } /* -- cgit v1.2.1 From 1a3f239ddf9208f2e52d36fef1c1c4518cbbbabe Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 26 Sep 2006 10:52:32 +0200 Subject: [PATCH] i386: Replace i386 open-coded cmdline parsing with This patch replaces the open-coded early commandline parsing throughout the i386 boot code with the generic mechanism (already used by ppc, powerpc, ia64 and s390). The code was inconsistent with whether it deletes the option from the cmdline or not, meaning some of these will get passed through the environment into init. This transformation is mainly mechanical, but there are some notable parts: 1) Grammar: s/linux never set's it up/linux never sets it up/ 2) Remove hacked-in earlyprintk= option scanning. When someone actually implements CONFIG_EARLY_PRINTK, then they can use early_param(). [AK: actually it is implemented, but I'm adding the early_param it in the next x86-64 patch] 3) Move declaration of generic_apic_probe() from setup.c into asm/apic.h 4) Various parameters now moved into their appropriate files (thanks Andi). 5) All parse functions which examine arg need to check for NULL, except one where it has subtle humor value. AK: readded acpi_sci handling which was completely dropped AK: moved some more variables into acpi/boot.c Cc: len.brown@intel.com Signed-off-by: Rusty Russell Signed-off-by: Andi Kleen --- arch/i386/kernel/acpi/boot.c | 76 ++++++++- arch/i386/kernel/apic.c | 15 ++ arch/i386/kernel/io_apic.c | 24 ++- arch/i386/kernel/machine_kexec.c | 23 +++ arch/i386/kernel/setup.c | 350 +++++++++++++-------------------------- arch/i386/kernel/smpboot.c | 13 ++ arch/i386/mach-generic/probe.c | 58 ++++--- arch/i386/mm/init.c | 18 +- include/asm-i386/acpi.h | 14 -- include/asm-i386/apic.h | 4 +- include/asm-i386/io_apic.h | 11 ++ include/asm-i386/pgtable.h | 2 - 12 files changed, 319 insertions(+), 289 deletions(-) diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c index ee003bc0e8b1..87e2ab50b694 100644 --- a/arch/i386/kernel/acpi/boot.c +++ b/arch/i386/kernel/acpi/boot.c @@ -36,6 +36,8 @@ #include #include +int __initdata acpi_force = 0; + #ifdef CONFIG_X86_64 extern void __init clustered_apic_check(void); @@ -860,8 +862,6 @@ static void __init acpi_process_madt(void) return; } -extern int acpi_force; - #ifdef __i386__ static int __init disable_acpi_irq(struct dmi_system_id *d) @@ -1163,3 +1163,75 @@ int __init acpi_boot_init(void) return 0; } + +static int __init parse_acpi(char *arg) +{ + if (!arg) + return -EINVAL; + + /* "acpi=off" disables both ACPI table parsing and interpreter */ + if (strcmp(arg, "off") == 0) { + disable_acpi(); + } + /* acpi=force to over-ride black-list */ + else if (strcmp(arg, "force") == 0) { + acpi_force = 1; + acpi_ht = 1; + acpi_disabled = 0; + } + /* acpi=strict disables out-of-spec workarounds */ + else if (strcmp(arg, "strict") == 0) { + acpi_strict = 1; + } + /* Limit ACPI just to boot-time to enable HT */ + else if (strcmp(arg, "ht") == 0) { + if (!acpi_force) + disable_acpi(); + acpi_ht = 1; + } + /* "acpi=noirq" disables ACPI interrupt routing */ + else if (strcmp(arg, "noirq") == 0) { + acpi_noirq_set(); + } else { + /* Core will printk when we return error. */ + return -EINVAL; + } + return 0; +} +early_param("acpi", parse_acpi); + +/* FIXME: Using pci= for an ACPI parameter is a travesty. */ +static int __init parse_pci(char *arg) +{ + if (arg && strcmp(arg, "noacpi") == 0) + acpi_disable_pci(); + return 0; +} +early_param("pci", parse_pci); + +#ifdef CONFIG_X86_IO_APIC +static int __init parse_acpi_skip_timer_override(char *arg) +{ + acpi_skip_timer_override = 1; + return 0; +} +early_param("acpi_skip_timer_override", parse_acpi_skip_timer_override); +#endif /* CONFIG_X86_IO_APIC */ + +static int __init setup_acpi_sci(char *s) +{ + if (!s) + return -EINVAL; + if (!strcmp(s, "edge")) + acpi_sci_flags.trigger = 1; + else if (!strcmp(s, "level")) + acpi_sci_flags.trigger = 3; + else if (!strcmp(s, "high")) + acpi_sci_flags.polarity = 1; + else if (!strcmp(s, "low")) + acpi_sci_flags.polarity = 3; + else + return -EINVAL; + return 0; +} +early_param("acpi_sci", setup_acpi_sci); diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index 1a34fc57800b..ce75e71b694f 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -1372,3 +1372,18 @@ int __init APIC_init_uniprocessor (void) return 0; } + +static int __init parse_lapic(char *arg) +{ + lapic_enable(); + return 0; +} +early_param("lapic", parse_lapic); + +static int __init parse_nolapic(char *arg) +{ + lapic_disable(); + return 0; +} +early_param("nolapic", parse_nolapic); + diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index b713f27d7e86..fd0df75cfbda 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -66,7 +66,7 @@ int sis_apic_bug = -1; */ int nr_ioapic_registers[MAX_IO_APICS]; -int disable_timer_pin_1 __initdata; +static int disable_timer_pin_1 __initdata; /* * Rough estimation of how many shared IRQs there are, can @@ -2691,3 +2691,25 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a } #endif /* CONFIG_ACPI */ + +static int __init parse_disable_timer_pin_1(char *arg) +{ + disable_timer_pin_1 = 1; + return 0; +} +early_param("disable_timer_pin_1", parse_disable_timer_pin_1); + +static int __init parse_enable_timer_pin_1(char *arg) +{ + disable_timer_pin_1 = -1; + return 0; +} +early_param("enable_timer_pin_1", parse_enable_timer_pin_1); + +static int __init parse_noapic(char *arg) +{ + /* disable IO-APIC */ + disable_ioapic_setup(); + return 0; +} +early_param("noapic", parse_noapic); diff --git a/arch/i386/kernel/machine_kexec.c b/arch/i386/kernel/machine_kexec.c index 6b1ae6ba76f0..66c3dc99a655 100644 --- a/arch/i386/kernel/machine_kexec.c +++ b/arch/i386/kernel/machine_kexec.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -209,3 +210,25 @@ NORET_TYPE void machine_kexec(struct kimage *image) rnk = (relocate_new_kernel_t) reboot_code_buffer; (*rnk)(page_list, reboot_code_buffer, image->start, cpu_has_pae); } + +/* crashkernel=size@addr specifies the location to reserve for + * a crash kernel. By reserving this memory we guarantee + * that linux never sets it up as a DMA target. + * Useful for holding code to do something appropriate + * after a kernel panic. + */ +static int __init parse_crashkernel(char *arg) +{ + unsigned long size, base; + size = memparse(arg, &arg); + if (*arg == '@') { + base = memparse(arg+1, &arg); + /* FIXME: Do I want a sanity check + * to validate the memory range? + */ + crashk_res.start = base; + crashk_res.end = base + size - 1; + } + return 0; +} +early_param("crashkernel", parse_crashkernel); diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 71a540362b78..c6e31ed386f5 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -96,11 +96,6 @@ unsigned long mmu_cr4_features; #endif EXPORT_SYMBOL(acpi_disabled); -#ifdef CONFIG_ACPI -int __initdata acpi_force = 0; -extern acpi_interrupt_flags acpi_sci_flags; -#endif - /* for MCA, but anyone else can use it if they want */ unsigned int machine_id; #ifdef CONFIG_MCA @@ -148,7 +143,6 @@ EXPORT_SYMBOL(ist_info); struct e820map e820; extern void early_cpu_init(void); -extern void generic_apic_probe(char *); extern int root_mountflags; unsigned long saved_videomode; @@ -700,238 +694,132 @@ static inline void copy_edd(void) } #endif -static void __init parse_cmdline_early (char ** cmdline_p) -{ - char c = ' ', *to = command_line, *from = saved_command_line; - int len = 0; - int userdef = 0; +static int __initdata user_defined_memmap = 0; - /* Save unparsed command line copy for /proc/cmdline */ - saved_command_line[COMMAND_LINE_SIZE-1] = '\0'; +/* + * "mem=nopentium" disables the 4MB page tables. + * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM + * to , overriding the bios size. + * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from + * to +, overriding the bios size. + * + * HPA tells me bootloaders need to parse mem=, so no new + * option should be mem= [also see Documentation/i386/boot.txt] + */ +static int __init parse_mem(char *arg) +{ + if (!arg) + return -EINVAL; - for (;;) { - if (c != ' ') - goto next_char; - /* - * "mem=nopentium" disables the 4MB page tables. - * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM - * to , overriding the bios size. - * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from - * to +, overriding the bios size. - * - * HPA tells me bootloaders need to parse mem=, so no new - * option should be mem= [also see Documentation/i386/boot.txt] + if (strcmp(arg, "nopentium") == 0) { + clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); + disable_pse = 1; + } else { + /* If the user specifies memory size, we + * limit the BIOS-provided memory map to + * that size. exactmap can be used to specify + * the exact map. mem=number can be used to + * trim the existing memory map. */ - if (!memcmp(from, "mem=", 4)) { - if (to != command_line) - to--; - if (!memcmp(from+4, "nopentium", 9)) { - from += 9+4; - clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); - disable_pse = 1; - } else { - /* If the user specifies memory size, we - * limit the BIOS-provided memory map to - * that size. exactmap can be used to specify - * the exact map. mem=number can be used to - * trim the existing memory map. - */ - unsigned long long mem_size; + unsigned long long mem_size; - mem_size = memparse(from+4, &from); - limit_regions(mem_size); - userdef=1; - } - } - - else if (!memcmp(from, "memmap=", 7)) { - if (to != command_line) - to--; - if (!memcmp(from+7, "exactmap", 8)) { -#ifdef CONFIG_CRASH_DUMP - /* If we are doing a crash dump, we - * still need to know the real mem - * size before original memory map is - * reset. - */ - find_max_pfn(); - saved_max_pfn = max_pfn; -#endif - from += 8+7; - e820.nr_map = 0; - userdef = 1; - } else { - /* If the user specifies memory size, we - * limit the BIOS-provided memory map to - * that size. exactmap can be used to specify - * the exact map. mem=number can be used to - * trim the existing memory map. - */ - unsigned long long start_at, mem_size; - - mem_size = memparse(from+7, &from); - if (*from == '@') { - start_at = memparse(from+1, &from); - add_memory_region(start_at, mem_size, E820_RAM); - } else if (*from == '#') { - start_at = memparse(from+1, &from); - add_memory_region(start_at, mem_size, E820_ACPI); - } else if (*from == '$') { - start_at = memparse(from+1, &from); - add_memory_region(start_at, mem_size, E820_RESERVED); - } else { - limit_regions(mem_size); - userdef=1; - } - } - } - - else if (!memcmp(from, "noexec=", 7)) - noexec_setup(from + 7); + mem_size = memparse(arg, &arg); + limit_regions(mem_size); + user_defined_memmap = 1; + } + return 0; +} +early_param("mem", parse_mem); +static int __init parse_memmap(char *arg) +{ + if (!arg) + return -EINVAL; -#ifdef CONFIG_X86_SMP - /* - * If the BIOS enumerates physical processors before logical, - * maxcpus=N at enumeration-time can be used to disable HT. + if (strcmp(arg, "exactmap") == 0) { +#ifdef CONFIG_CRASH_DUMP + /* If we are doing a crash dump, we + * still need to know the real mem + * size before original memory map is + * reset. */ - else if (!memcmp(from, "maxcpus=", 8)) { - extern unsigned int maxcpus; - - maxcpus = simple_strtoul(from + 8, NULL, 0); - } + find_max_pfn(); + saved_max_pfn = max_pfn; #endif - -#ifdef CONFIG_ACPI - /* "acpi=off" disables both ACPI table parsing and interpreter */ - else if (!memcmp(from, "acpi=off", 8)) { - disable_acpi(); - } - - /* acpi=force to over-ride black-list */ - else if (!memcmp(from, "acpi=force", 10)) { - acpi_force = 1; - acpi_ht = 1; - acpi_disabled = 0; - } - - /* acpi=strict disables out-of-spec workarounds */ - else if (!memcmp(from, "acpi=strict", 11)) { - acpi_strict = 1; - } - - /* Limit ACPI just to boot-time to enable HT */ - else if (!memcmp(from, "acpi=ht", 7)) { - if (!acpi_force) - disable_acpi(); - acpi_ht = 1; - } - - /* "pci=noacpi" disable ACPI IRQ routing and PCI scan */ - else if (!memcmp(from, "pci=noacpi", 10)) { - acpi_disable_pci(); - } - /* "acpi=noirq" disables ACPI interrupt routing */ - else if (!memcmp(from, "acpi=noirq", 10)) { - acpi_noirq_set(); + e820.nr_map = 0; + user_defined_memmap = 1; + } else { + /* If the user specifies memory size, we + * limit the BIOS-provided memory map to + * that size. exactmap can be used to specify + * the exact map. mem=number can be used to + * trim the existing memory map. + */ + unsigned long long start_at, mem_size; + + mem_size = memparse(arg, &arg); + if (*arg == '@') { + start_at = memparse(arg+1, &arg); + add_memory_region(start_at, mem_size, E820_RAM); + } else if (*arg == '#') { + start_at = memparse(arg+1, &arg); + add_memory_region(start_at, mem_size, E820_ACPI); + } else if (*arg == '$') { + start_at = memparse(arg+1, &arg); + add_memory_region(start_at, mem_size, E820_RESERVED); + } else { + limit_regions(mem_size); + user_defined_memmap = 1; } + } + return 0; +} +early_param("memmap", parse_memmap); - else if (!memcmp(from, "acpi_sci=edge", 13)) - acpi_sci_flags.trigger = 1; - - else if (!memcmp(from, "acpi_sci=level", 14)) - acpi_sci_flags.trigger = 3; - - else if (!memcmp(from, "acpi_sci=high", 13)) - acpi_sci_flags.polarity = 1; - - else if (!memcmp(from, "acpi_sci=low", 12)) - acpi_sci_flags.polarity = 3; - -#ifdef CONFIG_X86_IO_APIC - else if (!memcmp(from, "acpi_skip_timer_override", 24)) - acpi_skip_timer_override = 1; - - if (!memcmp(from, "disable_timer_pin_1", 19)) - disable_timer_pin_1 = 1; - if (!memcmp(from, "enable_timer_pin_1", 18)) - disable_timer_pin_1 = -1; +#ifdef CONFIG_PROC_VMCORE +/* elfcorehdr= specifies the location of elf core header + * stored by the crashed kernel. + */ +static int __init parse_elfcorehdr(char *arg) +{ + if (!arg) + return -EINVAL; - /* disable IO-APIC */ - else if (!memcmp(from, "noapic", 6)) - disable_ioapic_setup(); -#endif /* CONFIG_X86_IO_APIC */ -#endif /* CONFIG_ACPI */ + elfcorehdr_addr = memparse(arg, &arg); + return 0; +} +early_param("elfcorehdr", parse_elfcorehdr); +#endif /* CONFIG_PROC_VMCORE */ -#ifdef CONFIG_X86_LOCAL_APIC - /* enable local APIC */ - else if (!memcmp(from, "lapic", 5)) - lapic_enable(); +/* + * highmem=size forces highmem to be exactly 'size' bytes. + * This works even on boxes that have no highmem otherwise. + * This also works to reduce highmem size on bigger boxes. + */ +static int __init parse_highmem(char *arg) +{ + if (!arg) + return -EINVAL; - /* disable local APIC */ - else if (!memcmp(from, "nolapic", 6)) - lapic_disable(); -#endif /* CONFIG_X86_LOCAL_APIC */ + highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT; + return 0; +} +early_param("highmem", parse_highmem); -#ifdef CONFIG_KEXEC - /* crashkernel=size@addr specifies the location to reserve for - * a crash kernel. By reserving this memory we guarantee - * that linux never set's it up as a DMA target. - * Useful for holding code to do something appropriate - * after a kernel panic. - */ - else if (!memcmp(from, "crashkernel=", 12)) { - unsigned long size, base; - size = memparse(from+12, &from); - if (*from == '@') { - base = memparse(from+1, &from); - /* FIXME: Do I want a sanity check - * to validate the memory range? - */ - crashk_res.start = base; - crashk_res.end = base + size - 1; - } - } -#endif -#ifdef CONFIG_PROC_VMCORE - /* elfcorehdr= specifies the location of elf core header - * stored by the crashed kernel. - */ - else if (!memcmp(from, "elfcorehdr=", 11)) - elfcorehdr_addr = memparse(from+11, &from); -#endif +/* + * vmalloc=size forces the vmalloc area to be exactly 'size' + * bytes. This can be used to increase (or decrease) the + * vmalloc area - the default is 128m. + */ +static int __init parse_vmalloc(char *arg) +{ + if (!arg) + return -EINVAL; - /* - * highmem=size forces highmem to be exactly 'size' bytes. - * This works even on boxes that have no highmem otherwise. - * This also works to reduce highmem size on bigger boxes. - */ - else if (!memcmp(from, "highmem=", 8)) - highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT; - - /* - * vmalloc=size forces the vmalloc area to be exactly 'size' - * bytes. This can be used to increase (or decrease) the - * vmalloc area - the default is 128m. - */ - else if (!memcmp(from, "vmalloc=", 8)) - __VMALLOC_RESERVE = memparse(from+8, &from); - - next_char: - c = *(from++); - if (!c) - break; - if (COMMAND_LINE_SIZE <= ++len) - break; - *(to++) = c; - } - *to = '\0'; - *cmdline_p = command_line; - if (userdef) { - printk(KERN_INFO "user-defined physical RAM map:\n"); - print_memory_map("user"); - } + __VMALLOC_RESERVE = memparse(arg, &arg); + return 0; } +early_param("vmalloc", parse_vmalloc); /* * Callback for efi_memory_walk. @@ -1507,17 +1395,15 @@ void __init setup_arch(char **cmdline_p) data_resource.start = virt_to_phys(_etext); data_resource.end = virt_to_phys(_edata)-1; - parse_cmdline_early(cmdline_p); + parse_early_param(); -#ifdef CONFIG_EARLY_PRINTK - { - char *s = strstr(*cmdline_p, "earlyprintk="); - if (s) { - setup_early_printk(strchr(s, '=') + 1); - printk("early console enabled\n"); - } + if (user_defined_memmap) { + printk(KERN_INFO "user-defined physical RAM map:\n"); + print_memory_map("user"); } -#endif + + strlcpy(command_line, saved_command_line, COMMAND_LINE_SIZE); + *cmdline_p = command_line; max_low_pfn = setup_memory(); @@ -1546,7 +1432,7 @@ void __init setup_arch(char **cmdline_p) dmi_scan_machine(); #ifdef CONFIG_X86_GENERICARCH - generic_apic_probe(*cmdline_p); + generic_apic_probe(); #endif if (efi_enabled) efi_map_memmap(); diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 9367af76ce37..517eb3874550 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -1491,3 +1491,16 @@ void __init smp_intr_init(void) /* IPI for generic function call */ set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); } + +/* + * If the BIOS enumerates physical processors before logical, + * maxcpus=N at enumeration-time can be used to disable HT. + */ +static int __init parse_maxcpus(char *arg) +{ + extern unsigned int maxcpus; + + maxcpus = simple_strtoul(arg, NULL, 0); + return 0; +} +early_param("maxcpus", parse_maxcpus); diff --git a/arch/i386/mach-generic/probe.c b/arch/i386/mach-generic/probe.c index 793d1b473251..94b1fd9cbe3c 100644 --- a/arch/i386/mach-generic/probe.c +++ b/arch/i386/mach-generic/probe.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -29,7 +30,24 @@ struct genapic *apic_probe[] __initdata = { NULL, }; -static int cmdline_apic; +static int cmdline_apic __initdata; +static int __init parse_apic(char *arg) +{ + int i; + + if (!arg) + return -EINVAL; + + for (i = 0; apic_probe[i]; i++) { + if (!strcmp(apic_probe[i]->name, arg)) { + genapic = apic_probe[i]; + cmdline_apic = 1; + return 0; + } + } + return -ENOENT; +} +early_param("apic", parse_apic); void __init generic_bigsmp_probe(void) { @@ -48,40 +66,20 @@ void __init generic_bigsmp_probe(void) } } -void __init generic_apic_probe(char *command_line) +void __init generic_apic_probe(void) { - char *s; - int i; - int changed = 0; - - s = strstr(command_line, "apic="); - if (s && (s == command_line || isspace(s[-1]))) { - char *p = strchr(s, ' '), old; - if (!p) - p = strchr(s, '\0'); - old = *p; - *p = 0; - for (i = 0; !changed && apic_probe[i]; i++) { - if (!strcmp(apic_probe[i]->name, s+5)) { - changed = 1; + if (!cmdline_apic) { + int i; + for (i = 0; apic_probe[i]; i++) { + if (apic_probe[i]->probe()) { genapic = apic_probe[i]; + break; } } - if (!changed) - printk(KERN_ERR "Unknown genapic `%s' specified.\n", s); - *p = old; - cmdline_apic = changed; - } - for (i = 0; !changed && apic_probe[i]; i++) { - if (apic_probe[i]->probe()) { - changed = 1; - genapic = apic_probe[i]; - } + /* Not visible without early console */ + if (!apic_probe[i]) + panic("Didn't find an APIC driver"); } - /* Not visible without early console */ - if (!changed) - panic("Didn't find an APIC driver"); - printk(KERN_INFO "Using APIC driver %s\n", genapic->name); } diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index d0c2fdf6a4d7..951386606d09 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -435,16 +435,22 @@ u64 __supported_pte_mask __read_mostly = ~_PAGE_NX; * on Enable * off Disable */ -void __init noexec_setup(const char *str) +static int __init noexec_setup(char *str) { - if (!strncmp(str, "on",2) && cpu_has_nx) { - __supported_pte_mask |= _PAGE_NX; - disable_nx = 0; - } else if (!strncmp(str,"off",3)) { + if (!str || !strcmp(str, "on")) { + if (cpu_has_nx) { + __supported_pte_mask |= _PAGE_NX; + disable_nx = 0; + } + } else if (!strcmp(str,"off")) { disable_nx = 1; __supported_pte_mask &= ~_PAGE_NX; - } + } else + return -EINVAL; + + return 0; } +early_param("noexec", noexec_setup); int nx_enabled = 0; #ifdef CONFIG_X86_PAE diff --git a/include/asm-i386/acpi.h b/include/asm-i386/acpi.h index 20f523954218..6016632d032f 100644 --- a/include/asm-i386/acpi.h +++ b/include/asm-i386/acpi.h @@ -131,21 +131,7 @@ static inline void disable_acpi(void) extern int acpi_gsi_to_irq(u32 gsi, unsigned int *irq); #ifdef CONFIG_X86_IO_APIC -extern int skip_ioapic_setup; extern int acpi_skip_timer_override; - -static inline void disable_ioapic_setup(void) -{ - skip_ioapic_setup = 1; -} - -static inline int ioapic_setup_disabled(void) -{ - return skip_ioapic_setup; -} - -#else -static inline void disable_ioapic_setup(void) { } #endif static inline void acpi_noirq_set(void) { acpi_noirq = 1; } diff --git a/include/asm-i386/apic.h b/include/asm-i386/apic.h index 2c1e371cebb6..8c5331ceca7b 100644 --- a/include/asm-i386/apic.h +++ b/include/asm-i386/apic.h @@ -42,6 +42,8 @@ static inline void lapic_enable(void) } while (0) +extern void generic_apic_probe(void); + #ifdef CONFIG_X86_LOCAL_APIC /* @@ -117,8 +119,6 @@ extern void enable_APIC_timer(void); extern void enable_NMI_through_LVT0 (void * dummy); -extern int disable_timer_pin_1; - void smp_send_timer_broadcast_ipi(struct pt_regs *regs); void switch_APIC_timer_to_ipi(void *cpumask); void switch_ipi_to_APIC_timer(void *cpumask); diff --git a/include/asm-i386/io_apic.h b/include/asm-i386/io_apic.h index 5092e819b8a2..5d309275a1dc 100644 --- a/include/asm-i386/io_apic.h +++ b/include/asm-i386/io_apic.h @@ -188,6 +188,16 @@ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned /* 1 if "noapic" boot option passed */ extern int skip_ioapic_setup; +static inline void disable_ioapic_setup(void) +{ + skip_ioapic_setup = 1; +} + +static inline int ioapic_setup_disabled(void) +{ + return skip_ioapic_setup; +} + /* * If we use the IO-APIC for IRQ routing, disable automatic * assignment of PCI IRQ's. @@ -206,6 +216,7 @@ extern int (*ioapic_renumber_irq)(int ioapic, int irq); #else /* !CONFIG_X86_IO_APIC */ #define io_apic_assign_pci_irqs 0 +static inline void disable_ioapic_setup(void) { } #endif extern int assign_irq_vector(int irq); diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h index 09697fec3d2b..64140f2f1b95 100644 --- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h @@ -391,8 +391,6 @@ extern pte_t *lookup_address(unsigned long address); static inline int set_kernel_exec(unsigned long vaddr, int enable) { return 0;} #endif -extern void noexec_setup(const char *str); - #if defined(CONFIG_HIGHPTE) #define pte_offset_map(dir, address) \ ((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE0) + pte_index(address)) -- cgit v1.2.1 From 2c8c0e6b8d7700a990da8d24eff767f9ca223b96 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:32 +0200 Subject: [PATCH] Convert x86-64 to early param Instead of hackish manual parsing Requires earlier i386 patchkit, but also fixes i386 early_printk again. I removed some obsolete really early parameters which didn't do anything useful. Also made a few parameters that needed it early (mostly oops printing setup) Also removed one panic check that wasn't visible without early console anyways (the early console is now initialized after that panic) This cleans up a lot of code. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/apic.c | 34 ++++--- arch/x86_64/kernel/e820.c | 53 ++++++++-- arch/x86_64/kernel/early_printk.c | 20 ++-- arch/x86_64/kernel/head64.c | 15 --- arch/x86_64/kernel/io_apic.c | 15 ++- arch/x86_64/kernel/machine_kexec.c | 28 ++++++ arch/x86_64/kernel/pci-dma.c | 7 +- arch/x86_64/kernel/setup.c | 197 ++++--------------------------------- arch/x86_64/kernel/setup64.c | 8 +- arch/x86_64/kernel/smpboot.c | 6 +- arch/x86_64/kernel/traps.c | 24 +++-- arch/x86_64/mm/numa.c | 9 +- include/asm-x86_64/apic.h | 4 +- include/asm-x86_64/e820.h | 3 +- include/asm-x86_64/pgtable.h | 1 - include/asm-x86_64/proto.h | 10 -- 16 files changed, 159 insertions(+), 275 deletions(-) diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 496cd1bd2ae5..660602c51f51 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -36,6 +36,7 @@ #include #include #include +#include int apic_verbosity; int apic_runs_main_timer; @@ -546,18 +547,24 @@ static void apic_pm_activate(void) { } static int __init apic_set_verbosity(char *str) { + if (str == NULL) { + skip_ioapic_setup = 0; + ioapic_force = 1; + return 0; + } if (strcmp("debug", str) == 0) apic_verbosity = APIC_DEBUG; else if (strcmp("verbose", str) == 0) apic_verbosity = APIC_VERBOSE; - else + else { printk(KERN_WARNING "APIC Verbosity level %s not recognised" - " use apic=verbose or apic=debug", str); + " use apic=verbose or apic=debug\n", str); + return -EINVAL; + } - return 1; + return 0; } - -__setup("apic=", apic_set_verbosity); +early_param("apic", apic_set_verbosity); /* * Detect and enable local APICs on non-SMP boards. @@ -1078,14 +1085,17 @@ int __init APIC_init_uniprocessor (void) static __init int setup_disableapic(char *str) { disable_apic = 1; - return 1; -} + clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); + return 0; +} +early_param("disableapic", setup_disableapic); +/* same as disableapic, for compatibility */ static __init int setup_nolapic(char *str) { - disable_apic = 1; - return 1; + return setup_disableapic(str); } +early_param("nolapic", setup_nolapic); static __init int setup_noapictimer(char *str) { @@ -1118,11 +1128,5 @@ static __init int setup_apicpmtimer(char *s) } __setup("apicpmtimer", setup_apicpmtimer); -/* dummy parsing: see setup.c */ - -__setup("disableapic", setup_disableapic); -__setup("nolapic", setup_nolapic); /* same as disableapic, for compatibility */ - __setup("noapictimer", setup_noapictimer); -/* no "lapic" flag - we only use the lapic when the BIOS tells us so. */ diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index 1362aad4f3aa..a4739dc82937 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -596,31 +596,64 @@ void __init setup_memory_region(void) e820_print_map(who); } -void __init parse_memopt(char *p, char **from) -{ - end_user_pfn = memparse(p, from); +static int __init parse_memopt(char *p) +{ + if (!p) + return -EINVAL; + end_user_pfn = memparse(p, &p); end_user_pfn >>= PAGE_SHIFT; + return 0; } +early_param("mem", parse_memopt); + +static int userdef __initdata; -void __init parse_memmapopt(char *p, char **from) +static int __init parse_memmap_opt(char *p) { + char *oldp; unsigned long long start_at, mem_size; - mem_size = memparse(p, from); - p = *from; + if (!strcmp(p, "exactmap")) { +#ifdef CONFIG_CRASH_DUMP + /* If we are doing a crash dump, we + * still need to know the real mem + * size before original memory map is + * reset. + */ + saved_max_pfn = e820_end_of_ram(); +#endif + end_pfn_map = 0; + e820.nr_map = 0; + userdef = 1; + return 0; + } + + oldp = p; + mem_size = memparse(p, &p); + if (p == oldp) + return -EINVAL; if (*p == '@') { - start_at = memparse(p+1, from); + start_at = memparse(p+1, &p); add_memory_region(start_at, mem_size, E820_RAM); } else if (*p == '#') { - start_at = memparse(p+1, from); + start_at = memparse(p+1, &p); add_memory_region(start_at, mem_size, E820_ACPI); } else if (*p == '$') { - start_at = memparse(p+1, from); + start_at = memparse(p+1, &p); add_memory_region(start_at, mem_size, E820_RESERVED); } else { end_user_pfn = (mem_size >> PAGE_SHIFT); } - p = *from; + return *p == '\0' ? 0 : -EINVAL; +} +early_param("memmap", parse_memmap_opt); + +void finish_e820_parsing(void) +{ + if (userdef) { + printk(KERN_INFO "user-defined physical RAM map:\n"); + e820_print_map("user"); + } } unsigned long pci_mem_start = 0xaeedbabe; diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c index 140051e07fa6..e22ecd54870d 100644 --- a/arch/x86_64/kernel/early_printk.c +++ b/arch/x86_64/kernel/early_printk.c @@ -215,20 +215,16 @@ void early_printk(const char *fmt, ...) static int __initdata keep_early; -int __init setup_early_printk(char *opt) +static int __init setup_early_printk(char *buf) { - char *space; - char buf[256]; + if (!buf) + return 0; if (early_console_initialized) - return 1; - - strlcpy(buf,opt,sizeof(buf)); - space = strchr(buf, ' '); - if (space) - *space = 0; + return 0; + early_console_initialized = 1; - if (strstr(buf,"keep")) + if (!strcmp(buf,"keep")) keep_early = 1; if (!strncmp(buf, "serial", 6)) { @@ -248,11 +244,12 @@ int __init setup_early_printk(char *opt) early_console = &simnow_console; keep_early = 1; } - early_console_initialized = 1; register_console(early_console); return 0; } +early_param("earlyprintk", setup_early_printk); + void __init disable_early_printk(void) { if (!early_console_initialized || !early_console) @@ -266,4 +263,3 @@ void __init disable_early_printk(void) } } -__setup("earlyprintk=", setup_early_printk); diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c index f2461fde9f8f..6716cbfc34ac 100644 --- a/arch/x86_64/kernel/head64.c +++ b/arch/x86_64/kernel/head64.c @@ -58,7 +58,6 @@ static void __init copy_bootdata(char *real_mode_data) void __init x86_64_start_kernel(char * real_mode_data) { - char *s; int i; for (i = 0; i < 256; i++) @@ -85,19 +84,5 @@ void __init x86_64_start_kernel(char * real_mode_data) #ifdef CONFIG_SMP cpu_set(0, cpu_online_map); #endif - s = strstr(saved_command_line, "earlyprintk="); - if (s != NULL) - setup_early_printk(strchr(s, '=') + 1); -#ifdef CONFIG_NUMA - s = strstr(saved_command_line, "numa="); - if (s != NULL) - numa_setup(s+5); -#endif - if (strstr(saved_command_line, "disableapic")) - disable_apic = 1; - /* You need early console to see that */ - if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE) - panic("Kernel too big for kernel mapping\n"); - start_kernel(); } diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index a1412042b918..afac3dbb3729 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -48,7 +48,7 @@ int sis_apic_bug; /* not actually supported, dummy for compile */ static int no_timer_check; -int disable_timer_pin_1 __initdata; +static int disable_timer_pin_1 __initdata; int timer_over_8254 __initdata = 0; @@ -253,18 +253,17 @@ int ioapic_force; static int __init disable_ioapic_setup(char *str) { skip_ioapic_setup = 1; - return 1; + return 0; } +early_param("noapic", disable_ioapic_setup); -static int __init enable_ioapic_setup(char *str) +/* Actually the next is obsolete, but keep it for paranoid reasons -AK */ +static int __init disable_timer_pin_setup(char *arg) { - ioapic_force = 1; - skip_ioapic_setup = 0; + disable_timer_pin_1 = 1; return 1; } - -__setup("noapic", disable_ioapic_setup); -__setup("apic", enable_ioapic_setup); +__setup("disable_timer_pin_1", disable_timer_pin_setup); static int __init setup_disable_8254_timer(char *s) { diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c index 106076b370fc..2e94c072d84a 100644 --- a/arch/x86_64/kernel/machine_kexec.c +++ b/arch/x86_64/kernel/machine_kexec.c @@ -226,3 +226,31 @@ NORET_TYPE void machine_kexec(struct kimage *image) rnk = (relocate_new_kernel_t) control_code_buffer; (*rnk)(page_list, control_code_buffer, image->start, start_pgtable); } + +/* crashkernel=size@addr specifies the location to reserve for + * a crash kernel. By reserving this memory we guarantee + * that linux never set's it up as a DMA target. + * Useful for holding code to do something appropriate + * after a kernel panic. + */ +static int __init setup_crashkernel(char *arg) +{ + unsigned long size, base; + char *p; + if (!arg) + return -EINVAL; + size = memparse(arg, &p); + if (arg == p) + return -EINVAL; + if (*p == '@') { + base = memparse(p+1, &p); + /* FIXME: Do I want a sanity check to validate the + * memory range? Yes you do, but it's too early for + * e820 -AK */ + crashk_res.start = base; + crashk_res.end = base + size - 1; + } + return 0; +} +early_param("crashkernel", setup_crashkernel); + diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c index 9c44f4f2433d..4dcb671bd19f 100644 --- a/arch/x86_64/kernel/pci-dma.c +++ b/arch/x86_64/kernel/pci-dma.c @@ -236,6 +236,9 @@ __init int iommu_setup(char *p) { iommu_merge = 1; + if (!p) + return -EINVAL; + while (*p) { if (!strncmp(p,"off",3)) no_iommu = 1; @@ -278,9 +281,9 @@ __init int iommu_setup(char *p) if (*p == ',') ++p; } - return 1; + return 0; } -__setup("iommu=", iommu_setup); +early_param("iommu", iommu_setup); void __init pci_iommu_alloc(void) { diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 73f1cdd140fe..f55540a6084e 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -76,11 +76,6 @@ unsigned long mmu_cr4_features; int acpi_disabled; EXPORT_SYMBOL(acpi_disabled); -#ifdef CONFIG_ACPI -extern int __initdata acpi_ht; -extern acpi_interrupt_flags acpi_sci_flags; -int __initdata acpi_force = 0; -#endif int acpi_numa __initdata; @@ -276,183 +271,22 @@ static void __init probe_roms(void) } } -/* Check for full argument with no trailing characters */ -static int fullarg(char *p, char *arg) +#ifdef CONFIG_PROC_VMCORE +/* elfcorehdr= specifies the location of elf core header + * stored by the crashed kernel. This option will be passed + * by kexec loader to the capture kernel. + */ +static int __init setup_elfcorehdr(char *arg) { - int l = strlen(arg); - return !memcmp(p, arg, l) && (p[l] == 0 || isspace(p[l])); + char *end; + if (!arg) + return -EINVAL; + elfcorehdr_addr = memparse(arg, &end); + return end > arg ? 0 : -EINVAL; } - -static __init void parse_cmdline_early (char ** cmdline_p) -{ - char c = ' ', *to = command_line, *from = COMMAND_LINE; - int len = 0; - int userdef = 0; - - for (;;) { - if (c != ' ') - goto next_char; - -#ifdef CONFIG_SMP - /* - * If the BIOS enumerates physical processors before logical, - * maxcpus=N at enumeration-time can be used to disable HT. - */ - else if (!memcmp(from, "maxcpus=", 8)) { - extern unsigned int maxcpus; - - maxcpus = simple_strtoul(from + 8, NULL, 0); - } -#endif -#ifdef CONFIG_ACPI - /* "acpi=off" disables both ACPI table parsing and interpreter init */ - if (fullarg(from,"acpi=off")) - disable_acpi(); - - if (fullarg(from, "acpi=force")) { - /* add later when we do DMI horrors: */ - acpi_force = 1; - acpi_disabled = 0; - } - - /* acpi=ht just means: do ACPI MADT parsing - at bootup, but don't enable the full ACPI interpreter */ - if (fullarg(from, "acpi=ht")) { - if (!acpi_force) - disable_acpi(); - acpi_ht = 1; - } - else if (fullarg(from, "pci=noacpi")) - acpi_disable_pci(); - else if (fullarg(from, "acpi=noirq")) - acpi_noirq_set(); - - else if (fullarg(from, "acpi_sci=edge")) - acpi_sci_flags.trigger = 1; - else if (fullarg(from, "acpi_sci=level")) - acpi_sci_flags.trigger = 3; - else if (fullarg(from, "acpi_sci=high")) - acpi_sci_flags.polarity = 1; - else if (fullarg(from, "acpi_sci=low")) - acpi_sci_flags.polarity = 3; - - /* acpi=strict disables out-of-spec workarounds */ - else if (fullarg(from, "acpi=strict")) { - acpi_strict = 1; - } - else if (fullarg(from, "acpi_skip_timer_override")) - acpi_skip_timer_override = 1; +early_param("elfcorehdr", setup_elfcorehdr); #endif - if (fullarg(from, "disable_timer_pin_1")) - disable_timer_pin_1 = 1; - if (fullarg(from, "enable_timer_pin_1")) - disable_timer_pin_1 = -1; - - if (fullarg(from, "nolapic") || fullarg(from, "disableapic")) { - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); - disable_apic = 1; - } - - if (fullarg(from, "noapic")) - skip_ioapic_setup = 1; - - if (fullarg(from,"apic")) { - skip_ioapic_setup = 0; - ioapic_force = 1; - } - - if (!memcmp(from, "mem=", 4)) - parse_memopt(from+4, &from); - - if (!memcmp(from, "memmap=", 7)) { - /* exactmap option is for used defined memory */ - if (!memcmp(from+7, "exactmap", 8)) { -#ifdef CONFIG_CRASH_DUMP - /* If we are doing a crash dump, we - * still need to know the real mem - * size before original memory map is - * reset. - */ - saved_max_pfn = e820_end_of_ram(); -#endif - from += 8+7; - end_pfn_map = 0; - e820.nr_map = 0; - userdef = 1; - } - else { - parse_memmapopt(from+7, &from); - userdef = 1; - } - } - -#ifdef CONFIG_NUMA - if (!memcmp(from, "numa=", 5)) - numa_setup(from+5); -#endif - - if (!memcmp(from,"iommu=",6)) { - iommu_setup(from+6); - } - - if (fullarg(from,"oops=panic")) - panic_on_oops = 1; - - if (!memcmp(from, "noexec=", 7)) - nonx_setup(from + 7); - -#ifdef CONFIG_KEXEC - /* crashkernel=size@addr specifies the location to reserve for - * a crash kernel. By reserving this memory we guarantee - * that linux never set's it up as a DMA target. - * Useful for holding code to do something appropriate - * after a kernel panic. - */ - else if (!memcmp(from, "crashkernel=", 12)) { - unsigned long size, base; - size = memparse(from+12, &from); - if (*from == '@') { - base = memparse(from+1, &from); - /* FIXME: Do I want a sanity check - * to validate the memory range? - */ - crashk_res.start = base; - crashk_res.end = base + size - 1; - } - } -#endif - -#ifdef CONFIG_PROC_VMCORE - /* elfcorehdr= specifies the location of elf core header - * stored by the crashed kernel. This option will be passed - * by kexec loader to the capture kernel. - */ - else if(!memcmp(from, "elfcorehdr=", 11)) - elfcorehdr_addr = memparse(from+11, &from); -#endif - -#ifdef CONFIG_HOTPLUG_CPU - else if (!memcmp(from, "additional_cpus=", 16)) - setup_additional_cpus(from+16); -#endif - - next_char: - c = *(from++); - if (!c) - break; - if (COMMAND_LINE_SIZE <= ++len) - break; - *(to++) = c; - } - if (userdef) { - printk(KERN_INFO "user-defined physical RAM map:\n"); - e820_print_map("user"); - } - *to = '\0'; - *cmdline_p = command_line; -} - #ifndef CONFIG_NUMA static void __init contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn) @@ -547,7 +381,12 @@ void __init setup_arch(char **cmdline_p) early_identify_cpu(&boot_cpu_data); - parse_cmdline_early(cmdline_p); + strlcpy(command_line, saved_command_line, COMMAND_LINE_SIZE); + *cmdline_p = command_line; + + parse_early_param(); + + finish_e820_parsing(); /* * partially used pages are not usable - thus diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index 417de564456e..646caa0a20bb 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -46,8 +46,10 @@ Control non executable mappings for 64bit processes. on Enable(default) off Disable */ -int __init nonx_setup(char *str) +static int __init nonx_setup(char *str) { + if (!str) + return -EINVAL; if (!strncmp(str, "on", 2)) { __supported_pte_mask |= _PAGE_NX; do_not_nx = 0; @@ -55,9 +57,9 @@ int __init nonx_setup(char *str) do_not_nx = 1; __supported_pte_mask &= ~_PAGE_NX; } - return 1; + return 0; } -__setup("noexec=", nonx_setup); /* parsed early actually */ +early_param("noexec", nonx_setup); int force_personality32 = 0; diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index ae7d6d84e352..0b9f28b05e7d 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -1270,11 +1270,11 @@ void __cpu_die(unsigned int cpu) printk(KERN_ERR "CPU %u didn't die...\n", cpu); } -__init int setup_additional_cpus(char *s) +static __init int setup_additional_cpus(char *s) { - return get_option(&s, &additional_cpus); + return s && get_option(&s, &additional_cpus) ? 0 : -EINVAL; } -__setup("additional_cpus=", setup_additional_cpus); +early_param("additional_cpus", setup_additional_cpus); #else /* ... !CONFIG_HOTPLUG_CPU */ diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 56d7ff0c894c..9ec2b1d5893d 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -1119,24 +1119,30 @@ void __init trap_init(void) } -/* Actual parsing is done early in setup.c. */ -static int __init oops_dummy(char *s) +static int __init oops_setup(char *s) { - panic_on_oops = 1; - return 1; + if (!s) + return -EINVAL; + if (!strcmp(s, "panic")) + panic_on_oops = 1; + return 0; } -__setup("oops=", oops_dummy); +early_param("oops", oops_setup); static int __init kstack_setup(char *s) { + if (!s) + return -EINVAL; kstack_depth_to_print = simple_strtoul(s,NULL,0); - return 1; + return 0; } -__setup("kstack=", kstack_setup); +early_param("kstack", kstack_setup); #ifdef CONFIG_STACK_UNWIND static int __init call_trace_setup(char *s) { + if (!s) + return -EINVAL; if (strcmp(s, "old") == 0) call_trace = -1; else if (strcmp(s, "both") == 0) @@ -1145,7 +1151,7 @@ static int __init call_trace_setup(char *s) call_trace = 1; else if (strcmp(s, "new") == 0) call_trace = 2; - return 1; + return 0; } -__setup("call_trace=", call_trace_setup); +early_param("call_trace", call_trace_setup); #endif diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index b2fac14baac0..d64d6d93d04b 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -348,9 +348,10 @@ void __init paging_init(void) } } -/* [numa=off] */ -__init int numa_setup(char *opt) +static __init int numa_setup(char *opt) { + if (!opt) + return -EINVAL; if (!strncmp(opt,"off",3)) numa_off = 1; #ifdef CONFIG_NUMA_EMU @@ -366,9 +367,11 @@ __init int numa_setup(char *opt) if (!strncmp(opt,"hotadd=", 7)) hotadd_percent = simple_strtoul(opt+7, NULL, 10); #endif - return 1; + return 0; } +early_param("numa", numa_setup); + /* * Setup early cpu_to_node. * diff --git a/include/asm-x86_64/apic.h b/include/asm-x86_64/apic.h index 8ed0f4d67b8d..29ee735278f2 100644 --- a/include/asm-x86_64/apic.h +++ b/include/asm-x86_64/apic.h @@ -17,6 +17,7 @@ extern int apic_verbosity; extern int apic_runs_main_timer; +extern int ioapic_force; /* * Define the default level of output to be very little @@ -93,9 +94,6 @@ extern void setup_APIC_extened_lvt(unsigned char lvt_off, unsigned char vector, #define K8_APIC_EXT_INT_MSG_EXT 0x7 #define K8_APIC_EXT_LVT_ENTRY_THRESHOLD 0 -extern int disable_timer_pin_1; - - void smp_send_timer_broadcast_ipi(void); void switch_APIC_timer_to_ipi(void *cpumask); void switch_ipi_to_APIC_timer(void *cpumask); diff --git a/include/asm-x86_64/e820.h b/include/asm-x86_64/e820.h index 670a3388e70a..dba012151856 100644 --- a/include/asm-x86_64/e820.h +++ b/include/asm-x86_64/e820.h @@ -55,8 +55,7 @@ extern void e820_setup_gap(void); extern unsigned long e820_hole_size(unsigned long start_pfn, unsigned long end_pfn); -extern void __init parse_memopt(char *p, char **end); -extern void __init parse_memmapopt(char *p, char **end); +extern void finish_e820_parsing(void); extern struct e820map e820; diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h index 0c1e2422400a..d79e7441b513 100644 --- a/include/asm-x86_64/pgtable.h +++ b/include/asm-x86_64/pgtable.h @@ -21,7 +21,6 @@ extern unsigned long __supported_pte_mask; #define swapper_pg_dir init_level4_pgt -extern int nonx_setup(char *str); extern void paging_init(void); extern void clear_kernel_mapping(unsigned long addr, unsigned long size); diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h index 58fec91318e4..151826f8ce69 100644 --- a/include/asm-x86_64/proto.h +++ b/include/asm-x86_64/proto.h @@ -53,9 +53,6 @@ extern int nohpet; extern unsigned long vxtime_hz; extern void time_init_gtod(void); -extern int numa_setup(char *opt); - -extern int setup_early_printk(char *); extern void early_printk(const char *fmt, ...) __attribute__((format(printf,1,2))); extern void early_identify_cpu(struct cpuinfo_x86 *c); @@ -104,13 +101,7 @@ extern void select_idle_routine(const struct cpuinfo_x86 *c); extern unsigned long table_start, table_end; extern int exception_trace; -extern int using_apic_timer; -extern int disable_apic; extern unsigned cpu_khz; -extern int ioapic_force; -extern int skip_ioapic_setup; -extern int acpi_ht; -extern int acpi_disabled; extern void no_iommu_init(void); extern int force_iommu, no_iommu; @@ -132,7 +123,6 @@ extern int fix_aperture; extern int reboot_force; extern int notsc_setup(char *); -extern int setup_additional_cpus(char *); extern void smp_local_timer_interrupt(struct pt_regs * regs); -- cgit v1.2.1 From 43c85c9c5dff76efc1e411d3302840027ea92004 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:32 +0200 Subject: [PATCH] Remove need for early lockdep init I think it was only needed for the printks and we can do them later. I put in a single early_printk so that we know the kernel is alive (early_printk doesn't need any locks) This makes some things easier for initialization of unwind for lockdep, which is needed by later patches. cc: mingo@elte.hu Signed-off-by: Andi Kleen --- arch/x86_64/kernel/head64.c | 8 +------- arch/x86_64/kernel/setup.c | 2 ++ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c index 6716cbfc34ac..9561eb3c5b5c 100644 --- a/arch/x86_64/kernel/head64.c +++ b/arch/x86_64/kernel/head64.c @@ -45,15 +45,12 @@ static void __init copy_bootdata(char *real_mode_data) new_data = *(int *) (x86_boot_params + NEW_CL_POINTER); if (!new_data) { if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) { - printk("so old bootloader that it does not support commandline?!\n"); return; } new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET; - printk("old bootloader convention, maybe loadlin?\n"); } command_line = (char *) ((u64)(new_data)); memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE); - printk("Bootdata ok (command line is %s)\n", saved_command_line); } void __init x86_64_start_kernel(char * real_mode_data) @@ -65,10 +62,7 @@ void __init x86_64_start_kernel(char * real_mode_data) asm volatile("lidt %0" :: "m" (idt_descr)); clear_bss(); - /* - * This must be called really, really early: - */ - lockdep_init(); + early_printk("Kernel alive\n"); /* * switch to init_level4_pgt from boot_level4_pgt diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index f55540a6084e..9c4d93c53c06 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -353,6 +353,8 @@ static void discover_ebda(void) void __init setup_arch(char **cmdline_p) { + printk(KERN_INFO "Command line: %s\n", saved_command_line); + ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV); screen_info = SCREEN_INFO; edid_info = EDID_INFO; -- cgit v1.2.1 From df3bb57d2c0160ccd1ee51322f50aa295c3b0858 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] i386/x86-64: Move acpi_disabled variables into acpi/boot.c Removes code duplication between i386/x86-64. Not needed anymore in setup.c since early_param cleanup Cc: len.brown@intel.com Signed-off-by: Andi Kleen --- arch/i386/kernel/acpi/boot.c | 7 +++++++ arch/i386/kernel/setup.c | 7 ------- arch/x86_64/kernel/setup.c | 3 --- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c index 87e2ab50b694..f3761b98c8dd 100644 --- a/arch/i386/kernel/acpi/boot.c +++ b/arch/i386/kernel/acpi/boot.c @@ -38,6 +38,13 @@ int __initdata acpi_force = 0; +#ifdef CONFIG_ACPI +int acpi_disabled = 0; +#else +int acpi_disabled = 1; +#endif +EXPORT_SYMBOL(acpi_disabled); + #ifdef CONFIG_X86_64 extern void __init clustered_apic_check(void); diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index c6e31ed386f5..ea17567dbe72 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -89,13 +89,6 @@ EXPORT_SYMBOL(boot_cpu_data); unsigned long mmu_cr4_features; -#ifdef CONFIG_ACPI - int acpi_disabled = 0; -#else - int acpi_disabled = 1; -#endif -EXPORT_SYMBOL(acpi_disabled); - /* for MCA, but anyone else can use it if they want */ unsigned int machine_id; #ifdef CONFIG_MCA diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 9c4d93c53c06..0f86976811d6 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -74,9 +74,6 @@ EXPORT_SYMBOL(boot_cpu_data); unsigned long mmu_cr4_features; -int acpi_disabled; -EXPORT_SYMBOL(acpi_disabled); - int acpi_numa __initdata; /* Boot loader ID as an integer, for the benefit of proc_dointvec */ -- cgit v1.2.1 From c31fbb1ad890b11f037c16496e53f28877c12722 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] Clean up acpi_numa variable Move it into srat.c No need to clutter up setup.c for it And remove use in setup.c completely - it only guarded a printk which can be done unconditionally. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/setup.c | 5 +---- arch/x86_64/mm/srat.c | 2 ++ 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 0f86976811d6..b85528597aa8 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -74,8 +74,6 @@ EXPORT_SYMBOL(boot_cpu_data); unsigned long mmu_cr4_features; -int acpi_numa __initdata; - /* Boot loader ID as an integer, for the benefit of proc_dointvec */ int bootloader_type; @@ -812,8 +810,7 @@ static void srat_detect_node(void) node = first_node(node_online_map); numa_set_node(cpu, node); - if (acpi_numa > 0) - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); #endif } diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c index 502fce65e96a..ca10701e7a90 100644 --- a/arch/x86_64/mm/srat.c +++ b/arch/x86_64/mm/srat.c @@ -21,6 +21,8 @@ #include #include +int acpi_numa __initdata; + #if (defined(CONFIG_ACPI_HOTPLUG_MEMORY) || \ defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)) \ && !defined(CONFIG_MEMORY_HOTPLUG) -- cgit v1.2.1 From 3bd4d18cbab622c504f131f3c0029c3aa29c05be Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] Move e820 map into e820.c Minor cleanup. Keep setup.c free from unrelated clutter. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/e820.c | 2 ++ arch/x86_64/kernel/setup.c | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index a4739dc82937..164d0b83dc92 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -24,6 +24,8 @@ #include #include +struct e820map e820 __initdata; + /* * PFN of last memory page. */ diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index b85528597aa8..4be29832e3dc 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -97,7 +97,6 @@ struct sys_desc_table_struct { struct edid_info edid_info; EXPORT_SYMBOL_GPL(edid_info); -struct e820map e820; extern int root_mountflags; -- cgit v1.2.1 From 131cfd7bd54767ec8959e013f83839442a54d546 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] Add sparse annotation to vsyscall.c Fixes linux/arch/x86_64/kernel/vsyscall.c:276:7: warning: constant 0x0f40000000000 is so big it is long linux/arch/x86_64/kernel/vsyscall.c:80:14: warning: incorrect type in argument 1 (different address spaces) linux/arch/x86_64/kernel/vsyscall.c:80:14: expected void const volatile [noderef] *addr linux/arch/x86_64/kernel/vsyscall.c:80:14: got void * linux/arch/x86_64/kernel/vsyscall.c:200:7: warning: incorrect type in assignment (different address spaces) linux/arch/x86_64/kernel/vsyscall.c:200:7: expected unsigned short [usertype] *map1 linux/arch/x86_64/kernel/vsyscall.c:200:7: got void [noderef] * linux/arch/x86_64/kernel/vsyscall.c:203:7: warning: incorrect type in assignment (different address spaces) linux/arch/x86_64/kernel/vsyscall.c:203:7: expected unsigned short [usertype] *map2 linux/arch/x86_64/kernel/vsyscall.c:203:7: got void [noderef] * linux/arch/x86_64/kernel/vsyscall.c:215:10: warning: incorrect type in argument 1 (different address spaces) linux/arch/x86_64/kernel/vsyscall.c:215:10: expected void volatile [noderef] *addr linux/arch/x86_64/kernel/vsyscall.c:215:10: got unsigned short [usertype] *map2 linux/arch/x86_64/kernel/vsyscall.c:217:10: warning: incorrect type in argument 1 (different address spaces) linux/arch/x86_64/kernel/vsyscall.c:217:10: expected void volatile [noderef] *addr linux/arch/x86_64/kernel/vsyscall.c:217:10: got unsigned short [usertype] *map1 Signed-off-by: Andi Kleen --- arch/x86_64/kernel/vsyscall.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c index 902783bc4d53..ac48c3857ddb 100644 --- a/arch/x86_64/kernel/vsyscall.c +++ b/arch/x86_64/kernel/vsyscall.c @@ -77,7 +77,8 @@ static __always_inline void do_vgettimeofday(struct timeval * tv) __vxtime.tsc_quot) >> 32; /* See comment in x86_64 do_gettimeofday. */ } else { - usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) - + usec += ((readl((void __iomem *) + fix_to_virt(VSYSCALL_HPET) + 0xf0) - __vxtime.last) * __vxtime.quot) >> 32; } } while (read_seqretry(&__xtime_lock, sequence)); @@ -191,7 +192,8 @@ static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, void __user *buffer, size_t *lenp, loff_t *ppos) { extern u16 vsysc1, vsysc2; - u16 *map1, *map2; + u16 __iomem *map1; + u16 __iomem *map2; int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); if (!write) return ret; @@ -206,11 +208,11 @@ static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, goto out; } if (!sysctl_vsyscall) { - *map1 = SYSCALL; - *map2 = SYSCALL; + writew(SYSCALL, map1); + writew(SYSCALL, map2); } else { - *map1 = NOP2; - *map2 = NOP2; + writew(NOP2, map1); + writew(NOP2, map2); } iounmap(map2); out: -- cgit v1.2.1 From dd2994f619752fb731f21c89ad16536dd6673948 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] Add sparse annotations to quiet sparse in arch/x86_64/mm/fault.c Fixes linux/arch/x86_64/mm/fault.c:125:7: warning: incorrect type in argument 1 (different address spaces) linux/arch/x86_64/mm/fault.c:125:7: expected void [noderef] * linux/arch/x86_64/mm/fault.c:125:7: got unsigned char *[assigned] instr linux/arch/x86_64/mm/fault.c:163:8: warning: incorrect type in argument 1 (different address spaces) linux/arch/x86_64/mm/fault.c:163:8: expected void [noderef] * linux/arch/x86_64/mm/fault.c:163:8: got unsigned char *[assigned] instr linux/arch/x86_64/mm/fault.c:179:9: warning: incorrect type in argument 1 (different address spaces) linux/arch/x86_64/mm/fault.c:179:9: expected void [noderef] * linux/arch/x86_64/mm/fault.c:179:9: got unsigned long * Signed-off-by: Andi Kleen --- arch/x86_64/mm/fault.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index ac8ea66ccb94..449a437f5d70 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c @@ -102,7 +102,7 @@ void bust_spinlocks(int yes) static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, unsigned long error_code) { - unsigned char *instr; + unsigned char __user *instr; int scan_more = 1; int prefetch = 0; unsigned char *max_instr; @@ -111,7 +111,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, if (error_code & PF_INSTR) return 0; - instr = (unsigned char *)convert_rip_to_linear(current, regs); + instr = (unsigned char __user *)convert_rip_to_linear(current, regs); max_instr = instr + 15; if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) @@ -122,7 +122,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, unsigned char instr_hi; unsigned char instr_lo; - if (__get_user(opcode, instr)) + if (__get_user(opcode, (char __user *)instr)) break; instr_hi = opcode & 0xf0; @@ -160,7 +160,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, case 0x00: /* Prefetch instruction is 0x0F0D or 0x0F18 */ scan_more = 0; - if (__get_user(opcode, instr)) + if (__get_user(opcode, (char __user *)instr)) break; prefetch = (instr_lo == 0xF) && (opcode == 0x0D || opcode == 0x18); @@ -176,7 +176,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, static int bad_address(void *p) { unsigned long dummy; - return __get_user(dummy, (unsigned long *)p); + return __get_user(dummy, (unsigned long __user *)p); } void dump_pagetable(unsigned long address) -- cgit v1.2.1 From ddb15ec130d38cb8076a9926040c7435b126db65 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] Fix most sparse warnings in sys_ia32.c Mostly by adding casts. I didn't touch the "invalid access past ..." which are caused by the sigset conversion. Signed-off-by: Andi Kleen --- arch/x86_64/ia32/sys_ia32.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c index 9c130993380d..33cd0a4fe388 100644 --- a/arch/x86_64/ia32/sys_ia32.c +++ b/arch/x86_64/ia32/sys_ia32.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include #include @@ -389,7 +390,9 @@ sys32_rt_sigprocmask(int how, compat_sigset_t __user *set, } } set_fs (KERNEL_DS); - ret = sys_rt_sigprocmask(how, set ? &s : NULL, oset ? &s : NULL, + ret = sys_rt_sigprocmask(how, + set ? (sigset_t __user *)&s : NULL, + oset ? (sigset_t __user *)&s : NULL, sigsetsize); set_fs (old_fs); if (ret) return ret; @@ -541,7 +544,7 @@ sys32_sysinfo(struct sysinfo32 __user *info) int bitcount = 0; set_fs (KERNEL_DS); - ret = sys_sysinfo(&s); + ret = sys_sysinfo((struct sysinfo __user *)&s); set_fs (old_fs); /* Check to see if any memory value is too large for 32-bit and scale @@ -589,7 +592,7 @@ sys32_sched_rr_get_interval(compat_pid_t pid, struct compat_timespec __user *int mm_segment_t old_fs = get_fs (); set_fs (KERNEL_DS); - ret = sys_sched_rr_get_interval(pid, &t); + ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t); set_fs (old_fs); if (put_compat_timespec(&t, interval)) return -EFAULT; @@ -605,7 +608,7 @@ sys32_rt_sigpending(compat_sigset_t __user *set, compat_size_t sigsetsize) mm_segment_t old_fs = get_fs(); set_fs (KERNEL_DS); - ret = sys_rt_sigpending(&s, sigsetsize); + ret = sys_rt_sigpending((sigset_t __user *)&s, sigsetsize); set_fs (old_fs); if (!ret) { switch (_NSIG_WORDS) { @@ -630,7 +633,7 @@ sys32_rt_sigqueueinfo(int pid, int sig, compat_siginfo_t __user *uinfo) if (copy_siginfo_from_user32(&info, uinfo)) return -EFAULT; set_fs (KERNEL_DS); - ret = sys_rt_sigqueueinfo(pid, sig, &info); + ret = sys_rt_sigqueueinfo(pid, sig, (siginfo_t __user *)&info); set_fs (old_fs); return ret; } @@ -666,9 +669,6 @@ sys32_sysctl(struct sysctl_ia32 __user *args32) size_t oldlen; int __user *namep; long ret; - extern int do_sysctl(int *name, int nlen, void *oldval, size_t *oldlenp, - void *newval, size_t newlen); - if (copy_from_user(&a32, args32, sizeof (a32))) return -EFAULT; @@ -692,7 +692,8 @@ sys32_sysctl(struct sysctl_ia32 __user *args32) set_fs(KERNEL_DS); lock_kernel(); - ret = do_sysctl(namep, a32.nlen, oldvalp, &oldlen, newvalp, (size_t) a32.newlen); + ret = do_sysctl(namep, a32.nlen, oldvalp, (size_t __user *)&oldlen, + newvalp, (size_t) a32.newlen); unlock_kernel(); set_fs(old_fs); @@ -743,7 +744,8 @@ sys32_sendfile(int out_fd, int in_fd, compat_off_t __user *offset, s32 count) return -EFAULT; set_fs(KERNEL_DS); - ret = sys_sendfile(out_fd, in_fd, offset ? &of : NULL, count); + ret = sys_sendfile(out_fd, in_fd, offset ? (off_t __user *)&of : NULL, + count); set_fs(old_fs); if (offset && put_user(of, offset)) @@ -831,7 +833,7 @@ long sys32_ustat(unsigned dev, struct ustat32 __user *u32p) seg = get_fs(); set_fs(KERNEL_DS); - ret = sys_ustat(dev,&u); + ret = sys_ustat(dev, (struct ustat __user *)&u); set_fs(seg); if (ret >= 0) { if (!access_ok(VERIFY_WRITE,u32p,sizeof(struct ustat32)) || -- cgit v1.2.1 From 52d522f53f137c7903db22f9196a48ad8658fb2b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] Fix sparse warnings in compat aout code Signed-off-by: Andi Kleen --- arch/x86_64/ia32/ia32_aout.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86_64/ia32/ia32_aout.c b/arch/x86_64/ia32/ia32_aout.c index 3bf58af98936..396d3c100011 100644 --- a/arch/x86_64/ia32/ia32_aout.c +++ b/arch/x86_64/ia32/ia32_aout.c @@ -333,7 +333,8 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) return error; } - error = bprm->file->f_op->read(bprm->file, (char *)text_addr, + error = bprm->file->f_op->read(bprm->file, + (char __user *)text_addr, ex.a_text+ex.a_data, &pos); if ((signed long)error < 0) { send_sig(SIGKILL, current, 0); @@ -366,7 +367,8 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) down_write(¤t->mm->mmap_sem); do_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); up_write(¤t->mm->mmap_sem); - bprm->file->f_op->read(bprm->file,(char *)N_TXTADDR(ex), + bprm->file->f_op->read(bprm->file, + (char __user *)N_TXTADDR(ex), ex.a_text+ex.a_data, &pos); flush_icache_range((unsigned long) N_TXTADDR(ex), (unsigned long) N_TXTADDR(ex) + @@ -477,7 +479,7 @@ static int load_aout_library(struct file *file) do_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); up_write(¤t->mm->mmap_sem); - file->f_op->read(file, (char *)start_addr, + file->f_op->read(file, (char __user *)start_addr, ex.a_text + ex.a_data, &pos); flush_icache_range((unsigned long) start_addr, (unsigned long) start_addr + ex.a_text + ex.a_data); -- cgit v1.2.1 From 2b94ab2fd55768030e177a6ec224dedd031ad06b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fernando=20Luis=20V=E1zquez=20Cao?= Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] Replace local_save_flags+local_irq_disable with The combination of "local_save_flags" and "local_irq_disable" seems to be equivalent to "local_irq_save" (see code snips below). Consequently, replace occurrences of local_save_flags+local_irq_disable with local_irq_save. * local_irq_save #define raw_local_irq_save(flags) \ do { (flags) = __raw_local_irq_save(); } while (0) static inline unsigned long __raw_local_irq_save(void) { unsigned long flags = __raw_local_save_flags(); raw_local_irq_disable(); return flags; } * local_save_flags #define raw_local_save_flags(flags) \ do { (flags) = __raw_local_save_flags(); } while (0) Signed-off-by: Fernando Vazquez Signed-off-by: Andi Kleen --- arch/x86_64/kernel/apic.c | 3 +-- arch/x86_64/kernel/genapic_flat.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 660602c51f51..b95d1e378b96 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -468,8 +468,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state) apic_pm_state.apic_tmict = apic_read(APIC_TMICT); apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); - local_save_flags(flags); - local_irq_disable(); + local_irq_save(flags); disable_local_APIC(); local_irq_restore(flags); return 0; diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c index 2ceb8b872b57..50ad153eaac4 100644 --- a/arch/x86_64/kernel/genapic_flat.c +++ b/arch/x86_64/kernel/genapic_flat.c @@ -49,8 +49,7 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector) unsigned long cfg; unsigned long flags; - local_save_flags(flags); - local_irq_disable(); + local_irq_save(flags); /* * Wait for idle. -- cgit v1.2.1 From 91cd444e56ebe0c2acd9576a045d77490b26f607 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] x86: Remove unneeded externs in acpi/boot.c And move one into proto.h Cc: len.brown@intel.com Signed-off-by: Andi Kleen --- arch/i386/kernel/acpi/boot.c | 3 --- include/asm-x86_64/proto.h | 2 ++ 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c index f3761b98c8dd..c809a3f4c0bf 100644 --- a/arch/i386/kernel/acpi/boot.c +++ b/arch/i386/kernel/acpi/boot.c @@ -47,9 +47,6 @@ EXPORT_SYMBOL(acpi_disabled); #ifdef CONFIG_X86_64 -extern void __init clustered_apic_check(void); - -extern int gsi_irq_sharing(int gsi); #include static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return 0; } diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h index 151826f8ce69..b73d0c76613c 100644 --- a/include/asm-x86_64/proto.h +++ b/include/asm-x86_64/proto.h @@ -124,6 +124,8 @@ extern int fix_aperture; extern int reboot_force; extern int notsc_setup(char *); +extern int gsi_irq_sharing(int gsi); + extern void smp_local_timer_interrupt(struct pt_regs * regs); long do_arch_prctl(struct task_struct *task, int code, unsigned long addr); -- cgit v1.2.1 From 2ade2920dcefdf5595c6380ebed131c964190855 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert <76306.1226@compuserve.com> Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] i386/x86-64: rename is_at_popf(), add iret to tests and fix is_at_popf() needs to test for the iret instruction as well as popf. So add that test and rename it to is_setting_trap_flag(). Also change max insn length from 16 to 15 to match reality. LAHF / SAHF can't affect TF, so the comment in x86_64 is removed. Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Signed-off-by: Andi Kleen --- arch/i386/kernel/ptrace.c | 10 +++++----- arch/x86_64/kernel/ptrace.c | 12 +++++------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c index d3db03f4085d..775f50e9395b 100644 --- a/arch/i386/kernel/ptrace.c +++ b/arch/i386/kernel/ptrace.c @@ -185,17 +185,17 @@ static unsigned long convert_eip_to_linear(struct task_struct *child, struct pt_ return addr; } -static inline int is_at_popf(struct task_struct *child, struct pt_regs *regs) +static inline int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) { int i, copied; - unsigned char opcode[16]; + unsigned char opcode[15]; unsigned long addr = convert_eip_to_linear(child, regs); copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0); for (i = 0; i < copied; i++) { switch (opcode[i]) { - /* popf */ - case 0x9d: + /* popf and iret */ + case 0x9d: case 0xcf: return 1; /* opcode and address size prefixes */ case 0x66: case 0x67: @@ -247,7 +247,7 @@ static void set_singlestep(struct task_struct *child) * don't mark it as being "us" that set it, so that we * won't clear it by hand later. */ - if (is_at_popf(child, regs)) + if (is_setting_trap_flag(child, regs)) return; child->ptrace |= PT_DTRACE; diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c index 412c6a8bf906..addc14af0c56 100644 --- a/arch/x86_64/kernel/ptrace.c +++ b/arch/x86_64/kernel/ptrace.c @@ -116,17 +116,17 @@ unsigned long convert_rip_to_linear(struct task_struct *child, struct pt_regs *r return addr; } -static int is_at_popf(struct task_struct *child, struct pt_regs *regs) +static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) { int i, copied; - unsigned char opcode[16]; + unsigned char opcode[15]; unsigned long addr = convert_rip_to_linear(child, regs); copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0); for (i = 0; i < copied; i++) { switch (opcode[i]) { - /* popf */ - case 0x9d: + /* popf and iret */ + case 0x9d: case 0xcf: return 1; /* CHECKME: 64 65 */ @@ -189,10 +189,8 @@ static void set_singlestep(struct task_struct *child) * ..but if TF is changed by the instruction we will trace, * don't mark it as being "us" that set it, so that we * won't clear it by hand later. - * - * AK: this is not enough, LAHF and IRET can change TF in user space too. */ - if (is_at_popf(child, regs)) + if (is_setting_trap_flag(child, regs)) return; child->ptrace |= PT_DTRACE; -- cgit v1.2.1 From de684652f34f57cb60d4d78d09139a0e0c5e7b1b Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] print whether CONFIG_IOMMU_DEBUG is enabled Signed-off-by: Muli Ben-Yehuda Signed-off-by: Jon Mason Signed-off-by: Andi Kleen --- arch/x86_64/kernel/pci-calgary.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index caf84e4e4ca9..ebe4e930b64d 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -127,15 +127,19 @@ static void tce_cache_blast(struct iommu_table *tbl); /* enable this to stress test the chip's TCE cache */ #ifdef CONFIG_IOMMU_DEBUG +int debugging __read_mostly = 1; + static inline void tce_cache_blast_stress(struct iommu_table *tbl) { tce_cache_blast(tbl); } -#else +#else /* debugging is disabled */ +int debugging __read_mostly = 0; + static inline void tce_cache_blast_stress(struct iommu_table *tbl) { } -#endif /* BLAST_TCE_CACHE_ON_UNMAP */ +#endif /* CONFIG_IOMMU_DEBUG */ static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen) { @@ -944,8 +948,10 @@ void __init detect_calgary(void) if (calgary_found) { iommu_detected = 1; calgary_detected = 1; - printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected. " - "TCE table spec is %d.\n", specified_table_size); + printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n"); + printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " + "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, + debugging ? "enabled" : "disabled"); } return; -- cgit v1.2.1 From 796e4390e0378e1e57c033349610cfc741696a3d Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] only verify the allocation bitmap if CONFIG_IOMMU_DEBUG is on Introduce new function verify_bit_range(). Define two versions, one for CONFIG_IOMMU_DEBUG enabled and one for disabled. Previously we were checking that the bitmap was consistent every time we allocated or freed an entry in the TCE table, which is good for debugging but incurs an unnecessary penalty on non debug builds. Signed-off-by: Muli Ben-Yehuda Signed-off-by: Jon Mason Signed-off-by: Andi Kleen --- arch/x86_64/kernel/pci-calgary.c | 44 ++++++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index ebe4e930b64d..7c43cb0f71a3 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -133,12 +133,35 @@ static inline void tce_cache_blast_stress(struct iommu_table *tbl) { tce_cache_blast(tbl); } + +static inline unsigned long verify_bit_range(unsigned long* bitmap, + int expected, unsigned long start, unsigned long end) +{ + unsigned long idx = start; + + BUG_ON(start >= end); + + while (idx < end) { + if (!!test_bit(idx, bitmap) != expected) + return idx; + ++idx; + } + + /* all bits have the expected value */ + return ~0UL; +} #else /* debugging is disabled */ int debugging __read_mostly = 0; static inline void tce_cache_blast_stress(struct iommu_table *tbl) { } + +static inline unsigned long verify_bit_range(unsigned long* bitmap, + int expected, unsigned long start, unsigned long end) +{ + return ~0UL; +} #endif /* CONFIG_IOMMU_DEBUG */ static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen) @@ -162,6 +185,7 @@ static void iommu_range_reserve(struct iommu_table *tbl, { unsigned long index; unsigned long end; + unsigned long badbit; index = start_addr >> PAGE_SHIFT; @@ -173,14 +197,15 @@ static void iommu_range_reserve(struct iommu_table *tbl, if (end > tbl->it_size) /* don't go off the table */ end = tbl->it_size; - while (index < end) { - if (test_bit(index, tbl->it_map)) + badbit = verify_bit_range(tbl->it_map, 0, index, end); + if (badbit != ~0UL) { + if (printk_ratelimit()) printk(KERN_ERR "Calgary: entry already allocated at " "0x%lx tbl %p dma 0x%lx npages %u\n", - index, tbl, start_addr, npages); - ++index; + badbit, tbl, start_addr, npages); } - set_bit_string(tbl->it_map, start_addr >> PAGE_SHIFT, npages); + + set_bit_string(tbl->it_map, index, npages); } static unsigned long iommu_range_alloc(struct iommu_table *tbl, @@ -247,7 +272,7 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, unsigned int npages) { unsigned long entry; - unsigned long i; + unsigned long badbit; entry = dma_addr >> PAGE_SHIFT; @@ -255,11 +280,12 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, tce_free(tbl, entry, npages); - for (i = 0; i < npages; ++i) { - if (!test_bit(entry + i, tbl->it_map)) + badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages); + if (badbit != ~0UL) { + if (printk_ratelimit()) printk(KERN_ERR "Calgary: bit is off at 0x%lx " "tbl %p dma 0x%Lx entry 0x%lx npages %u\n", - entry + i, tbl, dma_addr, entry, npages); + badbit, tbl, dma_addr, entry, npages); } __clear_bit_string(tbl->it_map, entry, npages); -- cgit v1.2.1 From 4ccf4ae3144360ab9c00d0b53427f43369287bfb Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] remove tce_cache_blast_stress() tce_cache_blast_stress was useful during bringup to stress the IOMMU's cache flushing. Now that we quiesce DMAs on every cache flush, using _stress() brings the machine down to its knees once you put it under load. Remove this debug / bringup code that isn't useful anymore completely. Signed-off-by: Muli Ben-Yehuda Signed-off-by: Jon Mason Signed-off-by: Andi Kleen --- arch/x86_64/kernel/pci-calgary.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index 7c43cb0f71a3..cd866135e4ef 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -129,11 +129,6 @@ static void tce_cache_blast(struct iommu_table *tbl); #ifdef CONFIG_IOMMU_DEBUG int debugging __read_mostly = 1; -static inline void tce_cache_blast_stress(struct iommu_table *tbl) -{ - tce_cache_blast(tbl); -} - static inline unsigned long verify_bit_range(unsigned long* bitmap, int expected, unsigned long start, unsigned long end) { @@ -153,10 +148,6 @@ static inline unsigned long verify_bit_range(unsigned long* bitmap, #else /* debugging is disabled */ int debugging __read_mostly = 0; -static inline void tce_cache_blast_stress(struct iommu_table *tbl) -{ -} - static inline unsigned long verify_bit_range(unsigned long* bitmap, int expected, unsigned long start, unsigned long end) { @@ -289,8 +280,6 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, } __clear_bit_string(tbl->it_map, entry, npages); - - tce_cache_blast_stress(tbl); } static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, -- cgit v1.2.1 From 4ea8a5d8b57cd504b4b2de1212523848e7ab50cf Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] Calgary IOMMU: eradicate sole remaining 80 chars per line offender Signed-off-by: Muli Ben-Yehuda Signed-off-by: Jon Mason Signed-off-by: Andi Kleen --- arch/x86_64/kernel/pci-calgary.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index cd866135e4ef..466588f95601 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -86,7 +86,8 @@ #define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? */ #define MAX_NUM_CHASSIS 8 /* max number of chassis */ -#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2) /* max dev->bus->number */ +/* MAX_PHB_BUS_NUM is the maximal possible dev->bus->number */ +#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2) #define PHBS_PER_CALGARY 4 /* register offsets in Calgary's internal register space */ -- cgit v1.2.1 From 5a1b3999d6cb7ab87f1f3b1700bc91839fd6fa29 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:34 +0200 Subject: [PATCH] x86: Some preparationary cleanup for stack trace - Remove unused all_contexts parameter No caller used it - Move skip argument into the structure (needed for followon patches) Cc: mingo@elte.hu Signed-off-by: Andi Kleen --- arch/i386/kernel/stacktrace.c | 11 +++-------- arch/s390/kernel/stacktrace.c | 17 ++++++++--------- arch/x86_64/kernel/stacktrace.c | 14 +++++--------- include/linux/stacktrace.h | 7 ++++--- kernel/lockdep.c | 5 ++++- 5 files changed, 24 insertions(+), 30 deletions(-) diff --git a/arch/i386/kernel/stacktrace.c b/arch/i386/kernel/stacktrace.c index e62a037ab399..ae3c32a87add 100644 --- a/arch/i386/kernel/stacktrace.c +++ b/arch/i386/kernel/stacktrace.c @@ -61,12 +61,8 @@ save_context_stack(struct stack_trace *trace, unsigned int skip, /* * Save stack-backtrace addresses into a stack_trace buffer. - * If all_contexts is set, all contexts (hardirq, softirq and process) - * are saved. If not set then only the current context is saved. */ -void save_stack_trace(struct stack_trace *trace, - struct task_struct *task, int all_contexts, - unsigned int skip) +void save_stack_trace(struct stack_trace *trace, struct task_struct *task) { unsigned long ebp; unsigned long *stack = &ebp; @@ -85,10 +81,9 @@ void save_stack_trace(struct stack_trace *trace, struct thread_info *context = (struct thread_info *) ((unsigned long)stack & (~(THREAD_SIZE - 1))); - ebp = save_context_stack(trace, skip, context, stack, ebp); + ebp = save_context_stack(trace, trace->skip, context, stack, ebp); stack = (unsigned long *)context->previous_esp; - if (!all_contexts || !stack || - trace->nr_entries >= trace->max_entries) + if (!stack || trace->nr_entries >= trace->max_entries) break; trace->entries[trace->nr_entries++] = ULONG_MAX; if (trace->nr_entries >= trace->max_entries) diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c index de83f38288d0..d9428a0fc8fb 100644 --- a/arch/s390/kernel/stacktrace.c +++ b/arch/s390/kernel/stacktrace.c @@ -59,9 +59,7 @@ static inline unsigned long save_context_stack(struct stack_trace *trace, } } -void save_stack_trace(struct stack_trace *trace, - struct task_struct *task, int all_contexts, - unsigned int skip) +void save_stack_trace(struct stack_trace *trace, struct task_struct *task) { register unsigned long sp asm ("15"); unsigned long orig_sp; @@ -69,22 +67,23 @@ void save_stack_trace(struct stack_trace *trace, sp &= PSW_ADDR_INSN; orig_sp = sp; - sp = save_context_stack(trace, &skip, sp, + sp = save_context_stack(trace, &trace->skip, sp, S390_lowcore.panic_stack - PAGE_SIZE, S390_lowcore.panic_stack); - if ((sp != orig_sp) && !all_contexts) + if ((sp != orig_sp) && !trace->all_contexts) return; - sp = save_context_stack(trace, &skip, sp, + sp = save_context_stack(trace, &trace->skip, sp, S390_lowcore.async_stack - ASYNC_SIZE, S390_lowcore.async_stack); - if ((sp != orig_sp) && !all_contexts) + if ((sp != orig_sp) && !trace->all_contexts) return; if (task) - save_context_stack(trace, &skip, sp, + save_context_stack(trace, &trace->skip, sp, (unsigned long) task_stack_page(task), (unsigned long) task_stack_page(task) + THREAD_SIZE); else - save_context_stack(trace, &skip, sp, S390_lowcore.thread_info, + save_context_stack(trace, &trace->skip, sp, + S390_lowcore.thread_info, S390_lowcore.thread_info + THREAD_SIZE); return; } diff --git a/arch/x86_64/kernel/stacktrace.c b/arch/x86_64/kernel/stacktrace.c index 32cf55eb9af8..1c022af8fe1e 100644 --- a/arch/x86_64/kernel/stacktrace.c +++ b/arch/x86_64/kernel/stacktrace.c @@ -109,9 +109,10 @@ out_restore: * Save stack-backtrace addresses into a stack_trace buffer: */ static inline unsigned long -save_context_stack(struct stack_trace *trace, unsigned int skip, +save_context_stack(struct stack_trace *trace, unsigned long stack, unsigned long stack_end) { + int skip = trace->skip; unsigned long addr; #ifdef CONFIG_FRAME_POINTER @@ -159,12 +160,8 @@ save_context_stack(struct stack_trace *trace, unsigned int skip, /* * Save stack-backtrace addresses into a stack_trace buffer. - * If all_contexts is set, all contexts (hardirq, softirq and process) - * are saved. If not set then only the current context is saved. */ -void save_stack_trace(struct stack_trace *trace, - struct task_struct *task, int all_contexts, - unsigned int skip) +void save_stack_trace(struct stack_trace *trace, struct task_struct *task) { unsigned long stack = (unsigned long)&stack; int i, nr_stacks = 0, stacks_done[MAX_STACKS]; @@ -207,9 +204,8 @@ void save_stack_trace(struct stack_trace *trace, return; stacks_done[nr_stacks] = stack_end; - stack = save_context_stack(trace, skip, stack, stack_end); - if (!all_contexts || !stack || - trace->nr_entries >= trace->max_entries) + stack = save_context_stack(trace, stack, stack_end); + if (!stack || trace->nr_entries >= trace->max_entries) return; trace->entries[trace->nr_entries++] = ULONG_MAX; if (trace->nr_entries >= trace->max_entries) diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index 9cc81e572224..50e2b01e517c 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h @@ -5,15 +5,16 @@ struct stack_trace { unsigned int nr_entries, max_entries; unsigned long *entries; + int skip; /* input argument: How many entries to skip */ + int all_contexts; /* input argument: if true do than one stack */ }; extern void save_stack_trace(struct stack_trace *trace, - struct task_struct *task, int all_contexts, - unsigned int skip); + struct task_struct *task); extern void print_stack_trace(struct stack_trace *trace, int spaces); #else -# define save_stack_trace(trace, task, all, skip) do { } while (0) +# define save_stack_trace(trace, task) do { } while (0) # define print_stack_trace(trace) do { } while (0) #endif diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 9bad17884513..900b4cb1a024 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -224,7 +224,10 @@ static int save_trace(struct stack_trace *trace) trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; trace->entries = stack_trace + nr_stack_trace_entries; - save_stack_trace(trace, NULL, 0, 3); + trace->skip = 3; + trace->all_contexts = 0; + + save_stack_trace(trace, NULL); trace->max_entries = trace->nr_entries; -- cgit v1.2.1 From 3fa7c794fe4dc127f7fac3fad4d13628e68f89ce Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:34 +0200 Subject: [PATCH] Avoid recursion in lockdep when stack tracer takes locks The new dwarf2 unwinder needs to take locks to do backtraces inside modules. This patch makes sure lockdep which calls stacktrace is not reentered. Thanks to Ingo for suggesting this simpler approach. Cc: mingo@elte.hu Signed-off-by: Andi Kleen --- kernel/lockdep.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 900b4cb1a024..c088e5542e84 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -227,7 +227,11 @@ static int save_trace(struct stack_trace *trace) trace->skip = 3; trace->all_contexts = 0; + /* Make sure to not recurse in case the the unwinder needs to tak +e locks. */ + lockdep_off(); save_stack_trace(trace, NULL); + lockdep_on(); trace->max_entries = trace->nr_entries; -- cgit v1.2.1 From b7f5e3c7742d5332b78b831131f43fc3630e6322 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:34 +0200 Subject: [PATCH] Don't access the APIC in safe_smp_processor_id when it is not mapped yet Lockdep can call the dwarf2 unwinder early, and the dwarf2 code uses safe_smp_processor_id which tries to access the local APIC page. But that doesn't work before the APIC code has set up its fixmap. Check for this case and always return boot cpu then. Cc: jbeulich@novell.com Cc: mingo@elte.hu Signed-off-by: Andi Kleen --- arch/x86_64/kernel/apic.c | 2 ++ arch/x86_64/kernel/smp.c | 2 +- include/asm-x86_64/apic.h | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index b95d1e378b96..6472e321cad7 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -38,6 +38,7 @@ #include #include +int apic_mapped; int apic_verbosity; int apic_runs_main_timer; int apic_calibrate_pmtmr __initdata; @@ -600,6 +601,7 @@ void __init init_apic_mappings(void) apic_phys = mp_lapic_addr; set_fixmap_nocache(FIX_APIC_BASE, apic_phys); + apic_mapped = 1; apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys); /* diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index 06af6ca60129..2df988bbf312 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -526,7 +526,7 @@ int safe_smp_processor_id(void) { unsigned apicid, i; - if (disable_apic) + if (disable_apic || !apic_mapped) return 0; apicid = hard_smp_processor_id(); diff --git a/include/asm-x86_64/apic.h b/include/asm-x86_64/apic.h index 29ee735278f2..9e66d32330c9 100644 --- a/include/asm-x86_64/apic.h +++ b/include/asm-x86_64/apic.h @@ -18,6 +18,7 @@ extern int apic_verbosity; extern int apic_runs_main_timer; extern int ioapic_force; +extern int apic_mapped; /* * Define the default level of output to be very little -- cgit v1.2.1 From c9538ed49272fb244ac06ba643ff076a68a77e12 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:34 +0200 Subject: [PATCH] Move unwind_init earlier Needed for use of the unwinder in lockdep, because lockdep runs really early too. Cc: jbeulich@novell.com Signed-off-by: Andi Kleen --- init/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/main.c b/init/main.c index 6b69564738c1..913e48d658ee 100644 --- a/init/main.c +++ b/init/main.c @@ -468,6 +468,7 @@ asmlinkage void __init start_kernel(void) * Need to run as early as possible, to initialize the * lockdep hash: */ + unwind_init(); lockdep_init(); local_irq_disable(); @@ -506,7 +507,6 @@ asmlinkage void __init start_kernel(void) __stop___param - __start___param, &unknown_bootoption); sort_main_extable(); - unwind_init(); trap_init(); rcu_init(); init_IRQ(); -- cgit v1.2.1 From c0b766f13d8e1189ce4d00e54700c9d96b543b9a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:34 +0200 Subject: [PATCH] Merge stacktrace and show_trace This unifies the standard backtracer and the new stacktrace in memory backtracer. The standard one is converted to use callbacks and then reimplement stacktrace using new callbacks. The main advantage is that stacktrace can now use the new dwarf2 unwinder and avoid false positives in many cases. I kept it simple to make sure the standard backtracer stays reliable. Cc: mingo@elte.hu Signed-off-by: Andi Kleen --- arch/x86_64/kernel/stacktrace.c | 214 +++++----------------------------------- arch/x86_64/kernel/traps.c | 99 ++++++++++++++----- include/asm-x86_64/stacktrace.h | 18 ++++ 3 files changed, 120 insertions(+), 211 deletions(-) create mode 100644 include/asm-x86_64/stacktrace.h diff --git a/arch/x86_64/kernel/stacktrace.c b/arch/x86_64/kernel/stacktrace.c index 1c022af8fe1e..6026b31d037e 100644 --- a/arch/x86_64/kernel/stacktrace.c +++ b/arch/x86_64/kernel/stacktrace.c @@ -7,211 +7,49 @@ */ #include #include +#include +#include -#include - -static inline int -in_range(unsigned long start, unsigned long addr, unsigned long end) +static void save_stack_warning(void *data, char *msg) { - return addr >= start && addr <= end; } -static unsigned long -get_stack_end(struct task_struct *task, unsigned long stack) +static void +save_stack_warning_symbol(void *data, char *msg, unsigned long symbol) { - unsigned long stack_start, stack_end, flags; - int i, cpu; - - /* - * The most common case is that we are in the task stack: - */ - stack_start = (unsigned long)task->thread_info; - stack_end = stack_start + THREAD_SIZE; - - if (in_range(stack_start, stack, stack_end)) - return stack_end; - - /* - * We are in an interrupt if irqstackptr is set: - */ - raw_local_irq_save(flags); - cpu = safe_smp_processor_id(); - stack_end = (unsigned long)cpu_pda(cpu)->irqstackptr; - - if (stack_end) { - stack_start = stack_end & ~(IRQSTACKSIZE-1); - if (in_range(stack_start, stack, stack_end)) - goto out_restore; - /* - * We get here if we are in an IRQ context but we - * are also in an exception stack. - */ - } - - /* - * Iterate over all exception stacks, and figure out whether - * 'stack' is in one of them: - */ - for (i = 0; i < N_EXCEPTION_STACKS; i++) { - /* - * set 'end' to the end of the exception stack. - */ - stack_end = per_cpu(init_tss, cpu).ist[i]; - stack_start = stack_end - EXCEPTION_STKSZ; - - /* - * Is 'stack' above this exception frame's end? - * If yes then skip to the next frame. - */ - if (stack >= stack_end) - continue; - /* - * Is 'stack' above this exception frame's start address? - * If yes then we found the right frame. - */ - if (stack >= stack_start) - goto out_restore; - - /* - * If this is a debug stack, and if it has a larger size than - * the usual exception stacks, then 'stack' might still - * be within the lower portion of the debug stack: - */ -#if DEBUG_STKSZ > EXCEPTION_STKSZ - if (i == DEBUG_STACK - 1 && stack >= stack_end - DEBUG_STKSZ) { - /* - * Black magic. A large debug stack is composed of - * multiple exception stack entries, which we - * iterate through now. Dont look: - */ - do { - stack_end -= EXCEPTION_STKSZ; - stack_start -= EXCEPTION_STKSZ; - } while (stack < stack_start); - - goto out_restore; - } -#endif - } - /* - * Ok, 'stack' is not pointing to any of the system stacks. - */ - stack_end = 0; - -out_restore: - raw_local_irq_restore(flags); - - return stack_end; } - -/* - * Save stack-backtrace addresses into a stack_trace buffer: - */ -static inline unsigned long -save_context_stack(struct stack_trace *trace, - unsigned long stack, unsigned long stack_end) +static int save_stack_stack(void *data, char *name) { - int skip = trace->skip; - unsigned long addr; - -#ifdef CONFIG_FRAME_POINTER - unsigned long prev_stack = 0; + struct stack_trace *trace = (struct stack_trace *)data; + return trace->all_contexts ? 0 : -1; +} - while (in_range(prev_stack, stack, stack_end)) { - pr_debug("stack: %p\n", (void *)stack); - addr = (unsigned long)(((unsigned long *)stack)[1]); - pr_debug("addr: %p\n", (void *)addr); - if (!skip) - trace->entries[trace->nr_entries++] = addr-1; - else - skip--; - if (trace->nr_entries >= trace->max_entries) - break; - if (!addr) - return 0; - /* - * Stack frames must go forwards (otherwise a loop could - * happen if the stackframe is corrupted), so we move - * prev_stack forwards: - */ - prev_stack = stack; - stack = (unsigned long)(((unsigned long *)stack)[0]); - } - pr_debug("invalid: %p\n", (void *)stack); -#else - while (stack < stack_end) { - addr = ((unsigned long *)stack)[0]; - stack += sizeof(long); - if (__kernel_text_address(addr)) { - if (!skip) - trace->entries[trace->nr_entries++] = addr-1; - else - skip--; - if (trace->nr_entries >= trace->max_entries) - break; - } +static void save_stack_address(void *data, unsigned long addr) +{ + struct stack_trace *trace = (struct stack_trace *)data; + if (trace->skip > 0) { + trace->skip--; + return; } -#endif - return stack; + if (trace->nr_entries < trace->max_entries - 1) + trace->entries[trace->nr_entries++] = addr; } -#define MAX_STACKS 10 +static struct stacktrace_ops save_stack_ops = { + .warning = save_stack_warning, + .warning_symbol = save_stack_warning_symbol, + .stack = save_stack_stack, + .address = save_stack_address, +}; /* * Save stack-backtrace addresses into a stack_trace buffer. */ void save_stack_trace(struct stack_trace *trace, struct task_struct *task) { - unsigned long stack = (unsigned long)&stack; - int i, nr_stacks = 0, stacks_done[MAX_STACKS]; - - WARN_ON(trace->nr_entries || !trace->max_entries); - - if (!task) - task = current; - - pr_debug("task: %p, ti: %p\n", task, task->thread_info); - - if (!task || task == current) { - /* Grab rbp right from our regs: */ - asm ("mov %%rbp, %0" : "=r" (stack)); - pr_debug("rbp: %p\n", (void *)stack); - } else { - /* rbp is the last reg pushed by switch_to(): */ - stack = task->thread.rsp; - pr_debug("other task rsp: %p\n", (void *)stack); - stack = (unsigned long)(((unsigned long *)stack)[0]); - pr_debug("other task rbp: %p\n", (void *)stack); - } - - while (1) { - unsigned long stack_end = get_stack_end(task, stack); - - pr_debug("stack: %p\n", (void *)stack); - pr_debug("stack end: %p\n", (void *)stack_end); - - /* - * Invalid stack addres? - */ - if (!stack_end) - return; - /* - * Were we in this stack already? (recursion) - */ - for (i = 0; i < nr_stacks; i++) - if (stacks_done[i] == stack_end) - return; - stacks_done[nr_stacks] = stack_end; - - stack = save_context_stack(trace, stack, stack_end); - if (!stack || trace->nr_entries >= trace->max_entries) - return; - trace->entries[trace->nr_entries++] = ULONG_MAX; - if (trace->nr_entries >= trace->max_entries) - return; - if (++nr_stacks >= MAX_STACKS) - return; - } + dump_trace(task, NULL, NULL, &save_stack_ops, trace); + trace->entries[trace->nr_entries++] = ULONG_MAX; } +EXPORT_SYMBOL(save_stack_trace); diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 9ec2b1d5893d..4ac18b02eada 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -45,6 +45,7 @@ #include #include #include +#include asmlinkage void divide_error(void); asmlinkage void debug(void); @@ -142,7 +143,7 @@ void printk_address(unsigned long address) #endif static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, - unsigned *usedp, const char **idp) + unsigned *usedp, char **idp) { static char ids[][8] = { [DEBUG_STACK - 1] = "#DB", @@ -234,13 +235,19 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, return NULL; } -static int show_trace_unwind(struct unwind_frame_info *info, void *context) +struct ops_and_data { + struct stacktrace_ops *ops; + void *data; +}; + +static int dump_trace_unwind(struct unwind_frame_info *info, void *context) { + struct ops_and_data *oad = (struct ops_and_data *)context; int n = 0; while (unwind(info) == 0 && UNW_PC(info)) { n++; - printk_address(UNW_PC(info)); + oad->ops->address(oad->data, UNW_PC(info)); if (arch_unw_user_mode(info)) break; } @@ -254,45 +261,51 @@ static int show_trace_unwind(struct unwind_frame_info *info, void *context) * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack */ -void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack) +void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack, + struct stacktrace_ops *ops, void *data) { const unsigned cpu = safe_smp_processor_id(); unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; unsigned used = 0; - printk("\nCall Trace:\n"); - if (!tsk) tsk = current; if (call_trace >= 0) { int unw_ret = 0; struct unwind_frame_info info; + struct ops_and_data oad = { .ops = ops, .data = data }; if (regs) { if (unwind_init_frame_info(&info, tsk, regs) == 0) - unw_ret = show_trace_unwind(&info, NULL); + unw_ret = dump_trace_unwind(&info, &oad); } else if (tsk == current) - unw_ret = unwind_init_running(&info, show_trace_unwind, NULL); + unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad); else { if (unwind_init_blocked(&info, tsk) == 0) - unw_ret = show_trace_unwind(&info, NULL); + unw_ret = dump_trace_unwind(&info, &oad); } if (unw_ret > 0) { if (call_trace == 1 && !arch_unw_user_mode(&info)) { - print_symbol("DWARF2 unwinder stuck at %s\n", + ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n", UNW_PC(&info)); if ((long)UNW_SP(&info) < 0) { - printk("Leftover inexact backtrace:\n"); + ops->warning(data, "Leftover inexact backtrace:\n"); stack = (unsigned long *)UNW_SP(&info); } else - printk("Full inexact backtrace again:\n"); + ops->warning(data, "Full inexact backtrace again:\n"); } else if (call_trace >= 1) return; else - printk("Full inexact backtrace again:\n"); + ops->warning(data, "Full inexact backtrace again:\n"); } else - printk("Inexact backtrace:\n"); + ops->warning(data, "Inexact backtrace:\n"); + } + if (!stack) { + unsigned long dummy; + stack = &dummy; + if (tsk && tsk != current) + stack = (unsigned long *)tsk->thread.rsp; } /* @@ -312,7 +325,7 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s * down the cause of the crash will be able to figure \ * out the call path that was taken. \ */ \ - printk_address(addr); \ + ops->address(data, addr); \ } \ } while (0) @@ -321,16 +334,17 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s * current stack address. If the stacks consist of nested * exceptions */ - for ( ; ; ) { - const char *id; + for (;;) { + char *id; unsigned long *estack_end; estack_end = in_exception_stack(cpu, (unsigned long)stack, &used, &id); if (estack_end) { - printk(" <%s>", id); + if (ops->stack(data, id) < 0) + break; HANDLE_STACK (stack < estack_end); - printk(" "); + ops->stack(data, ""); /* * We link to the next stack via the * second-to-last pointer (index -2 to end) in the @@ -345,7 +359,8 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s (IRQSTACKSIZE - 64) / sizeof(*irqstack); if (stack >= irqstack && stack < irqstack_end) { - printk(" "); + if (ops->stack(data, "IRQ") < 0) + break; HANDLE_STACK (stack < irqstack_end); /* * We link to the next stack (which would be @@ -354,7 +369,7 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s */ stack = (unsigned long *) (irqstack_end[-1]); irqstack_end = NULL; - printk(" "); + ops->stack(data, "EOI"); continue; } } @@ -362,15 +377,53 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s } /* - * This prints the process stack: + * This handles the process stack: */ HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0); #undef HANDLE_STACK +} +EXPORT_SYMBOL(dump_trace); +static void +print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ + print_symbol(msg, symbol); + printk("\n"); +} + +static void print_trace_warning(void *data, char *msg) +{ + printk("%s\n", msg); +} + +static int print_trace_stack(void *data, char *name) +{ + printk(" <%s> ", name); + return 0; +} + +static void print_trace_address(void *data, unsigned long addr) +{ + printk_address(addr); +} + +static struct stacktrace_ops print_trace_ops = { + .warning = print_trace_warning, + .warning_symbol = print_trace_warning_symbol, + .stack = print_trace_stack, + .address = print_trace_address, +}; + +void +show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack) +{ + printk("\nCall Trace:\n"); + dump_trace(tsk, regs, stack, &print_trace_ops, NULL); printk("\n"); } -static void _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long * rsp) +static void +_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp) { unsigned long *stack; int i; diff --git a/include/asm-x86_64/stacktrace.h b/include/asm-x86_64/stacktrace.h new file mode 100644 index 000000000000..5eb9799bef76 --- /dev/null +++ b/include/asm-x86_64/stacktrace.h @@ -0,0 +1,18 @@ +#ifndef _ASM_STACKTRACE_H +#define _ASM_STACKTRACE_H 1 + +/* Generic stack tracer with callbacks */ + +struct stacktrace_ops { + void (*warning)(void *data, char *msg); + /* msg must contain %s for the symbol */ + void (*warning_symbol)(void *data, char *msg, unsigned long symbol); + void (*address)(void *data, unsigned long address); + /* On negative return stop dumping */ + int (*stack)(void *data, char *name); +}; + +void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack, + struct stacktrace_ops *ops, void *data); + +#endif -- cgit v1.2.1 From be7a91709b90825990e571b2f20cea937d5eef6c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:34 +0200 Subject: [PATCH] Check for end of stack trace before falling back Signed-off-by: Andi Kleen --- arch/x86_64/kernel/traps.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 4ac18b02eada..28e53342f294 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -292,6 +292,8 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s if ((long)UNW_SP(&info) < 0) { ops->warning(data, "Leftover inexact backtrace:\n"); stack = (unsigned long *)UNW_SP(&info); + if (!stack) + return; } else ops->warning(data, "Full inexact backtrace again:\n"); } else if (call_trace >= 1) -- cgit v1.2.1 From 2b14a78cd07a52001b8c3865ed615d8b9b905b78 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:34 +0200 Subject: [PATCH] i386: Do stacktracer conversion too Following x86-64 patches. Reuses code from them in fact. Convert the standard backtracer to do all output using callbacks. Use the x86-64 stack tracer implementation that uses these callbacks to implement the stacktrace interface. This allows to use the new dwarf2 unwinder for stacktrace and get better backtraces. Cc: mingo@elte.hu Signed-off-by: Andi Kleen --- arch/i386/kernel/Makefile | 1 + arch/i386/kernel/stacktrace.c | 93 ------------------------------------ arch/i386/kernel/traps.c | 108 +++++++++++++++++++++++++++++++----------- include/asm-i386/stacktrace.h | 1 + 4 files changed, 82 insertions(+), 121 deletions(-) delete mode 100644 arch/i386/kernel/stacktrace.c create mode 100644 include/asm-i386/stacktrace.h diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index dab497472deb..1a884b6e6e5c 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -81,4 +81,5 @@ $(obj)/vsyscall-syms.o: $(src)/vsyscall.lds \ $(call if_changed,syscall) k8-y += ../../x86_64/kernel/k8.o +stacktrace-y += ../../x86_64/kernel/stacktrace.o diff --git a/arch/i386/kernel/stacktrace.c b/arch/i386/kernel/stacktrace.c deleted file mode 100644 index ae3c32a87add..000000000000 --- a/arch/i386/kernel/stacktrace.c +++ /dev/null @@ -1,93 +0,0 @@ -/* - * arch/i386/kernel/stacktrace.c - * - * Stack trace management functions - * - * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar - */ -#include -#include - -static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) -{ - return p > (void *)tinfo && - p < (void *)tinfo + THREAD_SIZE - 3; -} - -/* - * Save stack-backtrace addresses into a stack_trace buffer: - */ -static inline unsigned long -save_context_stack(struct stack_trace *trace, unsigned int skip, - struct thread_info *tinfo, unsigned long *stack, - unsigned long ebp) -{ - unsigned long addr; - -#ifdef CONFIG_FRAME_POINTER - while (valid_stack_ptr(tinfo, (void *)ebp)) { - addr = *(unsigned long *)(ebp + 4); - if (!skip) - trace->entries[trace->nr_entries++] = addr; - else - skip--; - if (trace->nr_entries >= trace->max_entries) - break; - /* - * break out of recursive entries (such as - * end_of_stack_stop_unwind_function): - */ - if (ebp == *(unsigned long *)ebp) - break; - - ebp = *(unsigned long *)ebp; - } -#else - while (valid_stack_ptr(tinfo, stack)) { - addr = *stack++; - if (__kernel_text_address(addr)) { - if (!skip) - trace->entries[trace->nr_entries++] = addr; - else - skip--; - if (trace->nr_entries >= trace->max_entries) - break; - } - } -#endif - - return ebp; -} - -/* - * Save stack-backtrace addresses into a stack_trace buffer. - */ -void save_stack_trace(struct stack_trace *trace, struct task_struct *task) -{ - unsigned long ebp; - unsigned long *stack = &ebp; - - WARN_ON(trace->nr_entries || !trace->max_entries); - - if (!task || task == current) { - /* Grab ebp right from our regs: */ - asm ("movl %%ebp, %0" : "=r" (ebp)); - } else { - /* ebp is the last reg pushed by switch_to(): */ - ebp = *(unsigned long *) task->thread.esp; - } - - while (1) { - struct thread_info *context = (struct thread_info *) - ((unsigned long)stack & (~(THREAD_SIZE - 1))); - - ebp = save_context_stack(trace, trace->skip, context, stack, ebp); - stack = (unsigned long *)context->previous_esp; - if (!stack || trace->nr_entries >= trace->max_entries) - break; - trace->entries[trace->nr_entries++] = ULONG_MAX; - if (trace->nr_entries >= trace->max_entries) - break; - } -} - diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 3c85c89f68d8..4ced4285163b 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -51,6 +51,7 @@ #include #include #include +#include #include @@ -118,26 +119,16 @@ static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) p < (void *)tinfo + THREAD_SIZE - 3; } -/* - * Print one address/symbol entries per line. - */ -static inline void print_addr_and_symbol(unsigned long addr, char *log_lvl) -{ - printk(" [<%08lx>] ", addr); - - print_symbol("%s\n", addr); -} - static inline unsigned long print_context_stack(struct thread_info *tinfo, unsigned long *stack, unsigned long ebp, - char *log_lvl) + struct stacktrace_ops *ops, void *data) { unsigned long addr; #ifdef CONFIG_FRAME_POINTER while (valid_stack_ptr(tinfo, (void *)ebp)) { addr = *(unsigned long *)(ebp + 4); - print_addr_and_symbol(addr, log_lvl); + ops->address(data, addr); /* * break out of recursive entries (such as * end_of_stack_stop_unwind_function): @@ -150,28 +141,35 @@ static inline unsigned long print_context_stack(struct thread_info *tinfo, while (valid_stack_ptr(tinfo, stack)) { addr = *stack++; if (__kernel_text_address(addr)) - print_addr_and_symbol(addr, log_lvl); + ops->address(data, addr); } #endif return ebp; } +struct ops_and_data { + struct stacktrace_ops *ops; + void *data; +}; + static asmlinkage int -show_trace_unwind(struct unwind_frame_info *info, void *log_lvl) +dump_trace_unwind(struct unwind_frame_info *info, void *data) { + struct ops_and_data *oad = (struct ops_and_data *)data; int n = 0; while (unwind(info) == 0 && UNW_PC(info)) { n++; - print_addr_and_symbol(UNW_PC(info), log_lvl); + oad->ops->address(oad->data, UNW_PC(info)); if (arch_unw_user_mode(info)) break; } return n; } -static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, char *log_lvl) +void dump_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, + struct stacktrace_ops *ops, void *data) { unsigned long ebp; @@ -181,31 +179,37 @@ static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, if (call_trace >= 0) { int unw_ret = 0; struct unwind_frame_info info; + struct ops_and_data oad = { .ops = ops, .data = data }; if (regs) { if (unwind_init_frame_info(&info, task, regs) == 0) - unw_ret = show_trace_unwind(&info, log_lvl); + unw_ret = dump_trace_unwind(&info, &oad); } else if (task == current) - unw_ret = unwind_init_running(&info, show_trace_unwind, log_lvl); + unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad); else { if (unwind_init_blocked(&info, task) == 0) - unw_ret = show_trace_unwind(&info, log_lvl); + unw_ret = dump_trace_unwind(&info, &oad); } if (unw_ret > 0) { if (call_trace == 1 && !arch_unw_user_mode(&info)) { - print_symbol("DWARF2 unwinder stuck at %s\n", + ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n", UNW_PC(&info)); if (UNW_SP(&info) >= PAGE_OFFSET) { - printk("Leftover inexact backtrace:\n"); + ops->warning(data, "Leftover inexact backtrace:\n"); stack = (void *)UNW_SP(&info); } else - printk("Full inexact backtrace again:\n"); + ops->warning(data, "Full inexact backtrace again:\n"); } else if (call_trace >= 1) return; else - printk("Full inexact backtrace again:\n"); + ops->warning(data, "Full inexact backtrace again:\n"); } else - printk("Inexact backtrace:\n"); + ops->warning(data, "Inexact backtrace:\n"); + } else if (!stack) { + unsigned long dummy; + stack = &dummy; + if (task && task != current) + stack = (unsigned long *)task->thread.esp; } if (task == current) { @@ -220,15 +224,63 @@ static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, struct thread_info *context; context = (struct thread_info *) ((unsigned long)stack & (~(THREAD_SIZE - 1))); - ebp = print_context_stack(context, stack, ebp, log_lvl); + ebp = print_context_stack(context, stack, ebp, ops, data); + /* Should be after the line below, but somewhere + in early boot context comes out corrupted and we + can't reference it -AK */ + if (ops->stack(data, "IRQ") < 0) + break; stack = (unsigned long*)context->previous_esp; if (!stack) break; - printk("%s =======================\n", log_lvl); } } +EXPORT_SYMBOL(dump_trace); + +static void +print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ + printk(data); + print_symbol(msg, symbol); + printk("\n"); +} + +static void print_trace_warning(void *data, char *msg) +{ + printk("%s%s\n", (char *)data, msg); +} + +static int print_trace_stack(void *data, char *name) +{ + return 0; +} + +/* + * Print one address/symbol entries per line. + */ +static void print_trace_address(void *data, unsigned long addr) +{ + printk("%s [<%08lx>] ", (char *)data, addr); + print_symbol("%s\n", addr); +} + +static struct stacktrace_ops print_trace_ops = { + .warning = print_trace_warning, + .warning_symbol = print_trace_warning_symbol, + .stack = print_trace_stack, + .address = print_trace_address, +}; + +static void +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long * stack, char *log_lvl) +{ + dump_trace(task, regs, stack, &print_trace_ops, log_lvl); + printk("%s =======================\n", log_lvl); +} -void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long * stack) +void show_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long * stack) { show_trace_log_lvl(task, regs, stack, ""); } diff --git a/include/asm-i386/stacktrace.h b/include/asm-i386/stacktrace.h new file mode 100644 index 000000000000..7d1f6a5cbfca --- /dev/null +++ b/include/asm-i386/stacktrace.h @@ -0,0 +1 @@ +#include -- cgit v1.2.1 From 950fee84557416a3427dd404a13addc4be7b3e6c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:34 +0200 Subject: [PATCH] i386: Terminate backtrace fallback early if unwinder stack pointer is zero Cc: jbeulich@novell.com Signed-off-by: Andi Kleen --- arch/i386/kernel/traps.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 4ced4285163b..86fa7e47f301 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -197,6 +197,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (UNW_SP(&info) >= PAGE_OFFSET) { ops->warning(data, "Leftover inexact backtrace:\n"); stack = (void *)UNW_SP(&info); + if (!stack) + return; } else ops->warning(data, "Full inexact backtrace again:\n"); } else if (call_trace >= 1) -- cgit v1.2.1 From a32cf3975bed3b84491f8ffeb24abe8c45d86ab0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:34 +0200 Subject: [PATCH] i386: Get ebp from unwinder state when continuing fallback backtrace Cc: jbeulich@novell.com Signed-off-by: Andi Kleen --- arch/i386/kernel/traps.c | 22 ++++++++++++++-------- include/asm-i386/unwind.h | 3 +++ 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 86fa7e47f301..bdf949c30c7c 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -171,7 +171,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, struct stacktrace_ops *ops, void *data) { - unsigned long ebp; + unsigned long ebp = 0; if (!task) task = current; @@ -199,6 +199,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, stack = (void *)UNW_SP(&info); if (!stack) return; + ebp = UNW_FP(&info); } else ops->warning(data, "Full inexact backtrace again:\n"); } else if (call_trace >= 1) @@ -207,20 +208,25 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, ops->warning(data, "Full inexact backtrace again:\n"); } else ops->warning(data, "Inexact backtrace:\n"); - } else if (!stack) { + } + if (!stack) { unsigned long dummy; stack = &dummy; if (task && task != current) stack = (unsigned long *)task->thread.esp; } - if (task == current) { - /* Grab ebp right from our regs */ - asm ("movl %%ebp, %0" : "=r" (ebp) : ); - } else { - /* ebp is the last reg pushed by switch_to */ - ebp = *(unsigned long *) task->thread.esp; +#ifdef CONFIG_FRAME_POINTER + if (!ebp) { + if (task == current) { + /* Grab ebp right from our regs */ + asm ("movl %%ebp, %0" : "=r" (ebp) : ); + } else { + /* ebp is the last reg pushed by switch_to */ + ebp = *(unsigned long *) task->thread.esp; + } } +#endif while (1) { struct thread_info *context; diff --git a/include/asm-i386/unwind.h b/include/asm-i386/unwind.h index 4c1a0b968569..f0ac399bae3c 100644 --- a/include/asm-i386/unwind.h +++ b/include/asm-i386/unwind.h @@ -28,6 +28,8 @@ struct unwind_frame_info #define FRAME_LINK_OFFSET 0 #define STACK_BOTTOM(tsk) STACK_LIMIT((tsk)->thread.esp0) #define STACK_TOP(tsk) ((tsk)->thread.esp0) +#else +#define UNW_FP(frame) ((void)(frame), 0) #endif #define STACK_LIMIT(ptr) (((ptr) - 1) & ~(THREAD_SIZE - 1)) @@ -88,6 +90,7 @@ static inline int arch_unw_user_mode(const struct unwind_frame_info *info) #define UNW_PC(frame) ((void)(frame), 0) #define UNW_SP(frame) ((void)(frame), 0) +#define UNW_FP(frame) ((void)(frame), 0) static inline int arch_unw_user_mode(const void *info) { -- cgit v1.2.1 From 3b8d1fe0f28202ce1592d9fbc216959b49b72c95 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:34 +0200 Subject: [PATCH] Don't force frame pointers for lockdep Now that stacktrace supports dwarf2 don't force frame pointers for lockdep anymore Cc: mingo@elte.hu Signed-off-by: Andi Kleen --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 554ee688a9f8..c795a17a7043 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -218,7 +218,7 @@ config LOCKDEP bool depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT select STACKTRACE - select FRAME_POINTER + select FRAME_POINTER if !X86 select KALLSYMS select KALLSYMS_ALL -- cgit v1.2.1 From 5758d5dfef1c514200bda3f29ba700f1c3e3bc99 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 26 Sep 2006 10:52:34 +0200 Subject: [PATCH] i386: fix dubious segment register clear in cpu_init() Fix a very dubious piece of code in arch/i386/kernel/cpu/common.c:cpu_init(). This clears out %fs and %gs, but clobbers %eax in the process without telling gcc. It turns out that gcc happens to be not using %eax at that point anyway so it doesn't matter much, but it looks like a bomb waiting to go off. This does end up saving an instruction, because gcc wants %eax==0 for the set_debugreg()s below. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index 70c87de582c7..730a7ab75009 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -675,7 +675,7 @@ old_gdt: #endif /* Clear %fs and %gs. */ - asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs"); + asm volatile ("movl %0, %%fs; movl %0, %%gs" : : "r" (0)); /* Clear all 6 debug registers: */ set_debugreg(0, 0); -- cgit v1.2.1 From 3ca113ea74836a80645c79adba24caaa7a74120c Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Tue, 26 Sep 2006 10:52:34 +0200 Subject: [PATCH] i386: don't taint UP K7's running SMP kernels. We have a test that looks for invalid pairings of certain athlon/durons that weren't designed for SMP, and taint accordingly (with 'S') if we find such a configuration. However, this test shouldn't fire if there's only a single CPU present. It's perfectly valid for an SMP kernel to boot on UP hardware for example. AK: changed to num_possible_cpus() Signed-off-by: Dave Jones Signed-off-by: Andi Kleen --- arch/i386/kernel/smpboot.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 517eb3874550..020d873b7d21 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -177,6 +177,9 @@ static void __devinit smp_store_cpu_info(int id) */ if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) { + if (num_possible_cpus() == 1) + goto valid_k7; + /* Athlon 660/661 is valid. */ if ((c->x86_model==6) && ((c->x86_mask==0) || (c->x86_mask==1))) goto valid_k7; -- cgit v1.2.1 From d28c4393a7bf558538e9def269c1caeab6ec056f Mon Sep 17 00:00:00 2001 From: "Prasanna S.P" Date: Tue, 26 Sep 2006 10:52:34 +0200 Subject: [PATCH] x86: error_code is not safe for kprobes This patch moves the entry.S:error_entry to .kprobes.text section, since code marked unsafe for kprobes jumps directly to entry.S::error_entry, that must be marked unsafe as well. This patch also moves all the ".previous.text" asm directives to ".previous" for kprobes section. AK: Following a similar i386 patch from Chuck Ebbert AK: Also merged Jeremy's fix in. +From: Jeremy Fitzhardinge KPROBE_ENTRY does a .section .kprobes.text, and expects its users to do a .previous at the end of the function. Unfortunately, if any code within the function switches sections, for example .fixup, then the .previous ends up putting all subsequent code into .fixup. Worse, any subsequent .fixup code gets intermingled with the code its supposed to be fixing (which is also in .fixup). It's surprising this didn't cause more havok. The fix is to use .pushsection/.popsection, so this stuff nests properly. A further cleanup would be to get rid of all .section/.previous pairs, since they're inherently fragile. +From: Chuck Ebbert <76306.1226@compuserve.com> Because code marked unsafe for kprobes jumps directly to entry.S::error_code, that must be marked unsafe as well. The easiest way to do that is to move the page fault entry point to just before error_code and let it inherit the same section. Also moved all the ".previous" asm directives for kprobes sections to column 1 and removed ".text" from them. Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Signed-off-by: Andi Kleen --- arch/i386/kernel/entry.S | 25 +++++++++++++------------ arch/x86_64/kernel/entry.S | 19 +++++++------------ include/linux/linkage.h | 6 +++++- 3 files changed, 25 insertions(+), 25 deletions(-) diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 87f9f60b803b..ba22ec8fab54 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -591,11 +591,9 @@ ENTRY(name) \ /* The include is where all of the SMP etc. interrupts come from */ #include "entry_arch.h" -ENTRY(divide_error) - RING0_INT_FRAME - pushl $0 # no error code - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_divide_error +KPROBE_ENTRY(page_fault) + RING0_EC_FRAME + pushl $do_page_fault CFI_ADJUST_CFA_OFFSET 4 ALIGN error_code: @@ -645,6 +643,7 @@ error_code: call *%edi jmp ret_from_exception CFI_ENDPROC +KPROBE_END(page_fault) ENTRY(coprocessor_error) RING0_INT_FRAME @@ -720,7 +719,8 @@ debug_stack_correct: call do_debug jmp ret_from_exception CFI_ENDPROC - .previous .text +KPROBE_END(debug) + /* * NMI is doubly nasty. It can happen _while_ we're handling * a debug fault, and the debug fault hasn't yet been able to @@ -816,7 +816,7 @@ KPROBE_ENTRY(int3) call do_int3 jmp ret_from_exception CFI_ENDPROC - .previous .text +KPROBE_END(int3) ENTRY(overflow) RING0_INT_FRAME @@ -881,7 +881,7 @@ KPROBE_ENTRY(general_protection) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC - .previous .text +KPROBE_END(general_protection) ENTRY(alignment_check) RING0_EC_FRAME @@ -890,13 +890,14 @@ ENTRY(alignment_check) jmp error_code CFI_ENDPROC -KPROBE_ENTRY(page_fault) - RING0_EC_FRAME - pushl $do_page_fault +ENTRY(divide_error) + RING0_INT_FRAME + pushl $0 # no error code + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_divide_error CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC - .previous .text #ifdef CONFIG_X86_MCE ENTRY(machine_check) diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 2092f565aa87..780f9b26169f 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -819,7 +819,7 @@ paranoid_schedule\trace: * Exception entry point. This expects an error code/orig_rax on the stack * and the exception handler in %rax. */ -ENTRY(error_entry) +KPROBE_ENTRY(error_entry) _frame RDI /* rdi slot contains rax, oldrax contains error code */ cld @@ -903,7 +903,7 @@ error_kernelspace: cmpq $gs_change,RIP(%rsp) je error_swapgs jmp error_sti -END(error_entry) +KPROBE_END(error_entry) /* Reload gs selector with exception handling */ /* edi: new selector */ @@ -1025,8 +1025,7 @@ ENDPROC(execve) KPROBE_ENTRY(page_fault) errorentry do_page_fault -END(page_fault) - .previous .text +KPROBE_END(page_fault) ENTRY(coprocessor_error) zeroentry do_coprocessor_error @@ -1047,8 +1046,7 @@ KPROBE_ENTRY(debug) CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_debug, DEBUG_STACK paranoidexit -END(debug) - .previous .text +KPROBE_END(debug) /* runs on exception stack */ KPROBE_ENTRY(nmi) @@ -1062,8 +1060,7 @@ KPROBE_ENTRY(nmi) jmp paranoid_exit1 CFI_ENDPROC #endif -END(nmi) - .previous .text +KPROBE_END(nmi) KPROBE_ENTRY(int3) INTR_FRAME @@ -1072,8 +1069,7 @@ KPROBE_ENTRY(int3) paranoidentry do_int3, DEBUG_STACK jmp paranoid_exit1 CFI_ENDPROC -END(int3) - .previous .text +KPROBE_END(int3) ENTRY(overflow) zeroentry do_overflow @@ -1121,8 +1117,7 @@ END(stack_segment) KPROBE_ENTRY(general_protection) errorentry do_general_protection -END(general_protection) - .previous .text +KPROBE_END(general_protection) ENTRY(alignment_check) errorentry do_alignment_check diff --git a/include/linux/linkage.h b/include/linux/linkage.h index 932021f872d5..6c9873f88287 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -35,9 +35,13 @@ #endif #define KPROBE_ENTRY(name) \ - .section .kprobes.text, "ax"; \ + .pushsection .kprobes.text, "ax"; \ ENTRY(name) +#define KPROBE_END(name) \ + END(name); \ + .popsection + #ifndef END #define END(name) \ .size name, .-name -- cgit v1.2.1 From cbf9b4bb76c9ce53b7fdde0dffcd000951b5f0d4 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Tue, 26 Sep 2006 10:52:34 +0200 Subject: [PATCH] X86_64 monotonic_clock goes backwards I've noticed some erratic behavior while testing the X86_64 version of monotonic_clock(). While spinning in a loop reading monotonic clock values (pinned to a single cpu) I noticed that the difference between subsequent values occasionally went negative (time going backwards). I found that in the following code: this_offset = get_cycles_sync(); /* FIXME: 1000 or 1000000? */ --> offset = (this_offset - last_offset)*1000 / cpu_khz; } return base + offset; the offset sometimes turns out to be 0, even though this_offset > last_offset. +Added fix From: Toyo Abe The x86_64-mm-monotonic-clock.patch in 2.6.18-rc4-mm2 made a change to the updating of monotonic_base. It now uses cycles_2_ns(). I suggest that a set_cyc2ns_scale() should be done prior to the setup_irq(). Because cycles_2_ns() can be called from the timer ISR right after the irq0 is enabled. Signed-off-by: Toyo Abe Signed-off-by: Dimitri Sivanich Signed-off-by: Andi Kleen --- arch/x86_64/kernel/time.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index d66c7f750e75..97115e608ed8 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -276,6 +276,7 @@ static void set_rtc_mmss(unsigned long nowtime) * Note: This function is required to return accurate * time even in the absence of multiple timer ticks. */ +static inline unsigned long long cycles_2_ns(unsigned long long cyc); unsigned long long monotonic_clock(void) { unsigned long seq; @@ -300,8 +301,7 @@ unsigned long long monotonic_clock(void) base = monotonic_base; } while (read_seqretry(&xtime_lock, seq)); this_offset = get_cycles_sync(); - /* FIXME: 1000 or 1000000? */ - offset = (this_offset - last_offset)*1000 / cpu_khz; + offset = cycles_2_ns(this_offset - last_offset); } return base + offset; } @@ -405,8 +405,7 @@ void main_timer_handler(struct pt_regs *regs) offset %= USEC_PER_TICK; } - /* FIXME: 1000 or 1000000? */ - monotonic_base += (tsc - vxtime.last_tsc) * 1000000 / cpu_khz; + monotonic_base += cycles_2_ns(tsc - vxtime.last_tsc); vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot; @@ -929,10 +928,8 @@ void __init time_init(void) vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz; vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; vxtime.last_tsc = get_cycles_sync(); - setup_irq(0, &irq0); - set_cyc2ns_scale(cpu_khz); - + setup_irq(0, &irq0); hotcpu_notifier(time_cpu_notifier, 0); time_cpu_notifier(NULL, CPU_ONLINE, (void *)(long)smp_processor_id()); -- cgit v1.2.1 From 1edf777803bdd2aeeb04cf44508fd9b88737fba8 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:35 +0200 Subject: [PATCH] i386/x86-64: Improve Kconfig description of CRASH_DUMP Improve Kconfig description of CONFIG_CRASH_DUMP. Previously it was too brief to be useful. Cc: vgoyal@in.ibm.com Cc: ebiederm@xmission.com Signed-off-by: Andi Kleen --- arch/i386/Kconfig | 7 +++++++ arch/x86_64/Kconfig | 9 ++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index c134a545b37e..e34a4dbd6b92 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -761,6 +761,13 @@ config CRASH_DUMP depends on HIGHMEM help Generate crash dump after being started by kexec. + This should be normally only set in special crash dump kernels + which are loaded in the main kernel with kexec-tools into + a specially reserved region and then later executed after + a crash by kdump/kexec. The crash dump kernel must be compiled + to a memory address not used by the main kernel or BIOS using + PHYSICAL_START. + For more details see Documentation/kdump/kdump.txt config PHYSICAL_START hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 6cd4878625f1..927b392a4aa0 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -488,7 +488,14 @@ config CRASH_DUMP bool "kernel crash dumps (EXPERIMENTAL)" depends on EXPERIMENTAL help - Generate crash dump after being started by kexec. + Generate crash dump after being started by kexec. + This should be normally only set in special crash dump kernels + which are loaded in the main kernel with kexec-tools into + a specially reserved region and then later executed after + a crash by kdump/kexec. The crash dump kernel must be compiled + to a memory address not used by the main kernel or BIOS using + PHYSICAL_START. + For more details see Documentation/kdump/kdump.txt config PHYSICAL_START hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) -- cgit v1.2.1 From 2717941c6a1d8fa543ddca337d450ab30ef91543 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:35 +0200 Subject: [PATCH] Make boot_param_data pure BSS Since it's all zero. Actually I think gcc 4+ will do that automatically, but earlier compilers won't Signed-off-by: Andi Kleen --- arch/x86_64/kernel/setup64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index 646caa0a20bb..b09e60fa96b4 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -24,7 +24,7 @@ #include #include -char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,}; +char x86_boot_params[BOOT_PARAM_SIZE] __initdata; cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; -- cgit v1.2.1 From 5e4edbb711417be40f350a319db39888a4edd450 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:35 +0200 Subject: [PATCH] i386: Fix warning in mpparse.c Fix linux/arch/i386/kernel/mpparse.c: In function #MP_bus_info#: linux/arch/i386/kernel/mpparse.c:232: warning: comparison is always false due to limited range of data type Signed-off-by: Andi Kleen --- arch/i386/kernel/mpparse.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c index bdaa75a5bc1c..772a4233bfe6 100644 --- a/arch/i386/kernel/mpparse.c +++ b/arch/i386/kernel/mpparse.c @@ -229,12 +229,14 @@ static void __init MP_bus_info (struct mpc_config_bus *m) mpc_oem_bus_info(m, str, translation_table[mpc_record]); +#if MAX_MP_BUSSES < 256 if (m->mpc_busid >= MAX_MP_BUSSES) { printk(KERN_WARNING "MP table busid value (%d) for bustype %s " " is too large, max. supported is %d\n", m->mpc_busid, str, MAX_MP_BUSSES - 1); return; } +#endif if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) { mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; -- cgit v1.2.1 From 273819a2d982faace30e587b86a0683882251fe7 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:35 +0200 Subject: [PATCH] make fault notifier unconditional and export it It's needed for external debuggers and overhead is very small. Also make the actual notifier chain they use static Cc: jbeulich@novell.com Signed-off-by: Andi Kleen --- arch/x86_64/mm/fault.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index 449a437f5d70..3ae3e7bb2fce 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c @@ -40,8 +40,7 @@ #define PF_RSVD (1<<3) #define PF_INSTR (1<<4) -#ifdef CONFIG_KPROBES -ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain); +static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain); /* Hook to register for page fault notifications */ int register_page_fault_notifier(struct notifier_block *nb) @@ -49,11 +48,13 @@ int register_page_fault_notifier(struct notifier_block *nb) vmalloc_sync_all(); return atomic_notifier_chain_register(¬ify_page_fault_chain, nb); } +EXPORT_SYMBOL_GPL(register_page_fault_notifier); int unregister_page_fault_notifier(struct notifier_block *nb) { return atomic_notifier_chain_unregister(¬ify_page_fault_chain, nb); } +EXPORT_SYMBOL_GPL(unregister_page_fault_notifier); static inline int notify_page_fault(enum die_val val, const char *str, struct pt_regs *regs, long err, int trap, int sig) @@ -67,13 +68,6 @@ static inline int notify_page_fault(enum die_val val, const char *str, }; return atomic_notifier_call_chain(¬ify_page_fault_chain, val, &args); } -#else -static inline int notify_page_fault(enum die_val val, const char *str, - struct pt_regs *regs, long err, int trap, int sig) -{ - return NOTIFY_DONE; -} -#endif void bust_spinlocks(int yes) { -- cgit v1.2.1 From 474c256841074b913e76e392082373e12103a75d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:35 +0200 Subject: [PATCH] i386: make fault notifier unconditional and export it It's needed for external debuggers and overhead is very small. Also make the actual notifier chain they use static Cc: jbeulich@novell.com Signed-off-by: Andi Kleen --- arch/i386/mm/fault.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c index f7279468323a..0ce86168a0b1 100644 --- a/arch/i386/mm/fault.c +++ b/arch/i386/mm/fault.c @@ -30,18 +30,20 @@ extern void die(const char *,struct pt_regs *,long); -#ifdef CONFIG_KPROBES -ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain); +static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain); + int register_page_fault_notifier(struct notifier_block *nb) { vmalloc_sync_all(); return atomic_notifier_chain_register(¬ify_page_fault_chain, nb); } +EXPORT_SYMBOL_GPL(register_page_fault_notifier); int unregister_page_fault_notifier(struct notifier_block *nb) { return atomic_notifier_chain_unregister(¬ify_page_fault_chain, nb); } +EXPORT_SYMBOL_GPL(unregister_page_fault_notifier); static inline int notify_page_fault(enum die_val val, const char *str, struct pt_regs *regs, long err, int trap, int sig) @@ -55,14 +57,6 @@ static inline int notify_page_fault(enum die_val val, const char *str, }; return atomic_notifier_call_chain(¬ify_page_fault_chain, val, &args); } -#else -static inline int notify_page_fault(enum die_val val, const char *str, - struct pt_regs *regs, long err, int trap, int sig) -{ - return NOTIFY_DONE; -} -#endif - /* * Unlock any spinlocks which will prevent us from getting the -- cgit v1.2.1 From e8924acb2ef46b96c93f97025815ef3843cb67a2 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 26 Sep 2006 10:52:35 +0200 Subject: [PATCH] i386: Make acpi_force static acpi_force can become static. Cc: len.brown@intel.com Signed-off-by: Adrian Bunk Signed-off-by: Andi Kleen --- arch/i386/kernel/acpi/boot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c index c809a3f4c0bf..0fba420d668a 100644 --- a/arch/i386/kernel/acpi/boot.c +++ b/arch/i386/kernel/acpi/boot.c @@ -36,7 +36,7 @@ #include #include -int __initdata acpi_force = 0; +static int __initdata acpi_force = 0; #ifdef CONFIG_ACPI int acpi_disabled = 0; -- cgit v1.2.1 From 3d08a256da8aed5300bd0752200ece426f49b050 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 26 Sep 2006 10:52:35 +0200 Subject: [PATCH] i386: Make enable_local_apic static enable_local_apic can now become static. Cc: len.brown@intel.com Signed-off-by: Adrian Bunk Signed-off-by: Andi Kleen --- arch/i386/kernel/apic.c | 13 ++++++++++++- include/asm-i386/apic.h | 12 ------------ 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index ce75e71b694f..90faae5c5d30 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -52,7 +52,18 @@ static cpumask_t timer_bcast_ipi; /* * Knob to control our willingness to enable the local APIC. */ -int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */ +static int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */ + +static inline void lapic_disable(void) +{ + enable_local_apic = -1; + clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); +} + +static inline void lapic_enable(void) +{ + enable_local_apic = 1; +} /* * Debug level diff --git a/include/asm-i386/apic.h b/include/asm-i386/apic.h index 8c5331ceca7b..3a42b7d6fc92 100644 --- a/include/asm-i386/apic.h +++ b/include/asm-i386/apic.h @@ -16,20 +16,8 @@ #define APIC_VERBOSE 1 #define APIC_DEBUG 2 -extern int enable_local_apic; extern int apic_verbosity; -static inline void lapic_disable(void) -{ - enable_local_apic = -1; - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); -} - -static inline void lapic_enable(void) -{ - enable_local_apic = 1; -} - /* * Define the default level of output to be very little * This can be turned up by using apic=verbose for more -- cgit v1.2.1 From 02ba1a32dbd3d406530a17a2643a8f0f8cbf3acc Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:35 +0200 Subject: [PATCH] i386: move kernel_thread_helper into entry.S And add proper CFI annotation to it which was previously impossible. This prevents "stuck" messages by the dwarf2 unwinder when reaching the top of a kernel stack. Includes feedback from Jan Beulich Cc: jbeulich@novell.com Signed-off-by: Andi Kleen --- arch/i386/kernel/entry.S | 13 +++++++++++++ arch/i386/kernel/process.c | 9 --------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index ba22ec8fab54..dede506e5bd0 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -950,6 +950,19 @@ ENTRY(arch_unwind_init_running) ENDPROC(arch_unwind_init_running) #endif +ENTRY(kernel_thread_helper) + pushl $0 # fake return address for unwinder + CFI_STARTPROC + movl %edx,%eax + push %edx + CFI_ADJUST_CFA_OFFSET 4 + call *%ebx + push %eax + CFI_ADJUST_CFA_OFFSET 4 + call do_exit + CFI_ENDPROC +ENDPROC(kernel_thread_helper) + .section .rodata,"a" #include "syscall_table.S" diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index b741c3e1a5eb..220aeca59c3a 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -321,15 +321,6 @@ void show_regs(struct pt_regs * regs) * the "args". */ extern void kernel_thread_helper(void); -__asm__(".section .text\n" - ".align 4\n" - "kernel_thread_helper:\n\t" - "movl %edx,%eax\n\t" - "pushl %edx\n\t" - "call *%ebx\n\t" - "pushl %eax\n\t" - "call do_exit\n" - ".previous"); /* * Create a kernel thread -- cgit v1.2.1 From 522e93e3fcdbf00ba85c72fde6df28cfc0486a65 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 26 Sep 2006 10:52:35 +0200 Subject: [PATCH] i386: Descriptor and trap table cleanups. The implementation comes from Zach's [RFC, PATCH 10/24] i386 Vmi descriptor changes: Descriptor and trap table cleanups. Add cleanly written accessors for IDT and GDT gates so the subarch may override them. Note that this allows the hypervisor to transparently tweak the DPL of the descriptors as well as the RPL of segments in those descriptors, with no unnecessary kernel code modification. It also allows the hypervisor implementation of the VMI to tweak the gates, allowing for custom exception frames or extra layers of indirection above the guest fault / IRQ handlers. Signed-off-by: Zachary Amsden Signed-off-by: Rusty Russell Signed-off-by: Andi Kleen --- arch/i386/kernel/traps.c | 24 ++-------- include/asm-i386/desc.h | 121 +++++++++++++++++++++++++++++------------------ 2 files changed, 81 insertions(+), 64 deletions(-) diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index bdf949c30c7c..00d643f3de41 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -1165,20 +1165,6 @@ void __init trap_init_f00f_bug(void) } #endif -#define _set_gate(gate_addr,type,dpl,addr,seg) \ -do { \ - int __d0, __d1; \ - __asm__ __volatile__ ("movw %%dx,%%ax\n\t" \ - "movw %4,%%dx\n\t" \ - "movl %%eax,%0\n\t" \ - "movl %%edx,%1" \ - :"=m" (*((long *) (gate_addr))), \ - "=m" (*(1+(long *) (gate_addr))), "=&a" (__d0), "=&d" (__d1) \ - :"i" ((short) (0x8000+(dpl<<13)+(type<<8))), \ - "3" ((char *) (addr)),"2" ((seg) << 16)); \ -} while (0) - - /* * This needs to use 'idt_table' rather than 'idt', and * thus use the _nonmapped_ version of the IDT, as the @@ -1187,7 +1173,7 @@ do { \ */ void set_intr_gate(unsigned int n, void *addr) { - _set_gate(idt_table+n,14,0,addr,__KERNEL_CS); + _set_gate(n, DESCTYPE_INT, addr, __KERNEL_CS); } /* @@ -1195,22 +1181,22 @@ void set_intr_gate(unsigned int n, void *addr) */ static inline void set_system_intr_gate(unsigned int n, void *addr) { - _set_gate(idt_table+n, 14, 3, addr, __KERNEL_CS); + _set_gate(n, DESCTYPE_INT | DESCTYPE_DPL3, addr, __KERNEL_CS); } static void __init set_trap_gate(unsigned int n, void *addr) { - _set_gate(idt_table+n,15,0,addr,__KERNEL_CS); + _set_gate(n, DESCTYPE_TRAP, addr, __KERNEL_CS); } static void __init set_system_gate(unsigned int n, void *addr) { - _set_gate(idt_table+n,15,3,addr,__KERNEL_CS); + _set_gate(n, DESCTYPE_TRAP | DESCTYPE_DPL3, addr, __KERNEL_CS); } static void __init set_task_gate(unsigned int n, unsigned int gdt_entry) { - _set_gate(idt_table+n,5,0,0,(gdt_entry<<3)); + _set_gate(n, DESCTYPE_TASK, (void *)0, (gdt_entry<<3)); } diff --git a/include/asm-i386/desc.h b/include/asm-i386/desc.h index 89b8b82c82b3..5db9e96e8dc1 100644 --- a/include/asm-i386/desc.h +++ b/include/asm-i386/desc.h @@ -33,50 +33,99 @@ static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address; } +/* + * This is the ldt that every process will get unless we need + * something other than this. + */ +extern struct desc_struct default_ldt[]; +extern struct desc_struct idt_table[]; +extern void set_intr_gate(unsigned int irq, void * addr); + +static inline void pack_descriptor(__u32 *a, __u32 *b, + unsigned long base, unsigned long limit, unsigned char type, unsigned char flags) +{ + *a = ((base & 0xffff) << 16) | (limit & 0xffff); + *b = (base & 0xff000000) | ((base & 0xff0000) >> 16) | + ((type & 0xff) << 8) | ((flags & 0xf) << 12); +} + +static inline void pack_gate(__u32 *a, __u32 *b, + unsigned long base, unsigned short seg, unsigned char type, unsigned char flags) +{ + *a = (seg << 16) | (base & 0xffff); + *b = (base & 0xffff0000) | ((type & 0xff) << 8) | (flags & 0xff); +} + +#define DESCTYPE_LDT 0x82 /* present, system, DPL-0, LDT */ +#define DESCTYPE_TSS 0x89 /* present, system, DPL-0, 32-bit TSS */ +#define DESCTYPE_TASK 0x85 /* present, system, DPL-0, task gate */ +#define DESCTYPE_INT 0x8e /* present, system, DPL-0, interrupt gate */ +#define DESCTYPE_TRAP 0x8f /* present, system, DPL-0, trap gate */ +#define DESCTYPE_DPL3 0x60 /* DPL-3 */ +#define DESCTYPE_S 0x10 /* !system */ + #define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8)) #define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8)) #define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr)) #define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr)) -#define load_tr(tr) __asm__ __volatile("ltr %0"::"mr" (tr)) -#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"mr" (ldt)) +#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr)) +#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt)) #define store_gdt(dtr) __asm__ ("sgdt %0":"=m" (*dtr)) #define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr)) -#define store_tr(tr) __asm__ ("str %0":"=mr" (tr)) -#define store_ldt(ldt) __asm__ ("sldt %0":"=mr" (ldt)) +#define store_tr(tr) __asm__ ("str %0":"=m" (tr)) +#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt)) -/* - * This is the ldt that every process will get unless we need - * something other than this. - */ -extern struct desc_struct default_ldt[]; -extern void set_intr_gate(unsigned int irq, void * addr); +#if TLS_SIZE != 24 +# error update this code. +#endif -#define _set_tssldt_desc(n,addr,limit,type) \ -__asm__ __volatile__ ("movw %w3,0(%2)\n\t" \ - "movw %w1,2(%2)\n\t" \ - "rorl $16,%1\n\t" \ - "movb %b1,4(%2)\n\t" \ - "movb %4,5(%2)\n\t" \ - "movb $0,6(%2)\n\t" \ - "movb %h1,7(%2)\n\t" \ - "rorl $16,%1" \ - : "=m"(*(n)) : "q" (addr), "r"(n), "ir"(limit), "i"(type)) - -static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, void *addr) +static inline void load_TLS(struct thread_struct *t, unsigned int cpu) { - _set_tssldt_desc(&get_cpu_gdt_table(cpu)[entry], (int)addr, - offsetof(struct tss_struct, __cacheline_filler) - 1, 0x89); +#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i] + C(0); C(1); C(2); +#undef C } -#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr) +static inline void write_dt_entry(void *dt, int entry, __u32 entry_a, __u32 entry_b) +{ + __u32 *lp = (__u32 *)((char *)dt + entry*8); + *lp = entry_a; + *(lp+1) = entry_b; +} + +#define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b) +#define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b) +#define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b) + +static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg) +{ + __u32 a, b; + pack_gate(&a, &b, (unsigned long)addr, seg, type, 0); + write_idt_entry(idt_table, gate, a, b); +} -static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int size) +static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, const void *addr) { - _set_tssldt_desc(&get_cpu_gdt_table(cpu)[GDT_ENTRY_LDT], (int)addr, ((size << 3)-1), 0x82); + __u32 a, b; + pack_descriptor(&a, &b, (unsigned long)addr, + offsetof(struct tss_struct, __cacheline_filler) - 1, + DESCTYPE_TSS, 0); + write_gdt_entry(get_cpu_gdt_table(cpu), entry, a, b); } +static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int entries) +{ + __u32 a, b; + pack_descriptor(&a, &b, (unsigned long)addr, + entries * sizeof(struct desc_struct) - 1, + DESCTYPE_LDT, 0); + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b); +} + +#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr) + #define LDT_entry_a(info) \ ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff)) @@ -102,24 +151,6 @@ static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int size) (info)->seg_not_present == 1 && \ (info)->useable == 0 ) -static inline void write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b) -{ - __u32 *lp = (__u32 *)((char *)ldt + entry*8); - *lp = entry_a; - *(lp+1) = entry_b; -} - -#if TLS_SIZE != 24 -# error update this code. -#endif - -static inline void load_TLS(struct thread_struct *t, unsigned int cpu) -{ -#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i] - C(0); C(1); C(2); -#undef C -} - static inline void clear_LDT(void) { int cpu = get_cpu(); -- cgit v1.2.1 From ba4d40bb5c465f0a4dcc30d02dab80c2cb7e1ff3 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 26 Sep 2006 10:52:35 +0200 Subject: [PATCH] Auto size the per cpu area. Now for a completely different but trivial approach. I just boot tested it with 255 CPUS and everything worked. Currently everything (except module data) we place in the per cpu area we know about at compile time. So instead of allocating a fixed size for the per_cpu area allocate the number of bytes we need plus a fixed constant for to be used for modules. It isn't perfect but it is much less of a pain to work with than what we are doing now. AK: fixed warning Signed-off-by: Eric W. Biederman Signed-off-by: Andi Kleen --- arch/x86_64/kernel/setup64.c | 7 ++----- include/asm-x86_64/percpu.h | 10 ++++++++++ 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index b09e60fa96b4..e85cfbb49b63 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -95,12 +95,9 @@ void __init setup_per_cpu_areas(void) #endif /* Copy section for each CPU (we discard the original) */ - size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES); -#ifdef CONFIG_MODULES - if (size < PERCPU_ENOUGH_ROOM) - size = PERCPU_ENOUGH_ROOM; -#endif + size = PERCPU_ENOUGH_ROOM; + printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size); for_each_cpu_mask (i, cpu_possible_map) { char *ptr; diff --git a/include/asm-x86_64/percpu.h b/include/asm-x86_64/percpu.h index 08dd9f9dda81..39d2bab9b520 100644 --- a/include/asm-x86_64/percpu.h +++ b/include/asm-x86_64/percpu.h @@ -11,6 +11,16 @@ #include +#ifdef CONFIG_MODULES +# define PERCPU_MODULE_RESERVE 8192 +#else +# define PERCPU_MODULE_RESERVE 0 +#endif + +#define PERCPU_ENOUGH_ROOM \ + (ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES) + \ + PERCPU_MODULE_RESERVE) + #define __per_cpu_offset(cpu) (cpu_pda(cpu)->data_offset) #define __my_cpu_offset() read_pda(data_offset) -- cgit v1.2.1 From aada06c9b7f4cdedbeb2bc905893cf1923a0abad Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 26 Sep 2006 10:52:35 +0200 Subject: [PATCH] i386: clean up topology.c There is no need to duplicate the topology_init() function. Signed-off-by: Magnus Damm Signed-off-by: Andi Kleen --- arch/i386/kernel/topology.c | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/arch/i386/kernel/topology.c b/arch/i386/kernel/topology.c index e2e281d4bcc8..07d6da36a825 100644 --- a/arch/i386/kernel/topology.c +++ b/arch/i386/kernel/topology.c @@ -28,6 +28,7 @@ #include #include #include +#include #include static struct i386_cpu cpu_devices[NR_CPUS]; @@ -55,34 +56,18 @@ EXPORT_SYMBOL(arch_register_cpu); EXPORT_SYMBOL(arch_unregister_cpu); #endif /*CONFIG_HOTPLUG_CPU*/ - - -#ifdef CONFIG_NUMA -#include - static int __init topology_init(void) { int i; +#ifdef CONFIG_NUMA for_each_online_node(i) register_one_node(i); +#endif /* CONFIG_NUMA */ for_each_present_cpu(i) arch_register_cpu(i); return 0; } -#else /* !CONFIG_NUMA */ - -static int __init topology_init(void) -{ - int i; - - for_each_present_cpu(i) - arch_register_cpu(i); - return 0; -} - -#endif /* CONFIG_NUMA */ - subsys_initcall(topology_init); -- cgit v1.2.1 From c0d83745cc67ed71a08c14739a0b286d0239b1e2 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 26 Sep 2006 10:52:35 +0200 Subject: [PATCH] i386: mark two more functions as __init cyrix_identify() should be __init because transmeta_identify() is. tsc_init() is only called from setup_arch() which is marked as __init. These two section mismatches have been detected using running modpost on a vmlinux image compiled with CONFIG_RELOCATABLE=y. Signed-off-by: Magnus Damm Signed-off-by: Andi Kleen --- arch/i386/kernel/tsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c index 7e0d8dab2075..b8fa0a8b2e47 100644 --- a/arch/i386/kernel/tsc.c +++ b/arch/i386/kernel/tsc.c @@ -192,7 +192,7 @@ int recalibrate_cpu_khz(void) EXPORT_SYMBOL(recalibrate_cpu_khz); -void tsc_init(void) +void __init tsc_init(void) { if (!cpu_has_tsc || tsc_disable) return; -- cgit v1.2.1 From 40bee2ee73c745922e9b2d5595c46f19d1cf1b6f Mon Sep 17 00:00:00 2001 From: Brice Goglin Date: Tue, 26 Sep 2006 10:52:35 +0200 Subject: [PATCH] fix bus numbering format in mmconfig warning Make an mmconfig warning print the bus id with a regular format. Signed-off-by: Brice Goglin Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton --- arch/x86_64/pci/mmconfig.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c index 3c55c76c6fd5..dd78f4d18df1 100644 --- a/arch/x86_64/pci/mmconfig.c +++ b/arch/x86_64/pci/mmconfig.c @@ -156,9 +156,8 @@ static __init void unreachable_devices(void) addr = pci_dev_base(0, k, PCI_DEVFN(i, 0)); if (addr == NULL|| readl(addr) != val1) { set_bit(i + 32*k, fallback_slots); - printk(KERN_NOTICE - "PCI: No mmconfig possible on device %x:%x\n", - k, i); + printk(KERN_NOTICE "PCI: No mmconfig possible" + " on device %02x:%02x\n", k, i); } } } -- cgit v1.2.1 From 73fea175303926055440c06bc8894f0c5c58afc8 Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Tue, 26 Sep 2006 10:52:35 +0200 Subject: [PATCH] i386: Support physical cpu hotplug for x86_64 This patch enables ACPI based physical CPU hotplug support for x86_64. Implements acpi_map_lsapic() and acpi_unmap_lsapic() to support physical cpu hotplug. Signed-off-by: Ashok Raj Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: "Brown, Len" Signed-off-by: Andrew Morton --- arch/i386/kernel/acpi/boot.c | 69 +++++++++++++++++++++++++++++++++++++++++--- arch/i386/kernel/mpparse.c | 2 +- include/asm-i386/smp.h | 1 + 3 files changed, 67 insertions(+), 5 deletions(-) diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c index 0fba420d668a..8a6c5b412348 100644 --- a/arch/i386/kernel/acpi/boot.c +++ b/arch/i386/kernel/acpi/boot.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -512,16 +513,76 @@ EXPORT_SYMBOL(acpi_register_gsi); #ifdef CONFIG_ACPI_HOTPLUG_CPU int acpi_map_lsapic(acpi_handle handle, int *pcpu) { - /* TBD */ - return -EINVAL; + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + union acpi_object *obj; + struct acpi_table_lapic *lapic; + cpumask_t tmp_map, new_map; + u8 physid; + int cpu; + + if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer))) + return -EINVAL; + + if (!buffer.length || !buffer.pointer) + return -EINVAL; + + obj = buffer.pointer; + if (obj->type != ACPI_TYPE_BUFFER || + obj->buffer.length < sizeof(*lapic)) { + kfree(buffer.pointer); + return -EINVAL; + } + + lapic = (struct acpi_table_lapic *)obj->buffer.pointer; + + if ((lapic->header.type != ACPI_MADT_LAPIC) || + (!lapic->flags.enabled)) { + kfree(buffer.pointer); + return -EINVAL; + } + + physid = lapic->id; + + kfree(buffer.pointer); + buffer.length = ACPI_ALLOCATE_BUFFER; + buffer.pointer = NULL; + + tmp_map = cpu_present_map; + mp_register_lapic(physid, lapic->flags.enabled); + + /* + * If mp_register_lapic successfully generates a new logical cpu + * number, then the following will get us exactly what was mapped + */ + cpus_andnot(new_map, cpu_present_map, tmp_map); + if (cpus_empty(new_map)) { + printk ("Unable to map lapic to logical cpu number\n"); + return -EINVAL; + } + + cpu = first_cpu(new_map); + + *pcpu = cpu; + return 0; } EXPORT_SYMBOL(acpi_map_lsapic); int acpi_unmap_lsapic(int cpu) { - /* TBD */ - return -EINVAL; + int i; + + for_each_possible_cpu(i) { + if (x86_acpiid_to_apicid[i] == x86_cpu_to_apicid[cpu]) { + x86_acpiid_to_apicid[i] = -1; + break; + } + } + x86_cpu_to_apicid[cpu] = -1; + cpu_clear(cpu, cpu_present_map); + num_processors--; + + return (0); } EXPORT_SYMBOL(acpi_unmap_lsapic); diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c index 772a4233bfe6..442aaf8c77eb 100644 --- a/arch/i386/kernel/mpparse.c +++ b/arch/i386/kernel/mpparse.c @@ -69,7 +69,7 @@ unsigned int def_to_bigsmp = 0; /* Processor that is doing the boot up */ unsigned int boot_cpu_physical_apicid = -1U; /* Internal processor count */ -static unsigned int __devinitdata num_processors; +unsigned int __cpuinitdata num_processors; /* Bitmask of physically existing CPUs */ physid_mask_t phys_cpu_present_map; diff --git a/include/asm-i386/smp.h b/include/asm-i386/smp.h index f87826039a51..32ac8c91d5c5 100644 --- a/include/asm-i386/smp.h +++ b/include/asm-i386/smp.h @@ -84,6 +84,7 @@ static inline int hard_smp_processor_id(void) extern int __cpu_disable(void); extern void __cpu_die(unsigned int cpu); +extern unsigned int num_processors; #endif /* !__ASSEMBLY__ */ -- cgit v1.2.1 From e07e23e1fd3000289fc7ccc6c71879070d3b19e0 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Tue, 26 Sep 2006 10:52:36 +0200 Subject: [PATCH] non lazy "sleazy" fpu implementation Right now the kernel on x86-64 has a 100% lazy fpu behavior: after *every* context switch a trap is taken for the first FPU use to restore the FPU context lazily. This is of course great for applications that have very sporadic or no FPU use (since then you avoid doing the expensive save/restore all the time). However for very frequent FPU users... you take an extra trap every context switch. The patch below adds a simple heuristic to this code: After 5 consecutive context switches of FPU use, the lazy behavior is disabled and the context gets restored every context switch. If the app indeed uses the FPU, the trap is avoided. (the chance of the 6th time slice using FPU after the previous 5 having done so are quite high obviously). After 256 switches, this is reset and lazy behavior is returned (until there are 5 consecutive ones again). The reason for this is to give apps that do longer bursts of FPU use still the lazy behavior back after some time. [akpm@osdl.org: place new task_struct field next to jit_keyring to save space] Signed-off-by: Arjan van de Ven Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/x86_64/kernel/process.c | 10 ++++++++++ arch/x86_64/kernel/traps.c | 1 + include/asm-x86_64/i387.h | 5 ++++- include/linux/sched.h | 9 +++++++++ 4 files changed, 24 insertions(+), 1 deletion(-) diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index 6fbd19564e4e..9e9a70e50c72 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -552,6 +552,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); + /* we're going to use this soon, after a few expensive things */ + if (next_p->fpu_counter>5) + prefetch(&next->i387.fxsave); + /* * Reload esp0, LDT and the page table pointer: */ @@ -629,6 +633,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) __switch_to_xtra(prev_p, next_p, tss); + /* If the task has used fpu the last 5 timeslices, just do a full + * restore of the math state immediately to avoid the trap; the + * chances of needing FPU soon are obviously high now + */ + if (next_p->fpu_counter>5) + math_state_restore(); return prev_p; } diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 28e53342f294..ffc40cff1e07 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -1136,6 +1136,7 @@ asmlinkage void math_state_restore(void) init_fpu(me); restore_fpu_checking(&me->thread.i387.fxsave); task_thread_info(me)->status |= TS_USEDFPU; + me->fpu_counter++; } void __init trap_init(void) diff --git a/include/asm-x86_64/i387.h b/include/asm-x86_64/i387.h index cba8a3b0cded..60c0f4853fdb 100644 --- a/include/asm-x86_64/i387.h +++ b/include/asm-x86_64/i387.h @@ -24,6 +24,7 @@ extern unsigned int mxcsr_feature_mask; extern void mxcsr_feature_mask_init(void); extern void init_fpu(struct task_struct *child); extern int save_i387(struct _fpstate __user *buf); +extern asmlinkage void math_state_restore(void); /* * FPU lazy state save handling... @@ -31,7 +32,9 @@ extern int save_i387(struct _fpstate __user *buf); #define unlazy_fpu(tsk) do { \ if (task_thread_info(tsk)->status & TS_USEDFPU) \ - save_init_fpu(tsk); \ + save_init_fpu(tsk); \ + else \ + tsk->fpu_counter = 0; \ } while (0) /* Ignore delayed exceptions from user space */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 34ed0d99b1bd..807556c5bcd2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -865,6 +865,15 @@ struct task_struct { struct key *thread_keyring; /* keyring private to this thread */ unsigned char jit_keyring; /* default keyring to attach requested keys to */ #endif + /* + * fpu_counter contains the number of consecutive context switches + * that the FPU is used. If this is over a threshold, the lazy fpu + * saving becomes unlazy to save the trap. This is an unsigned char + * so that after 256 times the counter wraps and the behavior turns + * lazy again; this to deal with bursty apps that only use FPU for + * a short time + */ + unsigned char fpu_counter; int oomkilladj; /* OOM kill score adjustment (bit shift). */ char comm[TASK_COMM_LEN]; /* executable name excluding path - access with [gs]et_task_comm (which lock -- cgit v1.2.1 From abf0f10948b316b577851ef21c728341f1046552 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 26 Sep 2006 10:52:36 +0200 Subject: [PATCH] wire up oops_enter()/oops_exit() Implement pause_on_oops() on x86_64. AK: I redid the patch to do the oops_enter/exit in the existing oops_begin()/end(). This makes it much shorter. Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/x86_64/kernel/traps.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index ffc40cff1e07..34660b1e2720 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -561,6 +561,8 @@ unsigned __kprobes long oops_begin(void) int cpu = safe_smp_processor_id(); unsigned long flags; + oops_enter(); + /* racy, but better than risking deadlock. */ local_irq_save(flags); if (!spin_trylock(&die_lock)) { @@ -589,6 +591,7 @@ void __kprobes oops_end(unsigned long flags) spin_unlock_irqrestore(&die_lock, flags); if (panic_on_oops) panic("Fatal exception"); + oops_exit(); } void __kprobes __die(const char * str, struct pt_regs * regs, long err) -- cgit v1.2.1 From 6ad916581181a105d7832a7dec9e1eb58f7a1621 Mon Sep 17 00:00:00 2001 From: Keith Mannthey Date: Tue, 26 Sep 2006 10:52:36 +0200 Subject: [PATCH] x86_64 kernel mapping fix Fix for the x86_64 kernel mapping code. Without this patch the update path only inits one pmd_page worth of memory and tramples any entries on it. now the calling convention to phys_pmd_init and phys_init is to always pass a [pmd/pud] page not an offset within a page. Signed-off-by: Keith Mannthey Signed-off-by: Andi Kleen Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton --- arch/x86_64/mm/init.c | 51 ++++++++++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index d40134bd6399..984155b75e4c 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -250,12 +250,13 @@ __init void early_iounmap(void *addr, unsigned long size) } static void __meminit -phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end) +phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) { - int i; + int i = pmd_index(address); - for (i = 0; i < PTRS_PER_PMD; pmd++, i++, address += PMD_SIZE) { + for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) { unsigned long entry; + pmd_t *pmd = pmd_page + pmd_index(address); if (address >= end) { if (!after_bootmem) @@ -263,6 +264,10 @@ phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end) set_pmd(pmd, __pmd(0)); break; } + + if (pmd_val(*pmd)) + continue; + entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address; entry &= __supported_pte_mask; set_pmd(pmd, __pmd(entry)); @@ -272,45 +277,41 @@ phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end) static void __meminit phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) { - pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address)); - - if (pmd_none(*pmd)) { - spin_lock(&init_mm.page_table_lock); - phys_pmd_init(pmd, address, end); - spin_unlock(&init_mm.page_table_lock); - __flush_tlb_all(); - } + pmd_t *pmd = pmd_offset(pud,0); + spin_lock(&init_mm.page_table_lock); + phys_pmd_init(pmd, address, end); + spin_unlock(&init_mm.page_table_lock); + __flush_tlb_all(); } -static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) +static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) { - long i = pud_index(address); - - pud = pud + i; + int i = pud_index(addr); - if (after_bootmem && pud_val(*pud)) { - phys_pmd_update(pud, address, end); - return; - } - for (; i < PTRS_PER_PUD; pud++, i++) { + for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) { int map; - unsigned long paddr, pmd_phys; + unsigned long pmd_phys; + pud_t *pud = pud_page + pud_index(addr); pmd_t *pmd; - paddr = (address & PGDIR_MASK) + i*PUD_SIZE; - if (paddr >= end) + if (addr >= end) break; - if (!after_bootmem && !e820_any_mapped(paddr, paddr+PUD_SIZE, 0)) { + if (!after_bootmem && !e820_any_mapped(addr,addr+PUD_SIZE,0)) { set_pud(pud, __pud(0)); continue; } + if (pud_val(*pud)) { + phys_pmd_update(pud, addr, end); + continue; + } + pmd = alloc_low_page(&map, &pmd_phys); spin_lock(&init_mm.page_table_lock); set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); - phys_pmd_init(pmd, paddr, end); + phys_pmd_init(pmd, addr, end); spin_unlock(&init_mm.page_table_lock); unmap_low_page(map); } -- cgit v1.2.1 From 68bbc172cd1b0ee01814304b8a7bef8922d5fdca Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 26 Sep 2006 10:52:36 +0200 Subject: [PATCH] i386: remove redundant generic_identify() calls when identifying cpus cpu_dev->c_identify is only called from arch/i386/common.c:identify_cpu(), and this after generic_identify() already has been called. There is no need to call this function twice and hook it in c_identify - but I may be wrong, please double check before applying. This patch also removes generic_identify() from cpu.h to avoid unnecessary future nesting. Signed-off-by: Magnus Damm Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/amd.c | 1 - arch/i386/kernel/cpu/common.c | 2 +- arch/i386/kernel/cpu/cpu.h | 2 -- arch/i386/kernel/cpu/cyrix.c | 2 -- arch/i386/kernel/cpu/intel.c | 1 - arch/i386/kernel/cpu/nexgen.c | 1 - arch/i386/kernel/cpu/transmeta.c | 1 - 7 files changed, 1 insertion(+), 9 deletions(-) diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c index e6a2d6b80cda..d6e7778bac70 100644 --- a/arch/i386/kernel/cpu/amd.c +++ b/arch/i386/kernel/cpu/amd.c @@ -275,7 +275,6 @@ static struct cpu_dev amd_cpu_dev __initdata = { }, }, .c_init = init_amd, - .c_identify = generic_identify, .c_size_cache = amd_size_cache, }; diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index 730a7ab75009..24ac51a02378 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -265,7 +265,7 @@ static void __init early_cpu_detect(void) } } -void __cpuinit generic_identify(struct cpuinfo_x86 * c) +static void __cpuinit generic_identify(struct cpuinfo_x86 * c) { u32 tfms, xlvl; int ebx; diff --git a/arch/i386/kernel/cpu/cpu.h b/arch/i386/kernel/cpu/cpu.h index 5a1d4f163e84..2f6432cef6ff 100644 --- a/arch/i386/kernel/cpu/cpu.h +++ b/arch/i386/kernel/cpu/cpu.h @@ -24,7 +24,5 @@ extern struct cpu_dev * cpu_devs [X86_VENDOR_NUM]; extern int get_model_name(struct cpuinfo_x86 *c); extern void display_cacheinfo(struct cpuinfo_x86 *c); -extern void generic_identify(struct cpuinfo_x86 * c); - extern void early_intel_workaround(struct cpuinfo_x86 *c); diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c index f03b7f94c304..bb02ed1fe560 100644 --- a/arch/i386/kernel/cpu/cyrix.c +++ b/arch/i386/kernel/cpu/cyrix.c @@ -427,7 +427,6 @@ static void cyrix_identify(struct cpuinfo_x86 * c) local_irq_restore(flags); } } - generic_identify(c); } static struct cpu_dev cyrix_cpu_dev __initdata = { @@ -457,7 +456,6 @@ static struct cpu_dev nsc_cpu_dev __initdata = { .c_vendor = "NSC", .c_ident = { "Geode by NSC" }, .c_init = init_nsc, - .c_identify = generic_identify, }; int __init nsc_init_cpu(void) diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c index 5a2e270924b1..26243e019d11 100644 --- a/arch/i386/kernel/cpu/intel.c +++ b/arch/i386/kernel/cpu/intel.c @@ -263,7 +263,6 @@ static struct cpu_dev intel_cpu_dev __cpuinitdata = { }, }, .c_init = init_intel, - .c_identify = generic_identify, .c_size_cache = intel_size_cache, }; diff --git a/arch/i386/kernel/cpu/nexgen.c b/arch/i386/kernel/cpu/nexgen.c index ad87fa58058d..df188bfd759b 100644 --- a/arch/i386/kernel/cpu/nexgen.c +++ b/arch/i386/kernel/cpu/nexgen.c @@ -38,7 +38,6 @@ static void __init nexgen_identify(struct cpuinfo_x86 * c) if ( deep_magic_nexgen_probe() ) { strcpy(c->x86_vendor_id, "NexGenDriven"); } - generic_identify(c); } static struct cpu_dev nexgen_cpu_dev __initdata = { diff --git a/arch/i386/kernel/cpu/transmeta.c b/arch/i386/kernel/cpu/transmeta.c index 7214c9b577ab..4027c369bc90 100644 --- a/arch/i386/kernel/cpu/transmeta.c +++ b/arch/i386/kernel/cpu/transmeta.c @@ -88,7 +88,6 @@ static void __init init_transmeta(struct cpuinfo_x86 *c) static void __init transmeta_identify(struct cpuinfo_x86 * c) { u32 xlvl; - generic_identify(c); /* Transmeta-defined flags: level 0x80860001 */ xlvl = cpuid_eax(0x80860000); -- cgit v1.2.1 From ed77504b2007ff7ce56841227467ac3ead52df62 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 26 Sep 2006 10:52:36 +0200 Subject: [PATCH] mark init_amd() as __cpuinit The init_amd() function is only called from identify_cpu() which is already marked as __cpuinit. So let's mark it as __cpuinit. Signed-off-by: Magnus Damm Signed-off-by: Andi Kleen --- arch/x86_64/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 4be29832e3dc..e328e3eb8cbe 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -672,7 +672,7 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) #endif } -static void __init init_amd(struct cpuinfo_x86 *c) +static void __cpuinit init_amd(struct cpuinfo_x86 *c) { unsigned level; -- cgit v1.2.1 From 95414930548871c6c92a5b0e607b12b81f3d84d8 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 26 Sep 2006 10:52:36 +0200 Subject: [PATCH] i386: mark cpu_dev structures as __cpuinitdata The different cpu_dev structures are all used from __cpuinit callers what I can tell. So mark them as __cpuinitdata instead of __initdata. I am a little bit unsure about arch/i386/common.c:default_cpu, especially when it comes to the purpose of this_cpu. Signed-off-by: Magnus Damm Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/amd.c | 2 +- arch/i386/kernel/cpu/centaur.c | 2 +- arch/i386/kernel/cpu/common.c | 2 +- arch/i386/kernel/cpu/cyrix.c | 4 ++-- arch/i386/kernel/cpu/nexgen.c | 2 +- arch/i386/kernel/cpu/rise.c | 2 +- arch/i386/kernel/cpu/transmeta.c | 2 +- arch/i386/kernel/cpu/umc.c | 2 +- 8 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c index d6e7778bac70..eb134a20a7b8 100644 --- a/arch/i386/kernel/cpu/amd.c +++ b/arch/i386/kernel/cpu/amd.c @@ -259,7 +259,7 @@ static unsigned int amd_size_cache(struct cpuinfo_x86 * c, unsigned int size) return size; } -static struct cpu_dev amd_cpu_dev __initdata = { +static struct cpu_dev amd_cpu_dev __cpuinitdata = { .c_vendor = "AMD", .c_ident = { "AuthenticAMD" }, .c_models = { diff --git a/arch/i386/kernel/cpu/centaur.c b/arch/i386/kernel/cpu/centaur.c index bd75629dd262..b4d70dcea9f8 100644 --- a/arch/i386/kernel/cpu/centaur.c +++ b/arch/i386/kernel/cpu/centaur.c @@ -457,7 +457,7 @@ static unsigned int centaur_size_cache(struct cpuinfo_x86 * c, unsigned int size return size; } -static struct cpu_dev centaur_cpu_dev __initdata = { +static struct cpu_dev centaur_cpu_dev __cpuinitdata = { .c_vendor = "Centaur", .c_ident = { "CentaurHauls" }, .c_init = init_centaur, diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index 24ac51a02378..99144449b9a1 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -49,7 +49,7 @@ static void default_init(struct cpuinfo_x86 * c) } } -static struct cpu_dev default_cpu = { +static struct cpu_dev __cpuinitdata default_cpu = { .c_init = default_init, .c_vendor = "Unknown", }; diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c index bb02ed1fe560..c2a0da9e0b55 100644 --- a/arch/i386/kernel/cpu/cyrix.c +++ b/arch/i386/kernel/cpu/cyrix.c @@ -429,7 +429,7 @@ static void cyrix_identify(struct cpuinfo_x86 * c) } } -static struct cpu_dev cyrix_cpu_dev __initdata = { +static struct cpu_dev cyrix_cpu_dev __cpuinitdata = { .c_vendor = "Cyrix", .c_ident = { "CyrixInstead" }, .c_init = init_cyrix, @@ -452,7 +452,7 @@ static int __init cyrix_exit_cpu(void) late_initcall(cyrix_exit_cpu); -static struct cpu_dev nsc_cpu_dev __initdata = { +static struct cpu_dev nsc_cpu_dev __cpuinitdata = { .c_vendor = "NSC", .c_ident = { "Geode by NSC" }, .c_init = init_nsc, diff --git a/arch/i386/kernel/cpu/nexgen.c b/arch/i386/kernel/cpu/nexgen.c index df188bfd759b..9d622598c055 100644 --- a/arch/i386/kernel/cpu/nexgen.c +++ b/arch/i386/kernel/cpu/nexgen.c @@ -40,7 +40,7 @@ static void __init nexgen_identify(struct cpuinfo_x86 * c) } } -static struct cpu_dev nexgen_cpu_dev __initdata = { +static struct cpu_dev nexgen_cpu_dev __cpuinitdata = { .c_vendor = "Nexgen", .c_ident = { "NexGenDriven" }, .c_models = { diff --git a/arch/i386/kernel/cpu/rise.c b/arch/i386/kernel/cpu/rise.c index d08d5a2811c8..b597e435f6c1 100644 --- a/arch/i386/kernel/cpu/rise.c +++ b/arch/i386/kernel/cpu/rise.c @@ -28,7 +28,7 @@ static void __init init_rise(struct cpuinfo_x86 *c) set_bit(X86_FEATURE_CX8, c->x86_capability); } -static struct cpu_dev rise_cpu_dev __initdata = { +static struct cpu_dev rise_cpu_dev __cpuinitdata = { .c_vendor = "Rise", .c_ident = { "RiseRiseRise" }, .c_models = { diff --git a/arch/i386/kernel/cpu/transmeta.c b/arch/i386/kernel/cpu/transmeta.c index 4027c369bc90..0b6e8392ed33 100644 --- a/arch/i386/kernel/cpu/transmeta.c +++ b/arch/i386/kernel/cpu/transmeta.c @@ -97,7 +97,7 @@ static void __init transmeta_identify(struct cpuinfo_x86 * c) } } -static struct cpu_dev transmeta_cpu_dev __initdata = { +static struct cpu_dev transmeta_cpu_dev __cpuinitdata = { .c_vendor = "Transmeta", .c_ident = { "GenuineTMx86", "TransmetaCPU" }, .c_init = init_transmeta, diff --git a/arch/i386/kernel/cpu/umc.c b/arch/i386/kernel/cpu/umc.c index 2cd988f6dc55..392b195948d6 100644 --- a/arch/i386/kernel/cpu/umc.c +++ b/arch/i386/kernel/cpu/umc.c @@ -10,7 +10,7 @@ static void __init init_umc(struct cpuinfo_x86 * c) } -static struct cpu_dev umc_cpu_dev __initdata = { +static struct cpu_dev umc_cpu_dev __cpuinitdata = { .c_vendor = "UMC", .c_ident = { "UMC UMC UMC" }, .c_models = { -- cgit v1.2.1 From b4af3f7cf11e6b5904af08a652d4a2429af17c74 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 26 Sep 2006 10:52:36 +0200 Subject: [PATCH] i386: mark cpu init functions as __cpuinit, data as __cpuinitdata Mark i386-specific cpu init functions as __cpuinit. They are all only called from arch/i386/common.c:identify_cpu() that already is marked as __cpuinit. This patch also removes the empty function init_umc(). Signed-off-by: Magnus Damm Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/amd.c | 2 +- arch/i386/kernel/cpu/centaur.c | 20 ++++++++++---------- arch/i386/kernel/cpu/common.c | 2 +- arch/i386/kernel/cpu/cyrix.c | 32 ++++++++++++++++---------------- arch/i386/kernel/cpu/nexgen.c | 2 +- arch/i386/kernel/cpu/rise.c | 2 +- arch/i386/kernel/cpu/transmeta.c | 2 +- arch/i386/kernel/cpu/umc.c | 5 ----- 8 files changed, 31 insertions(+), 36 deletions(-) diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c index eb134a20a7b8..856cd08f2f3d 100644 --- a/arch/i386/kernel/cpu/amd.c +++ b/arch/i386/kernel/cpu/amd.c @@ -22,7 +22,7 @@ extern void vide(void); __asm__(".align 4\nvide: ret"); -static void __init init_amd(struct cpuinfo_x86 *c) +static void __cpuinit init_amd(struct cpuinfo_x86 *c) { u32 l, h; int mbytes = num_physpages >> (20-PAGE_SHIFT); diff --git a/arch/i386/kernel/cpu/centaur.c b/arch/i386/kernel/cpu/centaur.c index b4d70dcea9f8..34a81ede7892 100644 --- a/arch/i386/kernel/cpu/centaur.c +++ b/arch/i386/kernel/cpu/centaur.c @@ -9,7 +9,7 @@ #ifdef CONFIG_X86_OOSTORE -static u32 __init power2(u32 x) +static u32 __cpuinit power2(u32 x) { u32 s=1; while(s<=x) @@ -22,7 +22,7 @@ static u32 __init power2(u32 x) * Set up an actual MCR */ -static void __init centaur_mcr_insert(int reg, u32 base, u32 size, int key) +static void __cpuinit centaur_mcr_insert(int reg, u32 base, u32 size, int key) { u32 lo, hi; @@ -40,7 +40,7 @@ static void __init centaur_mcr_insert(int reg, u32 base, u32 size, int key) * Shortcut: We know you can't put 4Gig of RAM on a winchip */ -static u32 __init ramtop(void) /* 16388 */ +static u32 __cpuinit ramtop(void) /* 16388 */ { int i; u32 top = 0; @@ -91,7 +91,7 @@ static u32 __init ramtop(void) /* 16388 */ * Compute a set of MCR's to give maximum coverage */ -static int __init centaur_mcr_compute(int nr, int key) +static int __cpuinit centaur_mcr_compute(int nr, int key) { u32 mem = ramtop(); u32 root = power2(mem); @@ -166,7 +166,7 @@ static int __init centaur_mcr_compute(int nr, int key) return ct; } -static void __init centaur_create_optimal_mcr(void) +static void __cpuinit centaur_create_optimal_mcr(void) { int i; /* @@ -189,7 +189,7 @@ static void __init centaur_create_optimal_mcr(void) wrmsr(MSR_IDT_MCR0+i, 0, 0); } -static void __init winchip2_create_optimal_mcr(void) +static void __cpuinit winchip2_create_optimal_mcr(void) { u32 lo, hi; int i; @@ -227,7 +227,7 @@ static void __init winchip2_create_optimal_mcr(void) * Handle the MCR key on the Winchip 2. */ -static void __init winchip2_unprotect_mcr(void) +static void __cpuinit winchip2_unprotect_mcr(void) { u32 lo, hi; u32 key; @@ -239,7 +239,7 @@ static void __init winchip2_unprotect_mcr(void) wrmsr(MSR_IDT_MCR_CTRL, lo, hi); } -static void __init winchip2_protect_mcr(void) +static void __cpuinit winchip2_protect_mcr(void) { u32 lo, hi; @@ -257,7 +257,7 @@ static void __init winchip2_protect_mcr(void) #define RNG_ENABLED (1 << 3) #define RNG_ENABLE (1 << 6) /* MSR_VIA_RNG */ -static void __init init_c3(struct cpuinfo_x86 *c) +static void __cpuinit init_c3(struct cpuinfo_x86 *c) { u32 lo, hi; @@ -303,7 +303,7 @@ static void __init init_c3(struct cpuinfo_x86 *c) display_cacheinfo(c); } -static void __init init_centaur(struct cpuinfo_x86 *c) +static void __cpuinit init_centaur(struct cpuinfo_x86 *c) { enum { ECX8=1<<1, diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index 99144449b9a1..2799baaadf45 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -36,7 +36,7 @@ struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {}; extern int disable_pse; -static void default_init(struct cpuinfo_x86 * c) +static void __cpuinit default_init(struct cpuinfo_x86 * c) { /* Not much we can do here... */ /* Check if at least it has cpuid */ diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c index c2a0da9e0b55..d34b1bdb15fd 100644 --- a/arch/i386/kernel/cpu/cyrix.c +++ b/arch/i386/kernel/cpu/cyrix.c @@ -52,25 +52,25 @@ static void __init do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) * Actually since bugs.h doesn't even reference this perhaps someone should * fix the documentation ??? */ -static unsigned char Cx86_dir0_msb __initdata = 0; +static unsigned char Cx86_dir0_msb __cpuinitdata = 0; -static char Cx86_model[][9] __initdata = { +static char Cx86_model[][9] __cpuinitdata = { "Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ", "M II ", "Unknown" }; -static char Cx486_name[][5] __initdata = { +static char Cx486_name[][5] __cpuinitdata = { "SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx", "SRx2", "DRx2" }; -static char Cx486S_name[][4] __initdata = { +static char Cx486S_name[][4] __cpuinitdata = { "S", "S2", "Se", "S2e" }; -static char Cx486D_name[][4] __initdata = { +static char Cx486D_name[][4] __cpuinitdata = { "DX", "DX2", "?", "?", "?", "DX4" }; -static char Cx86_cb[] __initdata = "?.5x Core/Bus Clock"; -static char cyrix_model_mult1[] __initdata = "12??43"; -static char cyrix_model_mult2[] __initdata = "12233445"; +static char Cx86_cb[] __cpuinitdata = "?.5x Core/Bus Clock"; +static char cyrix_model_mult1[] __cpuinitdata = "12??43"; +static char cyrix_model_mult2[] __cpuinitdata = "12233445"; /* * Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old @@ -82,7 +82,7 @@ static char cyrix_model_mult2[] __initdata = "12233445"; extern void calibrate_delay(void) __init; -static void __init check_cx686_slop(struct cpuinfo_x86 *c) +static void __cpuinit check_cx686_slop(struct cpuinfo_x86 *c) { unsigned long flags; @@ -107,7 +107,7 @@ static void __init check_cx686_slop(struct cpuinfo_x86 *c) } -static void __init set_cx86_reorder(void) +static void __cpuinit set_cx86_reorder(void) { u8 ccr3; @@ -122,7 +122,7 @@ static void __init set_cx86_reorder(void) setCx86(CX86_CCR3, ccr3); } -static void __init set_cx86_memwb(void) +static void __cpuinit set_cx86_memwb(void) { u32 cr0; @@ -137,7 +137,7 @@ static void __init set_cx86_memwb(void) setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14 ); } -static void __init set_cx86_inc(void) +static void __cpuinit set_cx86_inc(void) { unsigned char ccr3; @@ -158,7 +158,7 @@ static void __init set_cx86_inc(void) * Configure later MediaGX and/or Geode processor. */ -static void __init geode_configure(void) +static void __cpuinit geode_configure(void) { unsigned long flags; u8 ccr3, ccr4; @@ -184,14 +184,14 @@ static void __init geode_configure(void) #ifdef CONFIG_PCI -static struct pci_device_id __initdata cyrix_55x0[] = { +static struct pci_device_id __cpuinitdata cyrix_55x0[] = { { PCI_DEVICE(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510) }, { PCI_DEVICE(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520) }, { }, }; #endif -static void __init init_cyrix(struct cpuinfo_x86 *c) +static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) { unsigned char dir0, dir0_msn, dir0_lsn, dir1 = 0; char *buf = c->x86_model_id; @@ -346,7 +346,7 @@ static void __init init_cyrix(struct cpuinfo_x86 *c) /* * Handle National Semiconductor branded processors */ -static void __init init_nsc(struct cpuinfo_x86 *c) +static void __cpuinit init_nsc(struct cpuinfo_x86 *c) { /* There may be GX1 processors in the wild that are branded * NSC and not Cyrix. diff --git a/arch/i386/kernel/cpu/nexgen.c b/arch/i386/kernel/cpu/nexgen.c index 9d622598c055..d8cc88224d17 100644 --- a/arch/i386/kernel/cpu/nexgen.c +++ b/arch/i386/kernel/cpu/nexgen.c @@ -27,7 +27,7 @@ static int __init deep_magic_nexgen_probe(void) return ret; } -static void __init init_nexgen(struct cpuinfo_x86 * c) +static void __cpuinit init_nexgen(struct cpuinfo_x86 * c) { c->x86_cache_size = 256; /* A few had 1 MB... */ } diff --git a/arch/i386/kernel/cpu/rise.c b/arch/i386/kernel/cpu/rise.c index b597e435f6c1..9317f7414989 100644 --- a/arch/i386/kernel/cpu/rise.c +++ b/arch/i386/kernel/cpu/rise.c @@ -5,7 +5,7 @@ #include "cpu.h" -static void __init init_rise(struct cpuinfo_x86 *c) +static void __cpuinit init_rise(struct cpuinfo_x86 *c) { printk("CPU: Rise iDragon"); if (c->x86_model > 2) diff --git a/arch/i386/kernel/cpu/transmeta.c b/arch/i386/kernel/cpu/transmeta.c index 0b6e8392ed33..725404cd77f7 100644 --- a/arch/i386/kernel/cpu/transmeta.c +++ b/arch/i386/kernel/cpu/transmeta.c @@ -5,7 +5,7 @@ #include #include "cpu.h" -static void __init init_transmeta(struct cpuinfo_x86 *c) +static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) { unsigned int cap_mask, uk, max, dummy; unsigned int cms_rev1, cms_rev2; diff --git a/arch/i386/kernel/cpu/umc.c b/arch/i386/kernel/cpu/umc.c index 392b195948d6..1bf3f87e9c5b 100644 --- a/arch/i386/kernel/cpu/umc.c +++ b/arch/i386/kernel/cpu/umc.c @@ -5,10 +5,6 @@ /* UMC chips appear to be only either 386 or 486, so no special init takes place. */ -static void __init init_umc(struct cpuinfo_x86 * c) -{ - -} static struct cpu_dev umc_cpu_dev __cpuinitdata = { .c_vendor = "UMC", @@ -21,7 +17,6 @@ static struct cpu_dev umc_cpu_dev __cpuinitdata = { } }, }, - .c_init = init_umc, }; int __init umc_init_cpu(void) -- cgit v1.2.1 From 5f0f1c166647860bb2c2a206338e7d9af3834753 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 26 Sep 2006 10:52:36 +0200 Subject: [PATCH] i386: mark cpu identify functions as __cpuinit Mark i386-specific cpu identification functions as __cpuinit. They are all only called from arch/i386/common.c:identify_cpu() that already is marked as __cpuinit. Signed-off-by: Magnus Damm Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/cyrix.c | 4 ++-- arch/i386/kernel/cpu/nexgen.c | 4 ++-- arch/i386/kernel/cpu/transmeta.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c index d34b1bdb15fd..c0c3b59de32c 100644 --- a/arch/i386/kernel/cpu/cyrix.c +++ b/arch/i386/kernel/cpu/cyrix.c @@ -12,7 +12,7 @@ /* * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU */ -static void __init do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) +static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) { unsigned char ccr2, ccr3; unsigned long flags; @@ -394,7 +394,7 @@ static inline int test_cyrix_52div(void) return (unsigned char) (test >> 8) == 0x02; } -static void cyrix_identify(struct cpuinfo_x86 * c) +static void __cpuinit cyrix_identify(struct cpuinfo_x86 * c) { /* Detect Cyrix with disabled CPUID */ if ( c->x86 == 4 && test_cyrix_52div() ) { diff --git a/arch/i386/kernel/cpu/nexgen.c b/arch/i386/kernel/cpu/nexgen.c index d8cc88224d17..8bf23cc80c63 100644 --- a/arch/i386/kernel/cpu/nexgen.c +++ b/arch/i386/kernel/cpu/nexgen.c @@ -10,7 +10,7 @@ * to have CPUID. (Thanks to Herbert Oppmann) */ -static int __init deep_magic_nexgen_probe(void) +static int __cpuinit deep_magic_nexgen_probe(void) { int ret; @@ -32,7 +32,7 @@ static void __cpuinit init_nexgen(struct cpuinfo_x86 * c) c->x86_cache_size = 256; /* A few had 1 MB... */ } -static void __init nexgen_identify(struct cpuinfo_x86 * c) +static void __cpuinit nexgen_identify(struct cpuinfo_x86 * c) { /* Detect NexGen with old hypercode */ if ( deep_magic_nexgen_probe() ) { diff --git a/arch/i386/kernel/cpu/transmeta.c b/arch/i386/kernel/cpu/transmeta.c index 725404cd77f7..4056fb7d2cdf 100644 --- a/arch/i386/kernel/cpu/transmeta.c +++ b/arch/i386/kernel/cpu/transmeta.c @@ -85,7 +85,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) #endif } -static void __init transmeta_identify(struct cpuinfo_x86 * c) +static void __cpuinit transmeta_identify(struct cpuinfo_x86 * c) { u32 xlvl; -- cgit v1.2.1 From e9dff0ee6694b2edd40b1b448cb786f6a7b02336 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 26 Sep 2006 10:52:36 +0200 Subject: [PATCH] i386: mark cpu cache functions as __cpuinit Mark i386-specific cpu cache functions as __cpuinit. They are all only called from arch/i386/common.c:display_cache_info() that already is marked as __cpuinit. Signed-off-by: Magnus Damm Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/amd.c | 2 +- arch/i386/kernel/cpu/centaur.c | 2 +- arch/i386/kernel/cpu/intel.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c index 856cd08f2f3d..e4758095d87a 100644 --- a/arch/i386/kernel/cpu/amd.c +++ b/arch/i386/kernel/cpu/amd.c @@ -246,7 +246,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) num_cache_leaves = 3; } -static unsigned int amd_size_cache(struct cpuinfo_x86 * c, unsigned int size) +static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 * c, unsigned int size) { /* AMD errata T13 (order #21922) */ if ((c->x86 == 6)) { diff --git a/arch/i386/kernel/cpu/centaur.c b/arch/i386/kernel/cpu/centaur.c index 34a81ede7892..8c25047975c0 100644 --- a/arch/i386/kernel/cpu/centaur.c +++ b/arch/i386/kernel/cpu/centaur.c @@ -442,7 +442,7 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c) } } -static unsigned int centaur_size_cache(struct cpuinfo_x86 * c, unsigned int size) +static unsigned int __cpuinit centaur_size_cache(struct cpuinfo_x86 * c, unsigned int size) { /* VIA C3 CPUs (670-68F) need further shifting. */ if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8))) diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c index 26243e019d11..94a95aa5227e 100644 --- a/arch/i386/kernel/cpu/intel.c +++ b/arch/i386/kernel/cpu/intel.c @@ -198,7 +198,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) } -static unsigned int intel_size_cache(struct cpuinfo_x86 * c, unsigned int size) +static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 * c, unsigned int size) { /* Intel PIII Tualatin. This comes in two flavours. * One has 256kb of cache, the other 512. We have no way -- cgit v1.2.1 From 6f6b1e0477ccb2f25a9b045e38440347d2ce21c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fernando=20Luis=20V=E1zquez=20Cao?= Date: Tue, 26 Sep 2006 10:52:36 +0200 Subject: [PATCH] i386: Disallow kprobes on NMI handlers A kprobe executes IRET early and that could cause NMI recursion and stack corruption. Note: This problem was originally spotted by Andi Kleen. This patch adds fixes not included in his original patch. [AK: Jan Beulich originally discovered these classes of bugs] Signed-off-by: Fernando Vazquez Signed-off-by: Andi Kleen --- arch/i386/kernel/mca.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/i386/kernel/mca.c b/arch/i386/kernel/mca.c index cd5456f14af4..eb57a851789d 100644 --- a/arch/i386/kernel/mca.c +++ b/arch/i386/kernel/mca.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -414,7 +415,8 @@ subsys_initcall(mca_init); /*--------------------------------------------------------------------*/ -static void mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag) +static __kprobes void +mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag) { int slot = mca_dev->slot; @@ -444,7 +446,7 @@ static void mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag) /*--------------------------------------------------------------------*/ -static int mca_handle_nmi_callback(struct device *dev, void *data) +static int __kprobes mca_handle_nmi_callback(struct device *dev, void *data) { struct mca_device *mca_dev = to_mca_device(dev); unsigned char pos5; @@ -462,7 +464,7 @@ static int mca_handle_nmi_callback(struct device *dev, void *data) return 0; } -void mca_handle_nmi(void) +void __kprobes mca_handle_nmi(void) { /* First try - scan the various adapters and see if a specific * adapter was responsible for the error. -- cgit v1.2.1 From 06039754d775d3e48e4a292e4f353321205eff53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fernando=20Luis=20V=E1zquez=20Cao?= Date: Tue, 26 Sep 2006 10:52:36 +0200 Subject: [PATCH] i386: Disallow kprobes on NMI handlers A kprobe executes IRET early and that could cause NMI recursion and stack corruption. Note: This problem was originally spotted and solved by Andi Kleen in the x86_64 architecture. This patch is an adaption of his patch for i386. AK: Merged with current code which was a bit different. AK: Removed printk in nmi handler that shouldn't be there in the first time AK: Added missing include. AK: added KPROBES_END Signed-off-by: Fernando Vazquez Signed-off-by: Andi Kleen --- arch/i386/kernel/entry.S | 3 ++- arch/i386/kernel/nmi.c | 6 +++--- arch/i386/kernel/traps.c | 15 +++++++++------ 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index dede506e5bd0..0928f70639aa 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -729,7 +729,7 @@ KPROBE_END(debug) * check whether we got an NMI on the debug path where the debug * fault happened on the sysenter path. */ -ENTRY(nmi) +KPROBE_ENTRY(nmi) RING0_INT_FRAME pushl %eax CFI_ADJUST_CFA_OFFSET 4 @@ -805,6 +805,7 @@ nmi_16bit_stack: .align 4 .long 1b,iret_exc .previous +KPROBE_END(nmi) KPROBE_ENTRY(int3) RING0_INT_FRAME diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 7b9a053effa3..dbda706fdd14 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -882,7 +883,7 @@ EXPORT_SYMBOL(touch_nmi_watchdog); extern void die_nmi(struct pt_regs *, const char *msg); -int nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) +__kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) { /* @@ -962,8 +963,7 @@ int nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) * This matches the old behaviour. */ rc = 1; - } else - printk(KERN_WARNING "Unknown enabled NMI hardware?!\n"); + } } done: return rc; diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 00d643f3de41..5c0f4960c67d 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -689,7 +689,8 @@ gp_in_kernel: } } -static void mem_parity_error(unsigned char reason, struct pt_regs * regs) +static __kprobes void +mem_parity_error(unsigned char reason, struct pt_regs * regs) { printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on " "CPU %d.\n", reason, smp_processor_id()); @@ -704,7 +705,8 @@ static void mem_parity_error(unsigned char reason, struct pt_regs * regs) clear_mem_error(reason); } -static void io_check_error(unsigned char reason, struct pt_regs * regs) +static __kprobes void +io_check_error(unsigned char reason, struct pt_regs * regs) { unsigned long i; @@ -720,7 +722,8 @@ static void io_check_error(unsigned char reason, struct pt_regs * regs) outb(reason, 0x61); } -static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) +static __kprobes void +unknown_nmi_error(unsigned char reason, struct pt_regs * regs) { #ifdef CONFIG_MCA /* Might actually be able to figure out what the guilty party @@ -741,7 +744,7 @@ static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) static DEFINE_SPINLOCK(nmi_print_lock); -void die_nmi (struct pt_regs *regs, const char *msg) +void __kprobes die_nmi(struct pt_regs *regs, const char *msg) { if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) == NOTIFY_STOP) @@ -773,7 +776,7 @@ void die_nmi (struct pt_regs *regs, const char *msg) do_exit(SIGSEGV); } -static void default_do_nmi(struct pt_regs * regs) +static __kprobes void default_do_nmi(struct pt_regs * regs) { unsigned char reason = 0; @@ -811,7 +814,7 @@ static void default_do_nmi(struct pt_regs * regs) reassert_nmi(); } -fastcall void do_nmi(struct pt_regs * regs, long error_code) +fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code) { int cpu; -- cgit v1.2.1 From f704cb935006580db0495e54d3c82631f6e2a984 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Tue, 26 Sep 2006 10:52:36 +0200 Subject: [PATCH] x86: remove config.h includes from asm-i386 & asm-x86_64 This is now automatically included by kbuild. Signed-off-by: Dave Jones Signed-off-by: Andi Kleen --- include/asm-i386/dwarf2.h | 2 -- include/asm-i386/tsc.h | 1 - include/asm-x86_64/calgary.h | 1 - 3 files changed, 4 deletions(-) diff --git a/include/asm-i386/dwarf2.h b/include/asm-i386/dwarf2.h index 2280f6272f80..fe2392f361ee 100644 --- a/include/asm-i386/dwarf2.h +++ b/include/asm-i386/dwarf2.h @@ -1,8 +1,6 @@ #ifndef _DWARF2_H #define _DWARF2_H -#include - #ifndef __ASSEMBLY__ #warning "asm/dwarf2.h should be only included in pure assembly files" #endif diff --git a/include/asm-i386/tsc.h b/include/asm-i386/tsc.h index 97b828ce31e0..c13933185c1c 100644 --- a/include/asm-i386/tsc.h +++ b/include/asm-i386/tsc.h @@ -6,7 +6,6 @@ #ifndef _ASM_i386_TSC_H #define _ASM_i386_TSC_H -#include #include /* diff --git a/include/asm-x86_64/calgary.h b/include/asm-x86_64/calgary.h index 0a03bda94d03..6b93f5a3a5c8 100644 --- a/include/asm-x86_64/calgary.h +++ b/include/asm-x86_64/calgary.h @@ -24,7 +24,6 @@ #ifndef _ASM_X86_64_CALGARY_H #define _ASM_X86_64_CALGARY_H -#include #include #include #include -- cgit v1.2.1 From dbf9272e863bf4b17ee8e3c66c26682b2061d40d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:36 +0200 Subject: [PATCH] Don't force reserve the 640k-1MB range From i386 x86-64 inherited code to force reserve the 640k-1MB area. That was needed on some old systems. But we generally trust the e820 map to be correct on 64bit systems and mark all areas that are not memory correctly. This patch will allow to use the real memory in there. Or rather the only way to find out if it's still needed is to try. So far I'm optimistic. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/e820.c | 33 +-------------------------------- 1 file changed, 1 insertion(+), 32 deletions(-) diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index 164d0b83dc92..e06c2714ecf3 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -71,12 +71,7 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size) return 1; } #endif - /* kernel code + 640k memory hole (later should not be needed, but - be paranoid for now) */ - if (last >= 640*1024 && addr < 1024*1024) { - *addrp = 1024*1024; - return 1; - } + /* kernel code */ if (last >= __pa_symbol(&_text) && last < __pa_symbol(&_end)) { *addrp = __pa_symbol(&_end); return 1; @@ -519,13 +514,6 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) * If we're lucky and live on a modern system, the setup code * will have given us a memory map that we can use to properly * set up memory. If we aren't, we'll fake a memory map. - * - * We check to see that the memory map contains at least 2 elements - * before we'll use it, because the detection code in setup.S may - * not be perfect and most every PC known to man has two memory - * regions: one from 0 to 640k, and one from 1mb up. (The IBM - * thinkpad 560x, for example, does not cooperate with the memory - * detection code.) */ static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) { @@ -543,25 +531,6 @@ static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) if (start > end) return -1; - /* - * Some BIOSes claim RAM in the 640k - 1M region. - * Not right. Fix it up. - * - * This should be removed on Hammer which is supposed to not - * have non e820 covered ISA mappings there, but I had some strange - * problems so it stays for now. -AK - */ - if (type == E820_RAM) { - if (start < 0x100000ULL && end > 0xA0000ULL) { - if (start < 0xA0000ULL) - add_memory_region(start, 0xA0000ULL-start, type); - if (end <= 0x100000ULL) - continue; - start = 0x100000ULL; - size = end - start; - } - } - add_memory_region(start, size, type); } while (biosmap++,--nr_map); return 0; -- cgit v1.2.1 From 1bb4996bcebca1cde49d964b4e012699ce180e61 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:37 +0200 Subject: [PATCH] Move compiler check for modules to ia64 only Apparently IA64 needs it, but i386/x86-64 don't anymore since gcc 2.95 support was dropped. Nobody else on linux-arch requested keeping it generically Cc: tony.luck@intel.com Cc: kaos@sgi.com Signed-off-by: Andi Kleen --- include/asm-ia64/module.h | 3 ++- include/linux/vermagic.h | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/include/asm-ia64/module.h b/include/asm-ia64/module.h index 85c82bd819f2..d2da61e4c49b 100644 --- a/include/asm-ia64/module.h +++ b/include/asm-ia64/module.h @@ -28,7 +28,8 @@ struct mod_arch_specific { #define Elf_Ehdr Elf64_Ehdr #define MODULE_PROC_FAMILY "ia64" -#define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY +#define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY \ + "gcc-" __stringify(__GNUC__) "." __stringify(__GNUC_MINOR__) #define ARCH_SHF_SMALL SHF_IA_64_SHORT diff --git a/include/linux/vermagic.h b/include/linux/vermagic.h index 46919f9f5eb3..4d0909e53595 100644 --- a/include/linux/vermagic.h +++ b/include/linux/vermagic.h @@ -24,5 +24,5 @@ #define VERMAGIC_STRING \ UTS_RELEASE " " \ MODULE_VERMAGIC_SMP MODULE_VERMAGIC_PREEMPT \ - MODULE_VERMAGIC_MODULE_UNLOAD MODULE_ARCH_VERMAGIC \ - "gcc-" __stringify(__GNUC__) "." __stringify(__GNUC_MINOR__) + MODULE_VERMAGIC_MODULE_UNLOAD MODULE_ARCH_VERMAGIC + -- cgit v1.2.1 From 1164c9994fe37d5b7035a5cf9328c98dd38af7b1 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 26 Sep 2006 10:52:37 +0200 Subject: [PATCH] make numa_emulation() __init Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/x86_64/mm/numa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index d64d6d93d04b..322bf45fc36a 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -225,7 +225,7 @@ void __init numa_init_array(void) int numa_fake __initdata = 0; /* Numa emulation */ -static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn) +static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) { int i; struct bootnode nodes[MAX_NUMNODES]; -- cgit v1.2.1 From a549b86dd0f3cbffcd5f9343f4ae7fcd59f7e756 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert <76306.1226@compuserve.com> Date: Tue, 26 Sep 2006 10:52:37 +0200 Subject: [PATCH] i386: annotate FIX_STACK() and the rest of nmi() In i386's entry.S, FIX_STACK() needs annotation because it replaces the stack pointer. And the rest of nmi() needs annotation in order to compile with these new annotations. Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Signed-off-by: Andi Kleen --- arch/i386/kernel/entry.S | 18 +++++++++++++++--- include/asm-i386/dwarf2.h | 2 ++ 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 0928f70639aa..4b0845249222 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -701,9 +701,15 @@ device_not_available_emulate: jne ok; \ label: \ movl TSS_sysenter_esp0+offset(%esp),%esp; \ + CFI_DEF_CFA esp, 0; \ + CFI_UNDEFINED eip; \ pushfl; \ + CFI_ADJUST_CFA_OFFSET 4; \ pushl $__KERNEL_CS; \ - pushl $sysenter_past_esp + CFI_ADJUST_CFA_OFFSET 4; \ + pushl $sysenter_past_esp; \ + CFI_ADJUST_CFA_OFFSET 4; \ + CFI_REL_OFFSET eip, 0 KPROBE_ENTRY(debug) RING0_INT_FRAME @@ -754,6 +760,7 @@ KPROBE_ENTRY(nmi) cmpl $sysenter_entry,12(%esp) je nmi_debug_stack_check nmi_stack_correct: + /* We have a RING0_INT_FRAME here */ pushl %eax CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL @@ -764,9 +771,12 @@ nmi_stack_correct: CFI_ENDPROC nmi_stack_fixup: + RING0_INT_FRAME FIX_STACK(12,nmi_stack_correct, 1) jmp nmi_stack_correct + nmi_debug_stack_check: + /* We have a RING0_INT_FRAME here */ cmpw $__KERNEL_CS,16(%esp) jne nmi_stack_correct cmpl $debug,(%esp) @@ -777,8 +787,10 @@ nmi_debug_stack_check: jmp nmi_stack_correct nmi_16bit_stack: - RING0_INT_FRAME - /* create the pointer to lss back */ + /* We have a RING0_INT_FRAME here. + * + * create the pointer to lss back + */ pushl %ss CFI_ADJUST_CFA_OFFSET 4 pushl %esp diff --git a/include/asm-i386/dwarf2.h b/include/asm-i386/dwarf2.h index fe2392f361ee..5d1a8db5a9b0 100644 --- a/include/asm-i386/dwarf2.h +++ b/include/asm-i386/dwarf2.h @@ -26,6 +26,7 @@ #define CFI_RESTORE .cfi_restore #define CFI_REMEMBER_STATE .cfi_remember_state #define CFI_RESTORE_STATE .cfi_restore_state +#define CFI_UNDEFINED .cfi_undefined #else @@ -46,6 +47,7 @@ #define CFI_RESTORE ignore #define CFI_REMEMBER_STATE ignore #define CFI_RESTORE_STATE ignore +#define CFI_UNDEFINED ignore #endif -- cgit v1.2.1 From 34464a5b8937b79801776dfb6970c1b949fed4be Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 26 Sep 2006 10:52:37 +0200 Subject: [PATCH] Detect clock skew during suspend Detect the situations in which the time after a resume from disk would be earlier than the time before the suspend and prevent them from happening on x86_64. Signed-off-by: Rafael J. Wysocki Signed-off-by: Andi Kleen --- arch/x86_64/kernel/time.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 97115e608ed8..9dd15d12b659 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -1039,8 +1039,16 @@ static int timer_resume(struct sys_device *dev) unsigned long flags; unsigned long sec; unsigned long ctime = get_cmos_time(); - unsigned long sleep_length = (ctime - sleep_start) * HZ; + long sleep_length = (ctime - sleep_start) * HZ; + if (sleep_length < 0) { + printk(KERN_WARNING "Time skew detected in timer resume!\n"); + /* The time after the resume must not be earlier than the time + * before the suspend or some nasty things will happen + */ + sleep_length = 0; + ctime = sleep_start; + } if (vxtime.hpet_address) hpet_reenable(); else -- cgit v1.2.1 From 151f8cc1169f9052095b2be36183ab132d75c6c2 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:37 +0200 Subject: [PATCH] Remove safe_smp_processor_id() And replace all users with ordinary smp_processor_id. The function was originally added to get some basic oops information out even if the GS register was corrupted. However that didn't work for some anymore because printk is needed to print the oops and it uses smp_processor_id() already. Also GS register corruptions are not particularly common anymore. This also helps the Xen port which would otherwise need to do this in a special way because it can't access the local APIC. Cc: Chris Wright Signed-off-by: Andi Kleen --- arch/x86_64/kernel/mce.c | 2 +- arch/x86_64/kernel/smp.c | 23 ----------------------- arch/x86_64/kernel/traps.c | 10 +++++----- include/asm-x86_64/smp.h | 2 -- 4 files changed, 6 insertions(+), 31 deletions(-) diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index 4e017fb30fb3..1a93c3738404 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -182,7 +182,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) goto out2; memset(&m, 0, sizeof(struct mce)); - m.cpu = safe_smp_processor_id(); + m.cpu = smp_processor_id(); rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); if (!(m.mcgstatus & MCG_STATUS_RIPV)) kill_it = 1; diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index 2df988bbf312..4f67697f5036 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -522,26 +522,3 @@ asmlinkage void smp_call_function_interrupt(void) } } -int safe_smp_processor_id(void) -{ - unsigned apicid, i; - - if (disable_apic || !apic_mapped) - return 0; - - apicid = hard_smp_processor_id(); - if (apicid < NR_CPUS && x86_cpu_to_apicid[apicid] == apicid) - return apicid; - - for (i = 0; i < NR_CPUS; ++i) { - if (x86_cpu_to_apicid[i] == apicid) - return i; - } - - /* No entries in x86_cpu_to_apicid? Either no MPS|ACPI, - * or called too early. Either way, we must be CPU 0. */ - if (x86_cpu_to_apicid[0] == BAD_APICID) - return 0; - - return 0; /* Should not happen */ -} diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 34660b1e2720..38bc821e457b 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -264,7 +264,7 @@ static int dump_trace_unwind(struct unwind_frame_info *info, void *context) void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack, struct stacktrace_ops *ops, void *data) { - const unsigned cpu = safe_smp_processor_id(); + const unsigned cpu = smp_processor_id(); unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; unsigned used = 0; @@ -429,7 +429,7 @@ _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp) { unsigned long *stack; int i; - const int cpu = safe_smp_processor_id(); + const int cpu = smp_processor_id(); unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr); unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); @@ -483,7 +483,7 @@ void show_registers(struct pt_regs *regs) int i; int in_kernel = !user_mode(regs); unsigned long rsp; - const int cpu = safe_smp_processor_id(); + const int cpu = smp_processor_id(); struct task_struct *cur = cpu_pda(cpu)->pcurrent; rsp = regs->rsp; @@ -558,7 +558,7 @@ static unsigned int die_nest_count; unsigned __kprobes long oops_begin(void) { - int cpu = safe_smp_processor_id(); + int cpu = smp_processor_id(); unsigned long flags; oops_enter(); @@ -636,7 +636,7 @@ void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) * We are in trouble anyway, lets at least try * to get a message out. */ - printk(str, safe_smp_processor_id()); + printk(str, smp_processor_id()); show_registers(regs); if (kexec_should_crash(current)) crash_kexec(regs); diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h index 498fbc1fc179..58b5d6149a42 100644 --- a/include/asm-x86_64/smp.h +++ b/include/asm-x86_64/smp.h @@ -66,7 +66,6 @@ static inline int hard_smp_processor_id(void) return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID)); } -extern int safe_smp_processor_id(void); extern int __cpu_disable(void); extern void __cpu_die(unsigned int cpu); extern void prefill_possible_map(void); @@ -100,7 +99,6 @@ static inline int cpu_present_to_apicid(int mps_cpu) #ifndef CONFIG_SMP #define stack_smp_processor_id() 0 -#define safe_smp_processor_id() 0 #define cpu_logical_map(x) (x) #else #include -- cgit v1.2.1 From d3cf7f061521c78ad62e275eb6fbdc8f43fc75a7 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:37 +0200 Subject: [PATCH] Remove bogus warning from early_ioremap It is correct for its only caller right now, but not for possible future others. Signed-off-by: Andi Kleen --- arch/x86_64/mm/init.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index 984155b75e4c..53b49c86d7e0 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -229,7 +229,6 @@ __init void *early_ioremap(unsigned long addr, unsigned long size) /* actually usually some more */ if (size >= LARGE_PAGE_SIZE) { - printk("SMBIOS area too long %lu\n", size); return NULL; } set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE)); -- cgit v1.2.1 From df992848f5aa803fcacd2c5e7d67034bb89e3fa3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:37 +0200 Subject: [PATCH] Fix pte_exec/mkexec and use it in change_page_attr() Fix the pte_exec/mkexec page table accessor functions to really use the NX bit. Previously they only checked the USER bit, but weren't actually used for anything. Then use them in change_page_attr() to manipulate the NX bit properly. Signed-off-by: Andi Kleen --- arch/x86_64/mm/pageattr.c | 8 +++++--- include/asm-x86_64/pgtable.h | 4 ++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c index 2685b1f3671c..01591b649f4e 100644 --- a/arch/x86_64/mm/pageattr.c +++ b/arch/x86_64/mm/pageattr.c @@ -190,10 +190,12 @@ int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot) * lowmem */ if (__pa(address) < KERNEL_TEXT_SIZE) { unsigned long addr2; - pgprot_t prot2 = prot; + pgprot_t prot2; addr2 = __START_KERNEL_map + __pa(address); - pgprot_val(prot2) &= ~_PAGE_NX; - err = __change_page_attr(addr2, pfn, prot2, PAGE_KERNEL_EXEC); + /* Make sure the kernel mappings stay executable */ + prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot))); + err = __change_page_attr(addr2, pfn, prot2, + PAGE_KERNEL_EXEC); } } up_write(&init_mm.mmap_sem); diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h index d79e7441b513..f7614670c655 100644 --- a/include/asm-x86_64/pgtable.h +++ b/include/asm-x86_64/pgtable.h @@ -265,7 +265,7 @@ static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) #define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT) static inline int pte_user(pte_t pte) { return pte_val(pte) & _PAGE_USER; } static inline int pte_read(pte_t pte) { return pte_val(pte) & _PAGE_USER; } -static inline int pte_exec(pte_t pte) { return pte_val(pte) & _PAGE_USER; } +static inline int pte_exec(pte_t pte) { return !(pte_val(pte) & _PAGE_NX); } static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; } static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; } static inline int pte_write(pte_t pte) { return pte_val(pte) & _PAGE_RW; } @@ -278,7 +278,7 @@ static inline pte_t pte_mkclean(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & static inline pte_t pte_mkold(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_ACCESSED)); return pte; } static inline pte_t pte_wrprotect(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_RW)); return pte; } static inline pte_t pte_mkread(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_USER)); return pte; } -static inline pte_t pte_mkexec(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_USER)); return pte; } +static inline pte_t pte_mkexec(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_NX)); return pte; } static inline pte_t pte_mkdirty(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_DIRTY)); return pte; } static inline pte_t pte_mkyoung(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_ACCESSED)); return pte; } static inline pte_t pte_mkwrite(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_RW)); return pte; } -- cgit v1.2.1 From 5e6b0bfe5b452957b7be4b6ef181cd41880f8359 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:37 +0200 Subject: [PATCH] Use proper accessors to change PSE bits in change_page_attr() Use normal pte accessors in change_page_attr() to access the PSE bits. Signed-off-by: Andi Kleen --- arch/x86_64/mm/pageattr.c | 16 ++++++---------- include/asm-x86_64/pgtable.h | 1 + 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c index 01591b649f4e..3e231d762aaa 100644 --- a/arch/x86_64/mm/pageattr.c +++ b/arch/x86_64/mm/pageattr.c @@ -108,8 +108,8 @@ static void revert_page(unsigned long address, pgprot_t ref_prot) BUG_ON(pud_none(*pud)); pmd = pmd_offset(pud, address); BUG_ON(pmd_val(*pmd) & _PAGE_PSE); - pgprot_val(ref_prot) |= _PAGE_PSE; large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot); + large_pte = pte_mkhuge(large_pte); set_pte((pte_t *)pmd, large_pte); } @@ -119,32 +119,28 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, { pte_t *kpte; struct page *kpte_page; - unsigned kpte_flags; pgprot_t ref_prot2; kpte = lookup_address(address); if (!kpte) return 0; kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK); - kpte_flags = pte_val(*kpte); if (pgprot_val(prot) != pgprot_val(ref_prot)) { - if ((kpte_flags & _PAGE_PSE) == 0) { + if (!pte_huge(*kpte)) { set_pte(kpte, pfn_pte(pfn, prot)); } else { /* * split_large_page will take the reference for this * change_page_attr on the split page. */ - struct page *split; - ref_prot2 = __pgprot(pgprot_val(pte_pgprot(*lookup_address(address))) & ~(1<<_PAGE_BIT_PSE)); - + ref_prot2 = pte_pgprot(pte_clrhuge(*kpte)); split = split_large_page(address, prot, ref_prot2); if (!split) return -ENOMEM; - set_pte(kpte,mk_pte(split, ref_prot2)); + set_pte(kpte, mk_pte(split, ref_prot2)); kpte_page = split; - } + } page_private(kpte_page)++; - } else if ((kpte_flags & _PAGE_PSE) == 0) { + } else if (!pte_huge(*kpte)) { set_pte(kpte, pfn_pte(pfn, ref_prot)); BUG_ON(page_private(kpte_page) == 0); page_private(kpte_page)--; diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h index f7614670c655..b34f43acdef1 100644 --- a/include/asm-x86_64/pgtable.h +++ b/include/asm-x86_64/pgtable.h @@ -283,6 +283,7 @@ static inline pte_t pte_mkdirty(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | static inline pte_t pte_mkyoung(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_ACCESSED)); return pte; } static inline pte_t pte_mkwrite(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_RW)); return pte; } static inline pte_t pte_mkhuge(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_PSE)); return pte; } +static inline pte_t pte_clrhuge(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_PSE)); return pte; } struct vm_area_struct; -- cgit v1.2.1 From f2c2cca3acef8b253a36381d9b469ad4fb08563a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:37 +0200 Subject: [PATCH] Remove APIC version/cpu capability mpparse checking/printing ACPI went to great trouble to get the APIC version and CPU capabilities of different CPUs before passing them to the mpparser. But all that data was used was to print it out. Actually it even faked some data based on the boot cpu, not on the actual CPU being booted. Remove all this code because it's not needed. Cc: len.brown@intel.com Signed-off-by: Andi Kleen --- arch/x86_64/kernel/io_apic.c | 13 ----------- arch/x86_64/kernel/mpparse.c | 52 +++++++++++++++----------------------------- 2 files changed, 17 insertions(+), 48 deletions(-) diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index afac3dbb3729..0491019d4c8d 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -1748,19 +1748,6 @@ device_initcall(ioapic_init_sysfs); #define IO_APIC_MAX_ID 0xFE -int __init io_apic_get_version (int ioapic) -{ - union IO_APIC_reg_01 reg_01; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return reg_01.bits.version; -} - - int __init io_apic_get_redir_entries (int ioapic) { union IO_APIC_reg_01 reg_01; diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index d63f849aea2c..32b6977f80c0 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -41,7 +41,6 @@ int acpi_found_madt; * Various Linux-internal data structures created from the * MP-table. */ -unsigned char apic_version [MAX_APICS]; DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; @@ -94,24 +93,21 @@ static int __init mpf_checksum(unsigned char *mp, int len) static void __cpuinit MP_processor_info (struct mpc_config_processor *m) { int cpu; - unsigned char ver; cpumask_t tmp_map; + char *bootup_cpu = ""; if (!(m->mpc_cpuflag & CPU_ENABLED)) { disabled_cpus++; return; } - printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n", - m->mpc_apicid, - (m->mpc_cpufeature & CPU_FAMILY_MASK)>>8, - (m->mpc_cpufeature & CPU_MODEL_MASK)>>4, - m->mpc_apicver); - if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { - Dprintk(" Bootup CPU\n"); + bootup_cpu = " (Bootup-CPU)"; boot_cpu_id = m->mpc_apicid; } + + printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu); + if (num_processors >= NR_CPUS) { printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." " Processor ignored.\n", NR_CPUS); @@ -129,17 +125,7 @@ static void __cpuinit MP_processor_info (struct mpc_config_processor *m) return; } #endif - ver = m->mpc_apicver; - physid_set(m->mpc_apicid, phys_cpu_present_map); - /* - * Validate version - */ - if (ver == 0x0) { - printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. (tell your hw vendor)\n", m->mpc_apicid); - ver = 0x10; - } - apic_version[m->mpc_apicid] = ver; if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { /* * bios_cpu_apicid is required to have processors listed @@ -179,8 +165,8 @@ static void __init MP_ioapic_info (struct mpc_config_ioapic *m) if (!(m->mpc_flags & MPC_APIC_USABLE)) return; - printk("I/O APIC #%d Version %d at 0x%X.\n", - m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); + printk("I/O APIC #%d at 0x%X.\n", + m->mpc_apicid, m->mpc_apicaddr); if (nr_ioapics >= MAX_IO_APICS) { printk(KERN_ERR "Max # of I/O APICs (%d) exceeded (found %d).\n", MAX_IO_APICS, nr_ioapics); @@ -413,13 +399,10 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) * 2 CPUs, numbered 0 & 1. */ processor.mpc_type = MP_PROCESSOR; - /* Either an integrated APIC or a discrete 82489DX. */ - processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; + processor.mpc_apicver = 0; processor.mpc_cpuflag = CPU_ENABLED; - processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | - (boot_cpu_data.x86_model << 4) | - boot_cpu_data.x86_mask; - processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; + processor.mpc_cpufeature = 0; + processor.mpc_featureflag = 0; processor.mpc_reserved[0] = 0; processor.mpc_reserved[1] = 0; for (i = 0; i < 2; i++) { @@ -448,7 +431,7 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) ioapic.mpc_type = MP_IOAPIC; ioapic.mpc_apicid = 2; - ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; + ioapic.mpc_apicver = 0; ioapic.mpc_flags = MPC_APIC_USABLE; ioapic.mpc_apicaddr = 0xFEC00000; MP_ioapic_info(&ioapic); @@ -640,12 +623,11 @@ void __cpuinit mp_register_lapic (u8 id, u8 enabled) processor.mpc_type = MP_PROCESSOR; processor.mpc_apicid = id; - processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR)); + processor.mpc_apicver = 0; processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0); processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0); - processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | - (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; - processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; + processor.mpc_cpufeature = 0; + processor.mpc_featureflag = 0; processor.mpc_reserved[0] = 0; processor.mpc_reserved[1] = 0; @@ -700,7 +682,7 @@ void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base) set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); mp_ioapics[idx].mpc_apicid = id; - mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); + mp_ioapics[idx].mpc_apicver = 0; /* * Build basic IRQ lookup table to facilitate gsi->io_apic lookups @@ -711,9 +693,9 @@ void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base) mp_ioapic_routing[idx].gsi_end = gsi_base + io_apic_get_redir_entries(idx); - printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " + printk(KERN_INFO "IOAPIC[%d]: apic_id %d, address 0x%x, " "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, - mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, + mp_ioapics[idx].mpc_apicaddr, mp_ioapic_routing[idx].gsi_start, mp_ioapic_routing[idx].gsi_end); } -- cgit v1.2.1 From e4251e130deef9de5226cc36faa70a1c6671d3c5 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:37 +0200 Subject: [PATCH] Remove some cruft in apic id checking during processor setup - Remove a define that was used only once - Remove the too large APIC ID check because we always support the full 8bit range of APICs. - Restructure code a bit to be simpler. Cc: len.brown@intel.com Signed-off-by: Andi Kleen --- arch/x86_64/kernel/mpparse.c | 18 +----------------- include/asm-x86_64/acpi.h | 2 -- 2 files changed, 1 insertion(+), 19 deletions(-) diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 32b6977f80c0..02eef8f6415f 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -100,7 +100,6 @@ static void __cpuinit MP_processor_info (struct mpc_config_processor *m) disabled_cpus++; return; } - if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { bootup_cpu = " (Bootup-CPU)"; boot_cpu_id = m->mpc_apicid; @@ -118,13 +117,6 @@ static void __cpuinit MP_processor_info (struct mpc_config_processor *m) cpus_complement(tmp_map, cpu_present_map); cpu = first_cpu(tmp_map); -#if MAX_APICS < 255 - if ((int)m->mpc_apicid > MAX_APICS) { - printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", - m->mpc_apicid, MAX_APICS); - return; - } -#endif physid_set(m->mpc_apicid, phys_cpu_present_map); if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { /* @@ -603,8 +595,6 @@ void __init mp_register_lapic_address(u64 address) set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); if (boot_cpu_id == -1U) boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); - - Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid); } void __cpuinit mp_register_lapic (u8 id, u8 enabled) @@ -612,13 +602,7 @@ void __cpuinit mp_register_lapic (u8 id, u8 enabled) struct mpc_config_processor processor; int boot_cpu = 0; - if (id >= MAX_APICS) { - printk(KERN_WARNING "Processor #%d invalid (max %d)\n", - id, MAX_APICS); - return; - } - - if (id == boot_cpu_physical_apicid) + if (id == boot_cpu_id) boot_cpu = 1; processor.mpc_type = MP_PROCESSOR; diff --git a/include/asm-x86_64/acpi.h b/include/asm-x86_64/acpi.h index 2c95a319c056..ed59aa4c6ff9 100644 --- a/include/asm-x86_64/acpi.h +++ b/include/asm-x86_64/acpi.h @@ -155,8 +155,6 @@ extern void acpi_reserve_bootmem(void); #endif /*CONFIG_ACPI_SLEEP*/ -#define boot_cpu_physical_apicid boot_cpu_id - extern int acpi_disabled; extern int acpi_pci_disabled; -- cgit v1.2.1 From aecc63615e15de861db7436c50dade495639132c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:37 +0200 Subject: [PATCH] Fix coding style and output of the mptable parser Give the printks a consistent prefix. Add some missing white space. Cc: len.brown@intel.com Signed-off-by: Andi Kleen --- arch/x86_64/kernel/mpparse.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 02eef8f6415f..20e88f4b564b 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -205,7 +205,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc) unsigned char *mpt=((unsigned char *)mpc)+count; if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) { - printk("SMP mptable: bad signature [%c%c%c%c]!\n", + printk("MPTABLE: bad signature [%c%c%c%c]!\n", mpc->mpc_signature[0], mpc->mpc_signature[1], mpc->mpc_signature[2], @@ -213,31 +213,31 @@ static int __init smp_read_mpc(struct mp_config_table *mpc) return 0; } if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) { - printk("SMP mptable: checksum error!\n"); + printk("MPTABLE: checksum error!\n"); return 0; } if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) { - printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n", + printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n", mpc->mpc_spec); return 0; } if (!mpc->mpc_lapic) { - printk(KERN_ERR "SMP mptable: null local APIC address!\n"); + printk(KERN_ERR "MPTABLE: null local APIC address!\n"); return 0; } memcpy(str,mpc->mpc_oem,8); - str[8]=0; - printk(KERN_INFO "OEM ID: %s ",str); + str[8] = 0; + printk(KERN_INFO "MPTABLE: OEM ID: %s ",str); memcpy(str,mpc->mpc_productid,12); - str[12]=0; - printk("Product ID: %s ",str); + str[12] = 0; + printk("MPTABLE: Product ID: %s ",str); - printk("APIC at: 0x%X\n",mpc->mpc_lapic); + printk("MPTABLE: APIC at: 0x%X\n",mpc->mpc_lapic); /* save the local APIC address, it might be non-default */ if (!acpi_lapic) - mp_lapic_addr = mpc->mpc_lapic; + mp_lapic_addr = mpc->mpc_lapic; /* * Now process the configuration blocks. @@ -249,7 +249,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc) struct mpc_config_processor *m= (struct mpc_config_processor *)mpt; if (!acpi_lapic) - MP_processor_info(m); + MP_processor_info(m); mpt += sizeof(*m); count += sizeof(*m); break; @@ -268,8 +268,8 @@ static int __init smp_read_mpc(struct mp_config_table *mpc) struct mpc_config_ioapic *m= (struct mpc_config_ioapic *)mpt; MP_ioapic_info(m); - mpt+=sizeof(*m); - count+=sizeof(*m); + mpt += sizeof(*m); + count += sizeof(*m); break; } case MP_INTSRC: @@ -278,8 +278,8 @@ static int __init smp_read_mpc(struct mp_config_table *mpc) (struct mpc_config_intsrc *)mpt; MP_intsrc_info(m); - mpt+=sizeof(*m); - count+=sizeof(*m); + mpt += sizeof(*m); + count += sizeof(*m); break; } case MP_LINTSRC: @@ -287,15 +287,15 @@ static int __init smp_read_mpc(struct mp_config_table *mpc) struct mpc_config_lintsrc *m= (struct mpc_config_lintsrc *)mpt; MP_lintsrc_info(m); - mpt+=sizeof(*m); - count+=sizeof(*m); + mpt += sizeof(*m); + count += sizeof(*m); break; } } } clustered_apic_check(); if (!num_processors) - printk(KERN_ERR "SMP mptable: no processors registered!\n"); + printk(KERN_ERR "MPTABLE: no processors registered!\n"); return num_processors; } -- cgit v1.2.1 From 7a0a2dff1cac1df82acfa0395bc9bc1bf0bc16ef Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:37 +0200 Subject: [PATCH] Add a missing check for irq flags tracing in NMI NMIs are not supposed to track the irq flags, but TRACE_IRQS_IRETQ did it anyways. Add a check. Cc: mingo@elte.hu Signed-off-by: Andi Kleen --- arch/x86_64/kernel/entry.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 780f9b26169f..4fcc0ad8bbeb 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -773,7 +773,9 @@ paranoid_exit\trace: testl $3,CS(%rsp) jnz paranoid_userspace\trace paranoid_swapgs\trace: + .if \trace TRACE_IRQS_IRETQ 0 + .endif swapgs paranoid_restore\trace: RESTORE_ALL 8 -- cgit v1.2.1 From b3698c03eb6d4581e879d6bb0f183ed8dda96d37 Mon Sep 17 00:00:00 2001 From: Paolo 'Blaisorblade' Giarrusso Date: Tue, 26 Sep 2006 10:52:37 +0200 Subject: [PATCH] Fix boot code head.S warning When compiling a 64-bit kernel on an Ubuntu 6.06 32bit system (whose GCC is also a cross-compiler for x86_64) I've seen that head.o is compiled as a 64-bit file (while it should not) and ld complaining about this during linking: [AK: it happens on all systems with new binutils] ld: warning: i386:x86-64 architecture of input file `arch/x86_64/boot/compressed/head.o' is incompatible with i386 output I've verified that removing -m64 from compilation flags to turn "-m64 -traditional -m32" into "-traditional -m32" fixes the issue. Signed-off-by: Paolo 'Blaisorblade' Giarrusso Signed-off-by: Andi Kleen --- arch/x86_64/boot/compressed/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86_64/boot/compressed/Makefile b/arch/x86_64/boot/compressed/Makefile index f89d96f11a9f..e70fa6e1da08 100644 --- a/arch/x86_64/boot/compressed/Makefile +++ b/arch/x86_64/boot/compressed/Makefile @@ -7,7 +7,8 @@ # targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o -EXTRA_AFLAGS := -traditional -m32 +EXTRA_AFLAGS := -traditional +AFLAGS := $(subst -m64,-m32,$(AFLAGS)) # cannot use EXTRA_CFLAGS because base CFLAGS contains -mkernel which conflicts with # -m32 -- cgit v1.2.1 From 8380aabb99719af583447133f19a4d8074b5c337 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:37 +0200 Subject: [PATCH] Remove non e820 fallbacks in high level code Drop support for non e820 BIOS calls to get the memory map. The boot assembler code still has some support, but not the C code now. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/e820.c | 28 +++++++++------------------- include/asm-x86_64/e820.h | 4 ---- 2 files changed, 9 insertions(+), 23 deletions(-) diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index e06c2714ecf3..5d1275a93658 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -536,10 +536,14 @@ static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) return 0; } -void __init setup_memory_region(void) +void early_panic(char *msg) { - char *who = "BIOS-e820"; + early_printk(msg); + panic(msg); +} +void __init setup_memory_region(void) +{ /* * Try to copy the BIOS-supplied E820-map. * @@ -547,24 +551,10 @@ void __init setup_memory_region(void) * the next section from 1mb->appropriate_mem_k */ sanitize_e820_map(E820_MAP, &E820_MAP_NR); - if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) { - unsigned long mem_size; - - /* compare results from other methods and take the greater */ - if (ALT_MEM_K < EXT_MEM_K) { - mem_size = EXT_MEM_K; - who = "BIOS-88"; - } else { - mem_size = ALT_MEM_K; - who = "BIOS-e801"; - } - - e820.nr_map = 0; - add_memory_region(0, LOWMEMSIZE(), E820_RAM); - add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM); - } + if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) + early_panic("Cannot find a valid memory map"); printk(KERN_INFO "BIOS-provided physical RAM map:\n"); - e820_print_map(who); + e820_print_map("BIOS-e820"); } static int __init parse_memopt(char *p) diff --git a/include/asm-x86_64/e820.h b/include/asm-x86_64/e820.h index dba012151856..8dac397bf85e 100644 --- a/include/asm-x86_64/e820.h +++ b/include/asm-x86_64/e820.h @@ -22,10 +22,6 @@ #define E820_ACPI 3 /* usable as RAM once ACPI tables have been read */ #define E820_NVS 4 -#define HIGH_MEMORY (1024*1024) - -#define LOWMEMSIZE() (0x9f000) - #ifndef __ASSEMBLY__ struct e820entry { u64 addr; /* start of memory segment */ -- cgit v1.2.1 From 0136611c62e8650e354b95c76dff6d2ce6030eff Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:38 +0200 Subject: [PATCH] optimize hweight64 for x86_64 Based on patch from David Rientjes , but changed by AK. Optimizes the 64-bit hamming weight for x86_64 processors assuming they have fast multiplication. Uses five fewer bitops than the generic hweight64. Benchmark on one EMT64 showed ~25% speedup with 2^24 consecutive calls. Define a new ARCH_HAS_FAST_MULTIPLIER that can be set by other architectures that can also multiply fast. Signed-off-by: Andi Kleen --- include/asm-x86_64/bitops.h | 2 ++ lib/hweight.c | 10 ++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/include/asm-x86_64/bitops.h b/include/asm-x86_64/bitops.h index f7ba57b1cc08..5b535eaf5309 100644 --- a/include/asm-x86_64/bitops.h +++ b/include/asm-x86_64/bitops.h @@ -399,6 +399,8 @@ static __inline__ int fls(int x) return r+1; } +#define ARCH_HAS_FAST_MULTIPLIER 1 + #include #endif /* __KERNEL__ */ diff --git a/lib/hweight.c b/lib/hweight.c index 438257671708..360556a7803d 100644 --- a/lib/hweight.c +++ b/lib/hweight.c @@ -1,5 +1,6 @@ #include #include +#include /** * hweightN - returns the hamming weight of a N-bit word @@ -40,14 +41,19 @@ unsigned long hweight64(__u64 w) #if BITS_PER_LONG == 32 return hweight32((unsigned int)(w >> 32)) + hweight32((unsigned int)w); #elif BITS_PER_LONG == 64 +#ifdef ARCH_HAS_FAST_MULTIPLIER + w -= (w >> 1) & 0x5555555555555555ul; + w = (w & 0x3333333333333333ul) + ((w >> 2) & 0x3333333333333333ul); + w = (w + (w >> 4)) & 0x0f0f0f0f0f0f0f0ful; + return (w * 0x0101010101010101ul) >> 56; +#else __u64 res = w - ((w >> 1) & 0x5555555555555555ul); res = (res & 0x3333333333333333ul) + ((res >> 2) & 0x3333333333333333ul); res = (res + (res >> 4)) & 0x0F0F0F0F0F0F0F0Ful; res = res + (res >> 8); res = res + (res >> 16); return (res + (res >> 32)) & 0x00000000000000FFul; -#else -#error BITS_PER_LONG not defined +#endif #endif } EXPORT_SYMBOL(hweight64); -- cgit v1.2.1 From 26374c7b7dca1ff90607c83d9b82e917119f0456 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 26 Sep 2006 10:52:38 +0200 Subject: [PATCH] Reload CS when startup_64 is used. In long mode the %cs is largely a relic. However there are a few cases like iret where it matters that we have a valid value. Without this patch it is possible to enter the kernel in startup_64 without setting %cs to a valid value. With this patch we don't care what %cs value we enter the kernel with, so long as the cs shadow register indicates it is a privileged code segment. Thanks to Magnus Damm for finding this problem and posting the first workable patch. I have moved the jump to set %cs down a few instructions so we don't need to take an extra jump. Which keeps the code simpler. Signed-of-by: Eric W. Biederman Signed-off-by: Andi Kleen --- arch/x86_64/kernel/head.S | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 4a3326ce709c..1e6f80870679 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -185,12 +185,15 @@ startup_64: /* Finally jump to run C code and to be on real kernel address * Since we are running on identity-mapped space we have to jump - * to the full 64bit address , this is only possible as indirect - * jump + * to the full 64bit address, this is only possible as indirect + * jump. In addition we need to ensure %cs is set so we make this + * a far return. */ movq initial_code(%rip),%rax - pushq $0 # fake return address - jmp *%rax + pushq $0 # fake return address to stop unwinder + pushq $__KERNEL_CS # set correct cs + pushq %rax # target address in negative space + lretq /* SMP bootup changes these two */ .align 8 -- cgit v1.2.1 From f2a9e1dec27189a09ff642f2648e49ad9e76607b Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Tue, 26 Sep 2006 10:52:38 +0200 Subject: [PATCH] Put .note.* sections into a PT_NOTE segment This patch updates x86_64 linker script to pack any .note.* sections into a PT_NOTE segment in the output file. To do this, we tell ld that we need a PT_NOTE segment. This requires us to start explicitly mapping sections to segments, so we also need to explicitly create PT_LOAD segments for text and data, and map the sections to them appropriately. Fortunately, each section will default to its previous section's segment, so it doesn't take many changes to vmlinux.lds.S. The corresponding change is already made for i386 in -mm and I'd like this patch to join it. The section to segment mappings do change as do the segment flags so some time in -mm would be good for that reason as well, just in case. In particular .data and .bss move from the text segment to the data segment and .data.cacheline_aligned .data.read_mostly are put in the data segment instead of a separate one. I think that it would be possible to exactly match the existing section to segment mapping and flags but it would be a more intrusive change and I'm not sure there is a reason for the existing layout other than it is what you get by default if you don't explicitly specify something else. If there is a reason for the existing layout then I will of course make the more intrusive change. If there is no reason we could probably drop the executable or writable flags from some segments but I don't know how much attention is paid to them anyway so it might not be worth the effort. The vsyscall related sections need to go in a different segment to the normal data segment and so I invented a "user" segment to contain them. I believe this should appear to be another data segment as far as the kernel is concerned so the flags are setup accordingly. The notes will be used in the Xen paravirt_ops backend to provide additional information to the domain builder. I am in the process of converting the xen-unstable kernels and tools over to this scheme at the moment to support this in the future. It has been suggested to me that the notes segment should have flags 0 (i.e. not readable) since it is only used by the loader and is not used at runtime. For now I went with a readable segment since that is what the i386 patch uses. AK: dropped NOTES addition right now because the needed infrastructure for that is not merged yet Signed-off-by: Ian Campbell Signed-off-by: Andi Kleen --- arch/x86_64/kernel/vmlinux.lds.S | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index 8d5a5149bb3a..2802aa963fa3 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -13,6 +13,12 @@ OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") OUTPUT_ARCH(i386:x86-64) ENTRY(phys_startup_64) jiffies_64 = jiffies; +PHDRS { + text PT_LOAD FLAGS(5); /* R_E */ + data PT_LOAD FLAGS(7); /* RWE */ + user PT_LOAD FLAGS(7); /* RWE */ + note PT_NOTE FLAGS(4); /* R__ */ +} SECTIONS { . = __START_KERNEL; @@ -31,7 +37,7 @@ SECTIONS KPROBES_TEXT *(.fixup) *(.gnu.warning) - } = 0x9090 + } :text = 0x9090 /* out-of-line lock text */ .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) } @@ -57,7 +63,7 @@ SECTIONS .data : AT(ADDR(.data) - LOAD_OFFSET) { *(.data) CONSTRUCTORS - } + } :data _edata = .; /* End of data section */ @@ -89,7 +95,7 @@ SECTIONS #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) . = VSYSCALL_ADDR; - .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } + .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user __vsyscall_0 = VSYSCALL_VIRT_ADDR; . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); @@ -135,7 +141,7 @@ SECTIONS . = ALIGN(8192); /* init_task */ .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { *(.data.init_task) - } + } :data . = ALIGN(4096); .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { -- cgit v1.2.1 From 80d2679cbc8e170011c9649fb8fb684ffd7e5c8f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:38 +0200 Subject: [PATCH] x86: Remove incorrect comment about ACPI e820 entries They cannot be actually freed because the FACS table has a shared-with-the-BIOS lock. Signed-off-by: Andi Kleen --- include/asm-i386/e820.h | 2 +- include/asm-x86_64/e820.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/asm-i386/e820.h b/include/asm-i386/e820.h index ca82acb8cb1f..f7514fb6e8e4 100644 --- a/include/asm-i386/e820.h +++ b/include/asm-i386/e820.h @@ -18,7 +18,7 @@ #define E820_RAM 1 #define E820_RESERVED 2 -#define E820_ACPI 3 /* usable as RAM once ACPI tables have been read */ +#define E820_ACPI 3 #define E820_NVS 4 #define HIGH_MEMORY (1024*1024) diff --git a/include/asm-x86_64/e820.h b/include/asm-x86_64/e820.h index 8dac397bf85e..fd4198b49637 100644 --- a/include/asm-x86_64/e820.h +++ b/include/asm-x86_64/e820.h @@ -19,7 +19,7 @@ #define E820_RAM 1 #define E820_RESERVED 2 -#define E820_ACPI 3 /* usable as RAM once ACPI tables have been read */ +#define E820_ACPI 3 #define E820_NVS 4 #ifndef __ASSEMBLY__ -- cgit v1.2.1 From 53ee11ae0d73f28029a5f0d991bc4dcd7c817e7a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:38 +0200 Subject: [PATCH] Optimize PDA accesses slightly Based on a idea by Jeremy Fitzhardinge: Replace the volatiles and memory clobbers in the PDA access with telling gcc about access to a proxy PDA structure that doesn't actually exist. But the dummy accesses give a defined ordering for read/write accesses. Also add some memory barriers to the early GS initialization to make sure no PDA access is moved before it. Advantage is some .text savings (probably most from better code for accessing "current"): text data bss dec hex filename 4845647 1223688 615864 6685199 66020f vmlinux 4837780 1223688 615864 6677332 65e354 vmlinux-pda 1.2% smaller code Cc: Jeremy Fitzhardinge Signed-off-by: Andi Kleen --- arch/x86_64/kernel/setup64.c | 3 +++ include/asm-x86_64/pda.h | 41 ++++++++++++++++++++++------------------- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index e85cfbb49b63..491361752c70 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -121,7 +121,10 @@ void pda_init(int cpu) /* Setup up data that may be needed in __get_free_pages early */ asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); + /* Memory clobbers used to order PDA accessed */ + mb(); wrmsrl(MSR_GS_BASE, pda); + mb(); pda->cpunumber = cpu; pda->irqcount = -1; diff --git a/include/asm-x86_64/pda.h b/include/asm-x86_64/pda.h index b47c3df9ed1d..55e21da96e7a 100644 --- a/include/asm-x86_64/pda.h +++ b/include/asm-x86_64/pda.h @@ -36,40 +36,43 @@ extern struct x8664_pda boot_cpu_pda[]; * There is no fast way to get the base address of the PDA, all the accesses * have to mention %fs/%gs. So it needs to be done this Torvaldian way. */ -#define sizeof_field(type,field) (sizeof(((type *)0)->field)) -#define typeof_field(type,field) typeof(((type *)0)->field) - extern void __bad_pda_field(void); +/* proxy_pda doesn't actually exist, but tell gcc it is accessed + for all PDA accesses so it gets read/write dependencies right. */ +extern struct x8664_pda _proxy_pda; + #define pda_offset(field) offsetof(struct x8664_pda, field) #define pda_to_op(op,field,val) do { \ - typedef typeof_field(struct x8664_pda, field) T__; \ - switch (sizeof_field(struct x8664_pda, field)) { \ + typedef typeof(_proxy_pda.field) T__; \ + switch (sizeof(_proxy_pda.field)) { \ case 2: \ -asm volatile(op "w %0,%%gs:%P1"::"ri" ((T__)val),"i"(pda_offset(field)):"memory"); break; \ +asm(op "w %1,%%gs:%P2" : "+m" (_proxy_pda.field) : \ + "ri" ((T__)val),"i"(pda_offset(field))); break; \ case 4: \ -asm volatile(op "l %0,%%gs:%P1"::"ri" ((T__)val),"i"(pda_offset(field)):"memory"); break; \ +asm(op "l %1,%%gs:%P2" : "+m" (_proxy_pda.field) : \ + "ri" ((T__)val),"i"(pda_offset(field))); break; \ case 8: \ -asm volatile(op "q %0,%%gs:%P1"::"ri" ((T__)val),"i"(pda_offset(field)):"memory"); break; \ - default: __bad_pda_field(); \ +asm(op "q %1,%%gs:%P2": "+m" (_proxy_pda.field) : \ + "ri" ((T__)val),"i"(pda_offset(field))); break; \ +default: __bad_pda_field(); \ } \ } while (0) -/* - * AK: PDA read accesses should be neither volatile nor have an memory clobber. - * Unfortunately removing them causes all hell to break lose currently. - */ #define pda_from_op(op,field) ({ \ - typeof_field(struct x8664_pda, field) ret__; \ - switch (sizeof_field(struct x8664_pda, field)) { \ + typeof(_proxy_pda.field) ret__; \ + switch (sizeof(_proxy_pda.field)) { \ case 2: \ -asm volatile(op "w %%gs:%P1,%0":"=r" (ret__):"i"(pda_offset(field)):"memory"); break;\ +asm(op "w %%gs:%P1,%0":"=r" (ret__):\ + "i" (pda_offset(field)), "m" (_proxy_pda.field)); break;\ case 4: \ -asm volatile(op "l %%gs:%P1,%0":"=r" (ret__):"i"(pda_offset(field)):"memory"); break;\ +asm(op "l %%gs:%P1,%0":"=r" (ret__):\ + "i" (pda_offset(field)), "m" (_proxy_pda.field)); break;\ case 8: \ -asm volatile(op "q %%gs:%P1,%0":"=r" (ret__):"i"(pda_offset(field)):"memory"); break;\ - default: __bad_pda_field(); \ +asm(op "q %%gs:%P1,%0":"=r" (ret__):\ + "i" (pda_offset(field)), "m" (_proxy_pda.field)); break;\ +default: __bad_pda_field(); \ } \ ret__; }) -- cgit v1.2.1 From 575400d1b483fbe9e03c68758059bfaf4e4768d1 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 26 Sep 2006 10:52:38 +0200 Subject: [PATCH] i386: Fix the EDD code misparsing the command line The EDD code would scan the command line as a fixed array, without taking account of either whitespace, null-termination, the old command-line protocol, late overrides early, or the fact that the command line may not be reachable from INITSEG. This should fix those problems, and enable us to use a longer command line. Signed-off-by: H. Peter Anvin Signed-off-by: Andi Kleen --- arch/i386/boot/edd.S | 97 ++++++++++++++++++++++++++++++++++++++++------------ include/linux/edd.h | 1 + 2 files changed, 76 insertions(+), 22 deletions(-) diff --git a/arch/i386/boot/edd.S b/arch/i386/boot/edd.S index 4b84ea216f2b..34321368011a 100644 --- a/arch/i386/boot/edd.S +++ b/arch/i386/boot/edd.S @@ -15,42 +15,95 @@ #include #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) + +# It is assumed that %ds == INITSEG here + movb $0, (EDD_MBR_SIG_NR_BUF) movb $0, (EDDNR) -# Check the command line for two options: +# Check the command line for options: # edd=of disables EDD completely (edd=off) # edd=sk skips the MBR test (edd=skipmbr) +# edd=on re-enables EDD (edd=on) + pushl %esi - cmpl $0, %cs:cmd_line_ptr - jz done_cl + movw $edd_mbr_sig_start, %di # Default to edd=on + movl %cs:(cmd_line_ptr), %esi -# ds:esi has the pointer to the command line now - movl $(COMMAND_LINE_SIZE-7), %ecx -# loop through kernel command line one byte at a time -cl_loop: - cmpl $EDD_CL_EQUALS, (%si) + andl %esi, %esi + jz old_cl # Old boot protocol? + +# Convert to a real-mode pointer in fs:si + movl %esi, %eax + shrl $4, %eax + movw %ax, %fs + andw $0xf, %si + jmp have_cl_pointer + +# Old-style boot protocol? +old_cl: + push %ds # aka INITSEG + pop %fs + + cmpw $0xa33f, (0x20) + jne done_cl # No command line at all? + movw (0x22), %si # Pointer relative to INITSEG + +# fs:si has the pointer to the command line now +have_cl_pointer: + +# Loop through kernel command line one byte at a time. Just in +# case the loader is buggy and failed to null-terminate the command line +# terminate if we get close enough to the end of the segment that we +# cannot fit "edd=XX"... +cl_atspace: + cmpw $-5, %si # Watch for segment wraparound + jae done_cl + movl %fs:(%si), %eax + andb %al, %al # End of line? + jz done_cl + cmpl $EDD_CL_EQUALS, %eax jz found_edd_equals - incl %esi - loop cl_loop - jmp done_cl + cmpb $0x20, %al # <= space consider whitespace + ja cl_skipword + incw %si + jmp cl_atspace + +cl_skipword: + cmpw $-5, %si # Watch for segment wraparound + jae done_cl + movb %fs:(%si), %al # End of string? + andb %al, %al + jz done_cl + cmpb $0x20, %al + jbe cl_atspace + incw %si + jmp cl_skipword + found_edd_equals: # only looking at first two characters after equals - addl $4, %esi - cmpw $EDD_CL_OFF, (%si) # edd=of - jz do_edd_off - cmpw $EDD_CL_SKIP, (%si) # edd=sk - jz do_edd_skipmbr - jmp done_cl +# late overrides early on the command line, so keep going after finding something + movw %fs:4(%si), %ax + cmpw $EDD_CL_OFF, %ax # edd=of + je do_edd_off + cmpw $EDD_CL_SKIP, %ax # edd=sk + je do_edd_skipmbr + cmpw $EDD_CL_ON, %ax # edd=on + je do_edd_on + jmp cl_skipword do_edd_skipmbr: - popl %esi - jmp edd_start + movw $edd_start, %di + jmp cl_skipword do_edd_off: - popl %esi - jmp edd_done + movw $edd_done, %di + jmp cl_skipword +do_edd_on: + movw $edd_mbr_sig_start, %di + jmp cl_skipword + done_cl: popl %esi - + jmpw *%di # Read the first sector of each BIOS disk device and store the 4-byte signature edd_mbr_sig_start: diff --git a/include/linux/edd.h b/include/linux/edd.h index 162512b886f7..b2b3e68aa512 100644 --- a/include/linux/edd.h +++ b/include/linux/edd.h @@ -52,6 +52,7 @@ #define EDD_CL_EQUALS 0x3d646465 /* "edd=" */ #define EDD_CL_OFF 0x666f /* "of" for off */ #define EDD_CL_SKIP 0x6b73 /* "sk" for skipmbr */ +#define EDD_CL_ON 0x6e6f /* "on" for on */ #ifndef __ASSEMBLY__ -- cgit v1.2.1 From f574164491d00d28b727d713685fb5edc9138200 Mon Sep 17 00:00:00 2001 From: Keith Owens Date: Tue, 26 Sep 2006 10:52:38 +0200 Subject: [PATCH] Remove most of the special cases for the debug IST stack Remove most of the special cases for the debug IST stack. This is a follow on clean up patch, it requires the bug fix patch that adds orig_ist. Signed-off-by: Keith Owens Signed-off-by: Andi Kleen --- arch/x86_64/kernel/setup64.c | 21 +++++---------------- arch/x86_64/kernel/traps.c | 21 +-------------------- include/asm-x86_64/pda.h | 3 --- 3 files changed, 6 insertions(+), 39 deletions(-) diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index 491361752c70..9332d2361e08 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -237,28 +237,17 @@ void __cpuinit cpu_init (void) * set up and load the per-CPU TSS */ for (v = 0; v < N_EXCEPTION_STACKS; v++) { + static const unsigned int order[N_EXCEPTION_STACKS] = { + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, + [DEBUG_STACK - 1] = DEBUG_STACK_ORDER + }; if (cpu) { - static const unsigned int order[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, - [DEBUG_STACK - 1] = DEBUG_STACK_ORDER - }; - estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); if (!estacks) panic("Cannot allocate exception stack %ld %d\n", v, cpu); } - switch (v + 1) { -#if DEBUG_STKSZ > EXCEPTION_STKSZ - case DEBUG_STACK: - cpu_pda(cpu)->debugstack = (unsigned long)estacks; - estacks += DEBUG_STKSZ; - break; -#endif - default: - estacks += EXCEPTION_STKSZ; - break; - } + estacks += PAGE_SIZE << order[v]; orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks; } diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 38bc821e457b..fb8486eca1b6 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -162,26 +162,7 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, * 'stack' is in one of them: */ for (k = 0; k < N_EXCEPTION_STACKS; k++) { - unsigned long end; - - /* - * set 'end' to the end of the exception stack. - */ - switch (k + 1) { - /* - * TODO: this block is not needed i think, because - * setup64.c:cpu_init() sets up t->ist[DEBUG_STACK] - * properly too. - */ -#if DEBUG_STKSZ > EXCEPTION_STKSZ - case DEBUG_STACK: - end = cpu_pda(cpu)->debugstack + DEBUG_STKSZ; - break; -#endif - default: - end = per_cpu(orig_ist, cpu).ist[k]; - break; - } + unsigned long end = per_cpu(orig_ist, cpu).ist[k]; /* * Is 'stack' above this exception frame's end? * If yes then skip to the next frame. diff --git a/include/asm-x86_64/pda.h b/include/asm-x86_64/pda.h index 55e21da96e7a..e2b23e337b94 100644 --- a/include/asm-x86_64/pda.h +++ b/include/asm-x86_64/pda.h @@ -13,9 +13,6 @@ struct x8664_pda { unsigned long data_offset; /* Per cpu data offset from linker address */ unsigned long kernelstack; /* top of kernel stack for current */ unsigned long oldrsp; /* user rsp for system call */ -#if DEBUG_STKSZ > EXCEPTION_STKSZ - unsigned long debugstack; /* #DB/#BP stack. */ -#endif int irqcount; /* Irq nesting counter. Starts with -1 */ int cpunumber; /* Logical CPU number */ char *irqstackptr; /* top of irqstack */ -- cgit v1.2.1 From 4bfaaef01a1badb9e8ffb0c0a37cd2379008d21f Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 26 Sep 2006 10:52:38 +0200 Subject: [PATCH] Avoid overwriting the current pgd (V4, x86_64) kexec: Avoid overwriting the current pgd (V4, x86_64) This patch upgrades the x86_64-specific kexec code to avoid overwriting the current pgd. Overwriting the current pgd is bad when CONFIG_CRASH_DUMP is used to start a secondary kernel that dumps the memory of the previous kernel. The code introduces a new set of page tables. These tables are used to provide an executable identity mapping without overwriting the current pgd. Signed-off-by: Magnus Damm Signed-off-by: Andi Kleen --- arch/x86_64/kernel/machine_kexec.c | 71 ++++++++------- arch/x86_64/kernel/relocate_kernel.S | 171 +++++++++++++++++++++++++++++++---- include/asm-x86_64/kexec.h | 29 ++++++ 3 files changed, 218 insertions(+), 53 deletions(-) diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c index 2e94c072d84a..0497e3bd5bff 100644 --- a/arch/x86_64/kernel/machine_kexec.c +++ b/arch/x86_64/kernel/machine_kexec.c @@ -15,6 +15,15 @@ #include #include +#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) +static u64 kexec_pgd[512] PAGE_ALIGNED; +static u64 kexec_pud0[512] PAGE_ALIGNED; +static u64 kexec_pmd0[512] PAGE_ALIGNED; +static u64 kexec_pte0[512] PAGE_ALIGNED; +static u64 kexec_pud1[512] PAGE_ALIGNED; +static u64 kexec_pmd1[512] PAGE_ALIGNED; +static u64 kexec_pte1[512] PAGE_ALIGNED; + static void init_level2_page(pmd_t *level2p, unsigned long addr) { unsigned long end_addr; @@ -144,32 +153,19 @@ static void load_segments(void) ); } -typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page, - unsigned long control_code_buffer, - unsigned long start_address, - unsigned long pgtable) ATTRIB_NORET; - -extern const unsigned char relocate_new_kernel[]; -extern const unsigned long relocate_new_kernel_size; - int machine_kexec_prepare(struct kimage *image) { - unsigned long start_pgtable, control_code_buffer; + unsigned long start_pgtable; int result; /* Calculate the offsets */ start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; - control_code_buffer = start_pgtable + PAGE_SIZE; /* Setup the identity mapped 64bit page table */ result = init_pgtable(image, start_pgtable); if (result) return result; - /* Place the code in the reboot code buffer */ - memcpy(__va(control_code_buffer), relocate_new_kernel, - relocate_new_kernel_size); - return 0; } @@ -184,28 +180,34 @@ void machine_kexec_cleanup(struct kimage *image) */ NORET_TYPE void machine_kexec(struct kimage *image) { - unsigned long page_list; - unsigned long control_code_buffer; - unsigned long start_pgtable; - relocate_new_kernel_t rnk; + unsigned long page_list[PAGES_NR]; + void *control_page; /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); - /* Calculate the offsets */ - page_list = image->head; - start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; - control_code_buffer = start_pgtable + PAGE_SIZE; - - /* Set the low half of the page table to my identity mapped - * page table for kexec. Leave the high half pointing at the - * kernel pages. Don't bother to flush the global pages - * as that will happen when I fully switch to my identity mapped - * page table anyway. - */ - memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2); - __flush_tlb(); - + control_page = page_address(image->control_code_page) + PAGE_SIZE; + memcpy(control_page, relocate_kernel, PAGE_SIZE); + + page_list[PA_CONTROL_PAGE] = __pa(control_page); + page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; + page_list[PA_PGD] = __pa(kexec_pgd); + page_list[VA_PGD] = (unsigned long)kexec_pgd; + page_list[PA_PUD_0] = __pa(kexec_pud0); + page_list[VA_PUD_0] = (unsigned long)kexec_pud0; + page_list[PA_PMD_0] = __pa(kexec_pmd0); + page_list[VA_PMD_0] = (unsigned long)kexec_pmd0; + page_list[PA_PTE_0] = __pa(kexec_pte0); + page_list[VA_PTE_0] = (unsigned long)kexec_pte0; + page_list[PA_PUD_1] = __pa(kexec_pud1); + page_list[VA_PUD_1] = (unsigned long)kexec_pud1; + page_list[PA_PMD_1] = __pa(kexec_pmd1); + page_list[VA_PMD_1] = (unsigned long)kexec_pmd1; + page_list[PA_PTE_1] = __pa(kexec_pte1); + page_list[VA_PTE_1] = (unsigned long)kexec_pte1; + + page_list[PA_TABLE_PAGE] = + (unsigned long)__pa(page_address(image->control_code_page)); /* The segment registers are funny things, they have both a * visible and an invisible part. Whenever the visible part is @@ -222,9 +224,10 @@ NORET_TYPE void machine_kexec(struct kimage *image) */ set_gdt(phys_to_virt(0),0); set_idt(phys_to_virt(0),0); + /* now call it */ - rnk = (relocate_new_kernel_t) control_code_buffer; - (*rnk)(page_list, control_code_buffer, image->start, start_pgtable); + relocate_kernel((unsigned long)image->head, (unsigned long)page_list, + image->start); } /* crashkernel=size@addr specifies the location to reserve for diff --git a/arch/x86_64/kernel/relocate_kernel.S b/arch/x86_64/kernel/relocate_kernel.S index d24fa9b72a2b..14e95872c6a3 100644 --- a/arch/x86_64/kernel/relocate_kernel.S +++ b/arch/x86_64/kernel/relocate_kernel.S @@ -7,31 +7,169 @@ */ #include +#include +#include - /* - * Must be relocatable PIC code callable as a C function, that once - * it starts can not use the previous processes stack. - */ - .globl relocate_new_kernel +/* + * Must be relocatable PIC code callable as a C function + */ + +#define PTR(x) (x << 3) +#define PAGE_ALIGNED (1 << PAGE_SHIFT) +#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */ + + .text + .align PAGE_ALIGNED .code64 + .globl relocate_kernel +relocate_kernel: + /* %rdi indirection_page + * %rsi page_list + * %rdx start address + */ + + /* map the control page at its virtual address */ + + movq $0x0000ff8000000000, %r10 /* mask */ + mov $(39 - 3), %cl /* bits to shift */ + movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 /* address to map */ + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PGD)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_PUD_0)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + shrq $9, %r10 + sub $9, %cl + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PUD_0)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_PMD_0)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + shrq $9, %r10 + sub $9, %cl + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PMD_0)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_PTE_0)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + shrq $9, %r10 + sub $9, %cl + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PTE_0)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + /* identity map the control page at its physical address */ + + movq $0x0000ff8000000000, %r10 /* mask */ + mov $(39 - 3), %cl /* bits to shift */ + movq PTR(PA_CONTROL_PAGE)(%rsi), %r11 /* address to map */ + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PGD)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_PUD_1)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + shrq $9, %r10 + sub $9, %cl + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PUD_1)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_PMD_1)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + shrq $9, %r10 + sub $9, %cl + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PMD_1)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_PTE_1)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + shrq $9, %r10 + sub $9, %cl + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PTE_1)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + relocate_new_kernel: - /* %rdi page_list - * %rsi reboot_code_buffer + /* %rdi indirection_page + * %rsi page_list * %rdx start address - * %rcx page_table - * %r8 arg5 - * %r9 arg6 */ /* zero out flags, and disable interrupts */ pushq $0 popfq - /* set a new stack at the bottom of our page... */ - lea 4096(%rsi), %rsp + /* get physical address of control page now */ + /* this is impossible after page table switch */ + movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 + + /* get physical address of page table now too */ + movq PTR(PA_TABLE_PAGE)(%rsi), %rcx - /* store the parameters back on the stack */ - pushq %rdx /* store the start address */ + /* switch to new set of page tables */ + movq PTR(PA_PGD)(%rsi), %r9 + movq %r9, %cr3 + + /* setup a new stack at the end of the physical control page */ + lea 4096(%r8), %rsp + + /* jump to identity mapped page */ + addq $(identity_mapped - relocate_kernel), %r8 + pushq %r8 + ret + +identity_mapped: + /* store the start address on the stack */ + pushq %rdx /* Set cr0 to a known state: * 31 1 == Paging enabled @@ -136,8 +274,3 @@ relocate_new_kernel: xorq %r15, %r15 ret -relocate_new_kernel_end: - - .globl relocate_new_kernel_size -relocate_new_kernel_size: - .quad relocate_new_kernel_end - relocate_new_kernel diff --git a/include/asm-x86_64/kexec.h b/include/asm-x86_64/kexec.h index c564bae03433..5fab957e1091 100644 --- a/include/asm-x86_64/kexec.h +++ b/include/asm-x86_64/kexec.h @@ -1,6 +1,27 @@ #ifndef _X86_64_KEXEC_H #define _X86_64_KEXEC_H +#define PA_CONTROL_PAGE 0 +#define VA_CONTROL_PAGE 1 +#define PA_PGD 2 +#define VA_PGD 3 +#define PA_PUD_0 4 +#define VA_PUD_0 5 +#define PA_PMD_0 6 +#define VA_PMD_0 7 +#define PA_PTE_0 8 +#define VA_PTE_0 9 +#define PA_PUD_1 10 +#define VA_PUD_1 11 +#define PA_PMD_1 12 +#define VA_PMD_1 13 +#define PA_PTE_1 14 +#define VA_PTE_1 15 +#define PA_TABLE_PAGE 16 +#define PAGES_NR 17 + +#ifndef __ASSEMBLY__ + #include #include @@ -64,4 +85,12 @@ static inline void crash_setup_regs(struct pt_regs *newregs, newregs->rip = (unsigned long)current_text_addr(); } } + +NORET_TYPE void +relocate_kernel(unsigned long indirection_page, + unsigned long page_list, + unsigned long start_address) ATTRIB_NORET; + +#endif /* __ASSEMBLY__ */ + #endif /* _X86_64_KEXEC_H */ -- cgit v1.2.1 From 3566561bfadffcb5dbc85d576be80c0dbf2cccc9 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 26 Sep 2006 10:52:38 +0200 Subject: [PATCH] i386: Avoid overwriting the current pgd (V4, i386) kexec: Avoid overwriting the current pgd (V4, i386) This patch upgrades the i386-specific kexec code to avoid overwriting the current pgd. Overwriting the current pgd is bad when CONFIG_CRASH_DUMP is used to start a secondary kernel that dumps the memory of the previous kernel. The code introduces a new set of page tables. These tables are used to provide an executable identity mapping without overwriting the current pgd. Signed-off-by: Magnus Damm Signed-off-by: Andi Kleen --- arch/i386/kernel/machine_kexec.c | 117 +++++++-------------------- arch/i386/kernel/relocate_kernel.S | 162 +++++++++++++++++++++++++++++++++---- include/asm-i386/kexec.h | 27 +++++++ 3 files changed, 201 insertions(+), 105 deletions(-) diff --git a/arch/i386/kernel/machine_kexec.c b/arch/i386/kernel/machine_kexec.c index 66c3dc99a655..91966bafb3dc 100644 --- a/arch/i386/kernel/machine_kexec.c +++ b/arch/i386/kernel/machine_kexec.c @@ -21,70 +21,13 @@ #include #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) - -#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) -#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) -#define L2_ATTR (_PAGE_PRESENT) - -#define LEVEL0_SIZE (1UL << 12UL) - -#ifndef CONFIG_X86_PAE -#define LEVEL1_SIZE (1UL << 22UL) -static u32 pgtable_level1[1024] PAGE_ALIGNED; - -static void identity_map_page(unsigned long address) -{ - unsigned long level1_index, level2_index; - u32 *pgtable_level2; - - /* Find the current page table */ - pgtable_level2 = __va(read_cr3()); - - /* Find the indexes of the physical address to identity map */ - level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE; - level2_index = address / LEVEL1_SIZE; - - /* Identity map the page table entry */ - pgtable_level1[level1_index] = address | L0_ATTR; - pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR; - - /* Flush the tlb so the new mapping takes effect. - * Global tlb entries are not flushed but that is not an issue. - */ - load_cr3(pgtable_level2); -} - -#else -#define LEVEL1_SIZE (1UL << 21UL) -#define LEVEL2_SIZE (1UL << 30UL) -static u64 pgtable_level1[512] PAGE_ALIGNED; -static u64 pgtable_level2[512] PAGE_ALIGNED; - -static void identity_map_page(unsigned long address) -{ - unsigned long level1_index, level2_index, level3_index; - u64 *pgtable_level3; - - /* Find the current page table */ - pgtable_level3 = __va(read_cr3()); - - /* Find the indexes of the physical address to identity map */ - level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE; - level2_index = (address % LEVEL2_SIZE)/LEVEL1_SIZE; - level3_index = address / LEVEL2_SIZE; - - /* Identity map the page table entry */ - pgtable_level1[level1_index] = address | L0_ATTR; - pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR; - set_64bit(&pgtable_level3[level3_index], - __pa(pgtable_level2) | L2_ATTR); - - /* Flush the tlb so the new mapping takes effect. - * Global tlb entries are not flushed but that is not an issue. - */ - load_cr3(pgtable_level3); -} +static u32 kexec_pgd[1024] PAGE_ALIGNED; +#ifdef CONFIG_X86_PAE +static u32 kexec_pmd0[1024] PAGE_ALIGNED; +static u32 kexec_pmd1[1024] PAGE_ALIGNED; #endif +static u32 kexec_pte0[1024] PAGE_ALIGNED; +static u32 kexec_pte1[1024] PAGE_ALIGNED; static void set_idt(void *newidt, __u16 limit) { @@ -128,16 +71,6 @@ static void load_segments(void) #undef __STR } -typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)( - unsigned long indirection_page, - unsigned long reboot_code_buffer, - unsigned long start_address, - unsigned int has_pae) ATTRIB_NORET; - -extern const unsigned char relocate_new_kernel[]; -extern void relocate_new_kernel_end(void); -extern const unsigned int relocate_new_kernel_size; - /* * A architecture hook called to validate the * proposed image and prepare the control pages @@ -170,25 +103,29 @@ void machine_kexec_cleanup(struct kimage *image) */ NORET_TYPE void machine_kexec(struct kimage *image) { - unsigned long page_list; - unsigned long reboot_code_buffer; - - relocate_new_kernel_t rnk; + unsigned long page_list[PAGES_NR]; + void *control_page; /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); - /* Compute some offsets */ - reboot_code_buffer = page_to_pfn(image->control_code_page) - << PAGE_SHIFT; - page_list = image->head; - - /* Set up an identity mapping for the reboot_code_buffer */ - identity_map_page(reboot_code_buffer); - - /* copy it out */ - memcpy((void *)reboot_code_buffer, relocate_new_kernel, - relocate_new_kernel_size); + control_page = page_address(image->control_code_page); + memcpy(control_page, relocate_kernel, PAGE_SIZE); + + page_list[PA_CONTROL_PAGE] = __pa(control_page); + page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; + page_list[PA_PGD] = __pa(kexec_pgd); + page_list[VA_PGD] = (unsigned long)kexec_pgd; +#ifdef CONFIG_X86_PAE + page_list[PA_PMD_0] = __pa(kexec_pmd0); + page_list[VA_PMD_0] = (unsigned long)kexec_pmd0; + page_list[PA_PMD_1] = __pa(kexec_pmd1); + page_list[VA_PMD_1] = (unsigned long)kexec_pmd1; +#endif + page_list[PA_PTE_0] = __pa(kexec_pte0); + page_list[VA_PTE_0] = (unsigned long)kexec_pte0; + page_list[PA_PTE_1] = __pa(kexec_pte1); + page_list[VA_PTE_1] = (unsigned long)kexec_pte1; /* The segment registers are funny things, they have both a * visible and an invisible part. Whenever the visible part is @@ -207,8 +144,8 @@ NORET_TYPE void machine_kexec(struct kimage *image) set_idt(phys_to_virt(0),0); /* now call it */ - rnk = (relocate_new_kernel_t) reboot_code_buffer; - (*rnk)(page_list, reboot_code_buffer, image->start, cpu_has_pae); + relocate_kernel((unsigned long)image->head, (unsigned long)page_list, + image->start, cpu_has_pae); } /* crashkernel=size@addr specifies the location to reserve for diff --git a/arch/i386/kernel/relocate_kernel.S b/arch/i386/kernel/relocate_kernel.S index d312616effa1..f151d6fae462 100644 --- a/arch/i386/kernel/relocate_kernel.S +++ b/arch/i386/kernel/relocate_kernel.S @@ -7,16 +7,138 @@ */ #include +#include +#include + +/* + * Must be relocatable PIC code callable as a C function + */ + +#define PTR(x) (x << 2) +#define PAGE_ALIGNED (1 << PAGE_SHIFT) +#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */ +#define PAE_PGD_ATTR 0x01 /* _PAGE_PRESENT */ + + .text + .align PAGE_ALIGNED + .globl relocate_kernel +relocate_kernel: + movl 8(%esp), %ebp /* list of pages */ + +#ifdef CONFIG_X86_PAE + /* map the control page at its virtual address */ + + movl PTR(VA_PGD)(%ebp), %edi + movl PTR(VA_CONTROL_PAGE)(%ebp), %eax + andl $0xc0000000, %eax + shrl $27, %eax + addl %edi, %eax + + movl PTR(PA_PMD_0)(%ebp), %edx + orl $PAE_PGD_ATTR, %edx + movl %edx, (%eax) + + movl PTR(VA_PMD_0)(%ebp), %edi + movl PTR(VA_CONTROL_PAGE)(%ebp), %eax + andl $0x3fe00000, %eax + shrl $18, %eax + addl %edi, %eax + + movl PTR(PA_PTE_0)(%ebp), %edx + orl $PAGE_ATTR, %edx + movl %edx, (%eax) + + movl PTR(VA_PTE_0)(%ebp), %edi + movl PTR(VA_CONTROL_PAGE)(%ebp), %eax + andl $0x001ff000, %eax + shrl $9, %eax + addl %edi, %eax + + movl PTR(PA_CONTROL_PAGE)(%ebp), %edx + orl $PAGE_ATTR, %edx + movl %edx, (%eax) + + /* identity map the control page at its physical address */ + + movl PTR(VA_PGD)(%ebp), %edi + movl PTR(PA_CONTROL_PAGE)(%ebp), %eax + andl $0xc0000000, %eax + shrl $27, %eax + addl %edi, %eax + + movl PTR(PA_PMD_1)(%ebp), %edx + orl $PAE_PGD_ATTR, %edx + movl %edx, (%eax) + + movl PTR(VA_PMD_1)(%ebp), %edi + movl PTR(PA_CONTROL_PAGE)(%ebp), %eax + andl $0x3fe00000, %eax + shrl $18, %eax + addl %edi, %eax + + movl PTR(PA_PTE_1)(%ebp), %edx + orl $PAGE_ATTR, %edx + movl %edx, (%eax) + + movl PTR(VA_PTE_1)(%ebp), %edi + movl PTR(PA_CONTROL_PAGE)(%ebp), %eax + andl $0x001ff000, %eax + shrl $9, %eax + addl %edi, %eax + + movl PTR(PA_CONTROL_PAGE)(%ebp), %edx + orl $PAGE_ATTR, %edx + movl %edx, (%eax) +#else + /* map the control page at its virtual address */ + + movl PTR(VA_PGD)(%ebp), %edi + movl PTR(VA_CONTROL_PAGE)(%ebp), %eax + andl $0xffc00000, %eax + shrl $20, %eax + addl %edi, %eax + + movl PTR(PA_PTE_0)(%ebp), %edx + orl $PAGE_ATTR, %edx + movl %edx, (%eax) + + movl PTR(VA_PTE_0)(%ebp), %edi + movl PTR(VA_CONTROL_PAGE)(%ebp), %eax + andl $0x003ff000, %eax + shrl $10, %eax + addl %edi, %eax + + movl PTR(PA_CONTROL_PAGE)(%ebp), %edx + orl $PAGE_ATTR, %edx + movl %edx, (%eax) + + /* identity map the control page at its physical address */ + + movl PTR(VA_PGD)(%ebp), %edi + movl PTR(PA_CONTROL_PAGE)(%ebp), %eax + andl $0xffc00000, %eax + shrl $20, %eax + addl %edi, %eax + + movl PTR(PA_PTE_1)(%ebp), %edx + orl $PAGE_ATTR, %edx + movl %edx, (%eax) + + movl PTR(VA_PTE_1)(%ebp), %edi + movl PTR(PA_CONTROL_PAGE)(%ebp), %eax + andl $0x003ff000, %eax + shrl $10, %eax + addl %edi, %eax + + movl PTR(PA_CONTROL_PAGE)(%ebp), %edx + orl $PAGE_ATTR, %edx + movl %edx, (%eax) +#endif - /* - * Must be relocatable PIC code callable as a C function, that once - * it starts can not use the previous processes stack. - */ - .globl relocate_new_kernel relocate_new_kernel: /* read the arguments and say goodbye to the stack */ movl 4(%esp), %ebx /* page_list */ - movl 8(%esp), %ebp /* reboot_code_buffer */ + movl 8(%esp), %ebp /* list of pages */ movl 12(%esp), %edx /* start address */ movl 16(%esp), %ecx /* cpu_has_pae */ @@ -24,11 +146,26 @@ relocate_new_kernel: pushl $0 popfl - /* set a new stack at the bottom of our page... */ - lea 4096(%ebp), %esp + /* get physical address of control page now */ + /* this is impossible after page table switch */ + movl PTR(PA_CONTROL_PAGE)(%ebp), %edi - /* store the parameters back on the stack */ - pushl %edx /* store the start address */ + /* switch to new set of page tables */ + movl PTR(PA_PGD)(%ebp), %eax + movl %eax, %cr3 + + /* setup a new stack at the end of the physical control page */ + lea 4096(%edi), %esp + + /* jump to identity mapped page */ + movl %edi, %eax + addl $(identity_mapped - relocate_kernel), %eax + pushl %eax + ret + +identity_mapped: + /* store the start address on the stack */ + pushl %edx /* Set cr0 to a known state: * 31 0 == Paging disabled @@ -113,8 +250,3 @@ relocate_new_kernel: xorl %edi, %edi xorl %ebp, %ebp ret -relocate_new_kernel_end: - - .globl relocate_new_kernel_size -relocate_new_kernel_size: - .long relocate_new_kernel_end - relocate_new_kernel diff --git a/include/asm-i386/kexec.h b/include/asm-i386/kexec.h index 53f0e06672dc..4dfc9f5ed031 100644 --- a/include/asm-i386/kexec.h +++ b/include/asm-i386/kexec.h @@ -1,6 +1,26 @@ #ifndef _I386_KEXEC_H #define _I386_KEXEC_H +#define PA_CONTROL_PAGE 0 +#define VA_CONTROL_PAGE 1 +#define PA_PGD 2 +#define VA_PGD 3 +#define PA_PTE_0 4 +#define VA_PTE_0 5 +#define PA_PTE_1 6 +#define VA_PTE_1 7 +#ifdef CONFIG_X86_PAE +#define PA_PMD_0 8 +#define VA_PMD_0 9 +#define PA_PMD_1 10 +#define VA_PMD_1 11 +#define PAGES_NR 12 +#else +#define PAGES_NR 8 +#endif + +#ifndef __ASSEMBLY__ + #include #include #include @@ -72,5 +92,12 @@ static inline void crash_setup_regs(struct pt_regs *newregs, newregs->eip = (unsigned long)current_text_addr(); } } +asmlinkage NORET_TYPE void +relocate_kernel(unsigned long indirection_page, + unsigned long control_page, + unsigned long start_address, + unsigned int has_pae) ATTRIB_NORET; + +#endif /* __ASSEMBLY__ */ #endif /* _I386_KEXEC_H */ -- cgit v1.2.1 From e8c7391de4cd91b2cbb0c791f69ba1e066595848 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:38 +0200 Subject: [PATCH] Don't use kernel_text_address in oops context Because it can take spinlocks. Suggested by Mathieu Desnoyers Cc: Mathieu Desnoyers Signed-off-by: Andi Kleen --- arch/x86_64/kernel/traps.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index fb8486eca1b6..01f2a8d254c2 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -299,7 +299,9 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s #define HANDLE_STACK(cond) \ do while (cond) { \ unsigned long addr = *stack++; \ - if (kernel_text_address(addr)) { \ + if (oops_in_progress ? \ + __kernel_text_address(addr) : \ + kernel_text_address(addr)) { \ /* \ * If the address is either in the text segment of the \ * kernel, or in the region which contains vmalloc'ed \ -- cgit v1.2.1 From 027a51cef330433ba5924fc92fb25ef48bcfc1a3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:38 +0200 Subject: [PATCH] Document my tree in Documentation/HOWTO Signed-off-by: Andi Kleen --- Documentation/HOWTO | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/HOWTO b/Documentation/HOWTO index 915ae8c986c6..1d6560413cc5 100644 --- a/Documentation/HOWTO +++ b/Documentation/HOWTO @@ -358,7 +358,8 @@ Here is a list of some of the different kernel trees available: quilt trees: - USB, PCI, Driver Core, and I2C, Greg Kroah-Hartman kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/ - + - x86-64, partly i386, Andi Kleen + ftp.firstfloor.org:/pub/ak/x86_64/quilt/ Bug Reporting ------------- -- cgit v1.2.1 From 29a9af60e2120f874d0c600bf9e27617254a0488 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Tue, 26 Sep 2006 10:52:38 +0200 Subject: [PATCH] Add comments to the PDA structure to annotate offsets Change the comments in the pda structure to make the first fields to have their offset documented and to have the comments aligned. The stack protector series needs a field at offset 40 (gcc ABI); annotate upto 40 for that reason. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar Signed-off-by: Andi Kleen CC: Andi Kleen --- include/asm-x86_64/pda.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/include/asm-x86_64/pda.h b/include/asm-x86_64/pda.h index e2b23e337b94..6794ffaae433 100644 --- a/include/asm-x86_64/pda.h +++ b/include/asm-x86_64/pda.h @@ -9,13 +9,14 @@ /* Per processor datastructure. %gs points to it while the kernel runs */ struct x8664_pda { - struct task_struct *pcurrent; /* Current process */ - unsigned long data_offset; /* Per cpu data offset from linker address */ - unsigned long kernelstack; /* top of kernel stack for current */ - unsigned long oldrsp; /* user rsp for system call */ - int irqcount; /* Irq nesting counter. Starts with -1 */ - int cpunumber; /* Logical CPU number */ - char *irqstackptr; /* top of irqstack */ + struct task_struct *pcurrent; /* 0 Current process */ + unsigned long data_offset; /* 8 Per cpu data offset from linker + address */ + unsigned long kernelstack; /* 16 top of kernel stack for current */ + unsigned long oldrsp; /* 24 user rsp for system call */ + int irqcount; /* 32 Irq nesting counter. Starts with -1 */ + int cpunumber; /* 36 Logical CPU number */ + char *irqstackptr; /* 40 top of irqstack */ int nodenumber; /* number of current node */ unsigned int __softirq_pending; unsigned int __nmi_count; /* number of NMI on this CPUs */ -- cgit v1.2.1 From b62a5c740df1e3d49a97349fce0c6a23f633d7fe Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Tue, 26 Sep 2006 10:52:38 +0200 Subject: [PATCH] Add the Kconfig option for the stackprotector feature This patch adds the config options for -fstack-protector. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar Signed-off-by: Andi Kleen CC: Andi Kleen --- arch/x86_64/Kconfig | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 927b392a4aa0..0c61d0019dd7 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -533,6 +533,30 @@ config SECCOMP If unsure, say Y. Only embedded should say N here. +config CC_STACKPROTECTOR + bool "Enable -fstack-protector buffer overflow detection (EXPRIMENTAL)" + depends on EXPERIMENTAL + help + This option turns on the -fstack-protector GCC feature. This + feature puts, at the beginning of critical functions, a canary + value on the stack just before the return address, and validates + the value just before actually returning. Stack based buffer + overflows (that need to overwrite this return address) now also + overwrite the canary, which gets detected and the attack is then + neutralized via a kernel panic. + + This feature requires gcc version 4.2 or above, or a distribution + gcc with the feature backported. Older versions are automatically + detected and for those versions, this configuration option is ignored. + +config CC_STACKPROTECTOR_ALL + bool "Use stack-protector for all functions" + depends on CC_STACKPROTECTOR + help + Normally, GCC only inserts the canary value protection for + functions that use large-ish on-stack buffers. By enabling + this option, GCC will be asked to do this for ALL functions. + source kernel/Kconfig.hz config REORDER -- cgit v1.2.1 From 0a4254058037eb172758961d0a5b94f4320a1425 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Tue, 26 Sep 2006 10:52:38 +0200 Subject: [PATCH] Add the canary field to the PDA area and the task struct This patch adds the per thread cookie field to the task struct and the PDA. Also it makes sure that the PDA value gets the new cookie value at context switch, and that a new task gets a new cookie at task creation time. Signed-off-by: Arjan van Ven Signed-off-by: Ingo Molnar Signed-off-by: Andi Kleen CC: Andi Kleen --- arch/x86_64/kernel/process.c | 8 ++++++++ include/asm-x86_64/pda.h | 7 ++++++- include/linux/sched.h | 5 +++++ kernel/fork.c | 5 +++++ 4 files changed, 24 insertions(+), 1 deletion(-) diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index 9e9a70e50c72..fba8dfeda67c 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -625,6 +625,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) unlazy_fpu(prev_p); write_pda(kernelstack, task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); +#ifdef CONFIG_CC_STACKPROTECTOR + write_pda(stack_canary, next_p->stack_canary); + /* + * Build time only check to make sure the stack_canary is at + * offset 40 in the pda; this is a gcc ABI requirement + */ + BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40); +#endif /* * Now maybe reload the debug registers and handle I/O bitmaps diff --git a/include/asm-x86_64/pda.h b/include/asm-x86_64/pda.h index 6794ffaae433..e7773e0af865 100644 --- a/include/asm-x86_64/pda.h +++ b/include/asm-x86_64/pda.h @@ -16,7 +16,12 @@ struct x8664_pda { unsigned long oldrsp; /* 24 user rsp for system call */ int irqcount; /* 32 Irq nesting counter. Starts with -1 */ int cpunumber; /* 36 Logical CPU number */ - char *irqstackptr; /* 40 top of irqstack */ +#ifdef CONFIG_CC_STACKPROTECTOR + unsigned long stack_canary; /* 40 stack canary value */ + /* gcc-ABI: this canary MUST be at + offset 40!!! */ +#endif + char *irqstackptr; int nodenumber; /* number of current node */ unsigned int __softirq_pending; unsigned int __nmi_count; /* number of NMI on this CPUs */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 807556c5bcd2..9d4aa7f95bc8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -819,6 +819,11 @@ struct task_struct { unsigned did_exec:1; pid_t pid; pid_t tgid; + +#ifdef CONFIG_CC_STACKPROTECTOR + /* Canary value for the -fstack-protector gcc feature */ + unsigned long stack_canary; +#endif /* * pointers to (original) parent process, youngest child, younger sibling, * older sibling, respectively. (p->father can be replaced with diff --git a/kernel/fork.c b/kernel/fork.c index f9b014e3e700..a0dad84567c9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include @@ -175,6 +176,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) tsk->thread_info = ti; setup_thread_stack(tsk, orig); +#ifdef CONFIG_CC_STACKPROTECTOR + tsk->stack_canary = get_random_int(); +#endif + /* One for us, one for whoever does the "release_task()" (usually parent) */ atomic_set(&tsk->usage,2); atomic_set(&tsk->fs_excl, 0); -- cgit v1.2.1 From 3162f751d04086a9d006342de63ac8f44fe0f72a Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Tue, 26 Sep 2006 10:52:39 +0200 Subject: [PATCH] Add the __stack_chk_fail() function GCC emits a call to a __stack_chk_fail() function when the stack canary is not matching the expected value. Since this is a bad security issue; lets panic the kernel rather than limping along; the kernel really can't be trusted anymore when this happens. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar Signed-off-by: Andi Kleen CC: Andi Kleen --- kernel/panic.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/kernel/panic.c b/kernel/panic.c index d2db3e2209e0..6ceb664fb52a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -271,3 +271,15 @@ void oops_exit(void) { do_oops_enter_exit(); } + +#ifdef CONFIG_CC_STACKPROTECTOR +/* + * Called when gcc's -fstack-protector feature is used, and + * gcc detects corruption of the on-stack canary value + */ +void __stack_chk_fail(void) +{ + panic("stack-protector: Kernel stack is corrupted"); +} +EXPORT_SYMBOL(__stack_chk_fail); +#endif -- cgit v1.2.1 From 4f7fd4d7a79193ceda4ce77f75e22917d33fa154 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Tue, 26 Sep 2006 10:52:39 +0200 Subject: [PATCH] Add the -fstack-protector option to the CFLAGS Add a feature check that checks that the gcc compiler has stack-protector support and has the bugfix for PR28281 to make this work in kernel mode. The easiest solution I could find was to have a shell script in scripts/ to do the detection; if needed we can make this fancier in the future without making the makefile too complex. Signed-off-by: Arjan van de Ven Signed-off-by: Andi Kleen CC: Andi Kleen CC: Sam Ravnborg --- arch/x86_64/Makefile | 3 +++ scripts/gcc-x86_64-has-stack-protector.sh | 6 ++++++ 2 files changed, 9 insertions(+) create mode 100644 scripts/gcc-x86_64-has-stack-protector.sh diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile index d6472ddf5f6e..2b8d07c70106 100644 --- a/arch/x86_64/Makefile +++ b/arch/x86_64/Makefile @@ -58,6 +58,9 @@ cflags-y += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,) cflags-y += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,) AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,) +cflags-$(CONFIG_CC_STACKPROTECTOR) += $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh $(CC) -fstack-protector ) +cflags-$(CONFIG_CC_STACKPROTECTOR_ALL) += $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh $(CC) -fstack-protector-all ) + CFLAGS += $(cflags-y) CFLAGS_KERNEL += $(cflags-kernel-y) AFLAGS += -m64 diff --git a/scripts/gcc-x86_64-has-stack-protector.sh b/scripts/gcc-x86_64-has-stack-protector.sh new file mode 100644 index 000000000000..325c0a1b03b6 --- /dev/null +++ b/scripts/gcc-x86_64-has-stack-protector.sh @@ -0,0 +1,6 @@ +#!/bin/sh + +echo "int foo(void) { char X[200]; return 3; }" | $1 -S -xc -c -O0 -mcmodel=kernel -fstack-protector - -o - 2> /dev/null | grep -q "%gs" +if [ "$?" -eq "0" ] ; then + echo $2 +fi -- cgit v1.2.1 From 96e540492ab54423f3693958329e095878f1f12b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:39 +0200 Subject: [PATCH] Fix a irqcount comment in entry.S Signed-off-by: Andi Kleen --- arch/x86_64/kernel/entry.S | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 4fcc0ad8bbeb..ea32688386fd 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -519,7 +519,12 @@ END(stub_rt_sigreturn) testl $3,CS(%rdi) je 1f swapgs -1: incl %gs:pda_irqcount # RED-PEN should check preempt count + /* irqcount is used to check if a CPU is already on an interrupt + stack or not. While this is essentially redundant with preempt_count + it is a little cheaper to use a separate counter in the PDA + (short of moving irq_enter into assembly, which would be too + much work) */ +1: incl %gs:pda_irqcount cmoveq %gs:pda_irqstackptr,%rsp push %rbp # backlink for old unwinder /* -- cgit v1.2.1 From baf5695dd1a49bb48a3daf08726d7f243f42e97e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:39 +0200 Subject: [PATCH] Use %c instead of %P modifier in pda access Apparently that is the more official way to get numbers without $ in inline assembly Signed-off-by: Andi Kleen --- include/asm-x86_64/pda.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/asm-x86_64/pda.h b/include/asm-x86_64/pda.h index e7773e0af865..c2aac96ac323 100644 --- a/include/asm-x86_64/pda.h +++ b/include/asm-x86_64/pda.h @@ -51,13 +51,13 @@ extern struct x8664_pda _proxy_pda; typedef typeof(_proxy_pda.field) T__; \ switch (sizeof(_proxy_pda.field)) { \ case 2: \ -asm(op "w %1,%%gs:%P2" : "+m" (_proxy_pda.field) : \ +asm(op "w %1,%%gs:%c2" : "+m" (_proxy_pda.field) : \ "ri" ((T__)val),"i"(pda_offset(field))); break; \ case 4: \ -asm(op "l %1,%%gs:%P2" : "+m" (_proxy_pda.field) : \ +asm(op "l %1,%%gs:%c2" : "+m" (_proxy_pda.field) : \ "ri" ((T__)val),"i"(pda_offset(field))); break; \ case 8: \ -asm(op "q %1,%%gs:%P2": "+m" (_proxy_pda.field) : \ +asm(op "q %1,%%gs:%c2": "+m" (_proxy_pda.field) : \ "ri" ((T__)val),"i"(pda_offset(field))); break; \ default: __bad_pda_field(); \ } \ @@ -67,13 +67,13 @@ default: __bad_pda_field(); \ typeof(_proxy_pda.field) ret__; \ switch (sizeof(_proxy_pda.field)) { \ case 2: \ -asm(op "w %%gs:%P1,%0":"=r" (ret__):\ +asm(op "w %%gs:%c1,%0":"=r" (ret__):\ "i" (pda_offset(field)), "m" (_proxy_pda.field)); break;\ case 4: \ -asm(op "l %%gs:%P1,%0":"=r" (ret__):\ +asm(op "l %%gs:%c1,%0":"=r" (ret__):\ "i" (pda_offset(field)), "m" (_proxy_pda.field)); break;\ case 8: \ -asm(op "q %%gs:%P1,%0":"=r" (ret__):\ +asm(op "q %%gs:%c1,%0":"=r" (ret__):\ "i" (pda_offset(field)), "m" (_proxy_pda.field)); break;\ default: __bad_pda_field(); \ } \ -- cgit v1.2.1 From 85691f135db78f3548107a0abe383dfab3bc38fa Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 26 Sep 2006 10:52:39 +0200 Subject: [PATCH] Type checking for write_pda() I just added type checking for assignments the PDA in the i386 PDA code. Here's the x86-64 equivalent. (Obviously this doesn't contain the latest x86-64 PDA change.) Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen --- include/asm-x86_64/pda.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/asm-x86_64/pda.h b/include/asm-x86_64/pda.h index c2aac96ac323..5dadb201f769 100644 --- a/include/asm-x86_64/pda.h +++ b/include/asm-x86_64/pda.h @@ -49,7 +49,8 @@ extern struct x8664_pda _proxy_pda; #define pda_to_op(op,field,val) do { \ typedef typeof(_proxy_pda.field) T__; \ - switch (sizeof(_proxy_pda.field)) { \ + if (0) { T__ tmp__; tmp__ = (val); } \ + switch (sizeof(_proxy_pda.field)) { \ case 2: \ asm(op "w %1,%%gs:%c2" : "+m" (_proxy_pda.field) : \ "ri" ((T__)val),"i"(pda_offset(field))); break; \ -- cgit v1.2.1 From 7b0bda74f7e77f362eaeee837e7911238acf4c76 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:39 +0200 Subject: [PATCH] Fix a PDA warning uncovered by the new type checking Fix linux/arch/x86_64/kernel/process.c: In function __switch_to: linux/arch/x86_64/kernel/process.c:626: warning: assignment makes integer from pointer without a cast Signed-off-by: Andi Kleen --- arch/x86_64/kernel/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index fba8dfeda67c..885c318f76ab 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -624,7 +624,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) And the AMD workaround requires it to be after DS reload. */ unlazy_fpu(prev_p); write_pda(kernelstack, - task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); + (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); #ifdef CONFIG_CC_STACKPROTECTOR write_pda(stack_canary, next_p->stack_canary); /* -- cgit v1.2.1 From 0da5db313317e3195482d3e660a1074857374a89 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 26 Sep 2006 10:52:39 +0200 Subject: [PATCH] i386: Abstract sensitive instructions Abstract sensitive instructions in assembler code, replacing them with macros (which currently are #defined to the native versions). We use long names: assembler is case-insensitive, so if something goes wrong and macros do not expand, it would assemble anyway. Resulting object files are exactly the same as before. Signed-off-by: Rusty Russell Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/entry.S | 38 ++++++++++++++++++++++---------------- include/asm-i386/spinlock.h | 7 +++++-- 2 files changed, 27 insertions(+), 18 deletions(-) diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 4b0845249222..3872fca5c74a 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -76,8 +76,15 @@ DF_MASK = 0x00000400 NT_MASK = 0x00004000 VM_MASK = 0x00020000 +/* These are replaces for paravirtualization */ +#define DISABLE_INTERRUPTS cli +#define ENABLE_INTERRUPTS sti +#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit +#define INTERRUPT_RETURN iret +#define GET_CR0_INTO_EAX movl %cr0, %eax + #ifdef CONFIG_PREEMPT -#define preempt_stop cli; TRACE_IRQS_OFF +#define preempt_stop DISABLE_INTERRUPTS; TRACE_IRQS_OFF #else #define preempt_stop #define resume_kernel restore_nocheck @@ -236,7 +243,7 @@ check_userspace: testl $(VM_MASK | 3), %eax jz resume_kernel ENTRY(resume_userspace) - cli # make sure we don't miss an interrupt + DISABLE_INTERRUPTS # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret movl TI_flags(%ebp), %ecx @@ -247,7 +254,7 @@ ENTRY(resume_userspace) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) - cli + DISABLE_INTERRUPTS cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? jnz restore_nocheck need_resched: @@ -275,7 +282,7 @@ sysenter_past_esp: * No need to follow this irqs on/off section: the syscall * disabled irqs and here we enable it straight after entry: */ - sti + ENABLE_INTERRUPTS pushl $(__USER_DS) CFI_ADJUST_CFA_OFFSET 4 /*CFI_REL_OFFSET ss, 0*/ @@ -320,7 +327,7 @@ sysenter_past_esp: jae syscall_badsys call *sys_call_table(,%eax,4) movl %eax,EAX(%esp) - cli + DISABLE_INTERRUPTS TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx @@ -330,8 +337,7 @@ sysenter_past_esp: movl OLDESP(%esp), %ecx xorl %ebp,%ebp TRACE_IRQS_ON - sti - sysexit + ENABLE_INTERRUPTS_SYSEXIT CFI_ENDPROC @@ -356,7 +362,7 @@ syscall_call: call *sys_call_table(,%eax,4) movl %eax,EAX(%esp) # store the return value syscall_exit: - cli # make sure we don't miss an interrupt + DISABLE_INTERRUPTS # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret TRACE_IRQS_OFF @@ -381,11 +387,11 @@ restore_nocheck_notrace: RESTORE_REGS addl $4, %esp CFI_ADJUST_CFA_OFFSET -4 -1: iret +1: INTERRUPT_RETURN .section .fixup,"ax" iret_exc: TRACE_IRQS_ON - sti + ENABLE_INTERRUPTS pushl $0 # no error code pushl $do_iret_error jmp error_code @@ -409,7 +415,7 @@ ldt_ss: * dosemu and wine happy. */ subl $8, %esp # reserve space for switch16 pointer CFI_ADJUST_CFA_OFFSET 8 - cli + DISABLE_INTERRUPTS TRACE_IRQS_OFF movl %esp, %eax /* Set up the 16bit stack frame with switch32 pointer on top, @@ -419,7 +425,7 @@ ldt_ss: TRACE_IRQS_IRET RESTORE_REGS lss 20+4(%esp), %esp # switch to 16bit stack -1: iret +1: INTERRUPT_RETURN .section __ex_table,"a" .align 4 .long 1b,iret_exc @@ -434,7 +440,7 @@ work_pending: jz work_notifysig work_resched: call schedule - cli # make sure we don't miss an interrupt + DISABLE_INTERRUPTS # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret TRACE_IRQS_OFF @@ -490,7 +496,7 @@ syscall_exit_work: testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl jz work_pending TRACE_IRQS_ON - sti # could let do_syscall_trace() call + ENABLE_INTERRUPTS # could let do_syscall_trace() call # schedule() instead movl %esp, %eax movl $1, %edx @@ -668,7 +674,7 @@ ENTRY(device_not_available) pushl $-1 # mark this as an int CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL - movl %cr0, %eax + GET_CR0_INTO_EAX testl $0x4, %eax # EM (math emulation bit) jne device_not_available_emulate preempt_stop @@ -811,7 +817,7 @@ nmi_16bit_stack: call do_nmi RESTORE_REGS lss 12+4(%esp), %esp # back to 16bit stack -1: iret +1: INTERRUPT_RETURN CFI_ENDPROC .section __ex_table,"a" .align 4 diff --git a/include/asm-i386/spinlock.h b/include/asm-i386/spinlock.h index 324329313af8..b0b3043f05e1 100644 --- a/include/asm-i386/spinlock.h +++ b/include/asm-i386/spinlock.h @@ -7,6 +7,9 @@ #include #include +#define CLI_STRING "cli" +#define STI_STRING "sti" + /* * Your basic SMP spinlocks, allowing only a single CPU anywhere * @@ -55,12 +58,12 @@ static inline void __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long fla "2:\t" "testl $0x200, %1\n\t" "jz 4f\n\t" - "sti\n" + STI_STRING "\n" "3:\t" "rep;nop\n\t" "cmpb $0, %0\n\t" "jle 3b\n\t" - "cli\n\t" + CLI_STRING "\n\t" "jmp 1b\n" "4:\t" "rep;nop\n\t" -- cgit v1.2.1 From 78be3706b21a232310590fe00258b224177ac05f Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 26 Sep 2006 10:52:39 +0200 Subject: [PATCH] i386: Allow a kernel not to be in ring 0 We allow for the fact that the guest kernel may not run in ring 0. This requires some abstraction in a few places when setting %cs or checking privilege level (user vs kernel). This is Chris' [RFC PATCH 15/33] move segment checks to subarch, except rather than using #define USER_MODE_MASK which depends on a config option, we use Zach's more flexible approach of assuming ring 3 == userspace. I also used "get_kernel_rpl()" over "get_kernel_cs()" because I think it reads better in the code... 1) Remove the hardcoded 3 and introduce #define SEGMENT_RPL_MASK 3 2) Add a get_kernel_rpl() macro, and don't assume it's zero. And: Clean up of patch for letting kernel run other than ring 0: a. Add some comments about the SEGMENT_IS_*_CODE() macros. b. Add a USER_RPL macro. (Code was comparing a value to a mask in some places and to the magic number 3 in other places.) c. Add macros for table indicator field and use them. d. Change the entry.S tests for LDT stack segment to use the macros Signed-off-by: Rusty Russell Signed-off-by: Zachary Amsden Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/entry.S | 9 +++++---- arch/i386/kernel/process.c | 2 +- arch/i386/mm/extable.c | 2 +- arch/i386/mm/fault.c | 11 ++++------- include/asm-i386/ptrace.h | 5 +++-- include/asm-i386/segment.h | 17 +++++++++++++++++ 6 files changed, 31 insertions(+), 15 deletions(-) diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 3872fca5c74a..284f2e908ad0 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -240,8 +240,9 @@ ret_from_intr: check_userspace: movl EFLAGS(%esp), %eax # mix EFLAGS and CS movb CS(%esp), %al - testl $(VM_MASK | 3), %eax - jz resume_kernel + andl $(VM_MASK | SEGMENT_RPL_MASK), %eax + cmpl $USER_RPL, %eax + jb resume_kernel # not returning to v8086 or userspace ENTRY(resume_userspace) DISABLE_INTERRUPTS # make sure we don't miss an interrupt # setting need_resched or sigpending @@ -377,8 +378,8 @@ restore_all: # See comments in process.c:copy_thread() for details. movb OLDSS(%esp), %ah movb CS(%esp), %al - andl $(VM_MASK | (4 << 8) | 3), %eax - cmpl $((4 << 8) | 3), %eax + andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax + cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax CFI_REMEMBER_STATE je ldt_ss # returning to user-space with LDT SS restore_nocheck: diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 220aeca59c3a..8c190ca7ae44 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -338,7 +338,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) regs.xes = __USER_DS; regs.orig_eax = -1; regs.eip = (unsigned long) kernel_thread_helper; - regs.xcs = __KERNEL_CS; + regs.xcs = __KERNEL_CS | get_kernel_rpl(); regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; /* Ok, create the new process.. */ diff --git a/arch/i386/mm/extable.c b/arch/i386/mm/extable.c index de03c5430abc..0ce4f22a2635 100644 --- a/arch/i386/mm/extable.c +++ b/arch/i386/mm/extable.c @@ -11,7 +11,7 @@ int fixup_exception(struct pt_regs *regs) const struct exception_table_entry *fixup; #ifdef CONFIG_PNPBIOS - if (unlikely((regs->xcs & ~15) == (GDT_ENTRY_PNPBIOS_BASE << 3))) + if (unlikely(SEGMENT_IS_PNP_CODE(regs->xcs))) { extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp; extern u32 pnp_bios_is_utter_crap; diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c index 0ce86168a0b1..5e17a3f43b41 100644 --- a/arch/i386/mm/fault.c +++ b/arch/i386/mm/fault.c @@ -27,6 +27,7 @@ #include #include #include +#include extern void die(const char *,struct pt_regs *,long); @@ -113,10 +114,10 @@ static inline unsigned long get_segment_eip(struct pt_regs *regs, } /* The standard kernel/user address space limit. */ - *eip_limit = (seg & 3) ? USER_DS.seg : KERNEL_DS.seg; + *eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg; /* By far the most common cases. */ - if (likely(seg == __USER_CS || seg == __KERNEL_CS)) + if (likely(SEGMENT_IS_FLAT_CODE(seg))) return eip; /* Check the segment exists, is within the current LDT/GDT size, @@ -430,11 +431,7 @@ good_area: write = 0; switch (error_code & 3) { default: /* 3: write, present */ -#ifdef TEST_VERIFY_AREA - if (regs->cs == KERNEL_CS) - printk("WP fault at %08lx\n", regs->eip); -#endif - /* fall through */ + /* fall through */ case 2: /* write, not present */ if (!(vma->vm_flags & VM_WRITE)) goto bad_area; diff --git a/include/asm-i386/ptrace.h b/include/asm-i386/ptrace.h index 30a442ec2059..21bb91679c82 100644 --- a/include/asm-i386/ptrace.h +++ b/include/asm-i386/ptrace.h @@ -60,6 +60,7 @@ struct pt_regs { #ifdef __KERNEL__ #include +#include struct task_struct; extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code); @@ -73,11 +74,11 @@ extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int erro */ static inline int user_mode(struct pt_regs *regs) { - return (regs->xcs & 3) != 0; + return (regs->xcs & SEGMENT_RPL_MASK) == USER_RPL; } static inline int user_mode_vm(struct pt_regs *regs) { - return ((regs->xcs & 3) | (regs->eflags & VM_MASK)) != 0; + return ((regs->xcs & SEGMENT_RPL_MASK) | (regs->eflags & VM_MASK)) >= USER_RPL; } #define instruction_pointer(regs) ((regs)->eip) extern unsigned long profile_pc(struct pt_regs *regs); diff --git a/include/asm-i386/segment.h b/include/asm-i386/segment.h index faf995307b9e..b7ab59685ba7 100644 --- a/include/asm-i386/segment.h +++ b/include/asm-i386/segment.h @@ -83,6 +83,11 @@ #define GDT_SIZE (GDT_ENTRIES * 8) +/* Matches __KERNEL_CS and __USER_CS (they must be 2 entries apart) */ +#define SEGMENT_IS_FLAT_CODE(x) (((x) & 0xec) == GDT_ENTRY_KERNEL_CS * 8) +/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */ +#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8) + /* Simple and small GDT entries for booting only */ #define GDT_ENTRY_BOOT_CS 2 @@ -112,4 +117,16 @@ */ #define IDT_ENTRIES 256 +/* Bottom two bits of selector give the ring privilege level */ +#define SEGMENT_RPL_MASK 0x3 +/* Bit 2 is table indicator (LDT/GDT) */ +#define SEGMENT_TI_MASK 0x4 + +/* User mode is privilege level 3 */ +#define USER_RPL 0x3 +/* LDT segment has TI set, GDT has it cleared */ +#define SEGMENT_LDT 0x4 +#define SEGMENT_GDT 0x0 + +#define get_kernel_rpl() 0 #endif -- cgit v1.2.1 From ec5c09269ba9cd3b9cf879390db6d5c7dcdcaf1e Mon Sep 17 00:00:00 2001 From: Chuck Ebbert <76306.1226@compuserve.com> Date: Tue, 26 Sep 2006 10:52:39 +0200 Subject: [PATCH] i386: Do better early exception handlers Add early i386 fault handlers with debug information for common faults. Handles: divide error invalid opcode protection fault page fault Also adds code to detect early recursive/multiple faults and halt the system when they happen (taken from x86_64.) Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Ingo Molnar Signed-off-by: Andrew Morton --- arch/i386/kernel/head.S | 67 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index a6b8bd89aa27..be9d883c62ce 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -371,8 +371,65 @@ rp_sidt: addl $8,%edi dec %ecx jne rp_sidt + +.macro set_early_handler handler,trapno + lea \handler,%edx + movl $(__KERNEL_CS << 16),%eax + movw %dx,%ax + movw $0x8E00,%dx /* interrupt gate - dpl=0, present */ + lea idt_table,%edi + movl %eax,8*\trapno(%edi) + movl %edx,8*\trapno+4(%edi) +.endm + + set_early_handler handler=early_divide_err,trapno=0 + set_early_handler handler=early_illegal_opcode,trapno=6 + set_early_handler handler=early_protection_fault,trapno=13 + set_early_handler handler=early_page_fault,trapno=14 + ret +early_divide_err: + xor %edx,%edx + pushl $0 /* fake errcode */ + jmp early_fault + +early_illegal_opcode: + movl $6,%edx + pushl $0 /* fake errcode */ + jmp early_fault + +early_protection_fault: + movl $13,%edx + jmp early_fault + +early_page_fault: + movl $14,%edx + jmp early_fault + +early_fault: + cld +#ifdef CONFIG_PRINTK + movl $(__KERNEL_DS),%eax + movl %eax,%ds + movl %eax,%es + cmpl $2,early_recursion_flag + je hlt_loop + incl early_recursion_flag + movl %cr2,%eax + pushl %eax + pushl %edx /* trapno */ + pushl $fault_msg +#ifdef CONFIG_EARLY_PRINTK + call early_printk +#else + call printk +#endif +#endif +hlt_loop: + hlt + jmp hlt_loop + /* This is the default interrupt "handler" :-) */ ALIGN ignore_int: @@ -386,6 +443,9 @@ ignore_int: movl $(__KERNEL_DS),%eax movl %eax,%ds movl %eax,%es + cmpl $2,early_recursion_flag + je hlt_loop + incl early_recursion_flag pushl 16(%esp) pushl 24(%esp) pushl 32(%esp) @@ -431,9 +491,16 @@ ENTRY(stack_start) ready: .byte 0 +early_recursion_flag: + .long 0 + int_msg: .asciz "Unknown interrupt or fault at EIP %p %p %p\n" +fault_msg: + .ascii "Int %d: CR2 %p err %p EIP %p CS %p flags %p\n" + .asciz "Stack: %p %p %p %p %p %p %p %p\n" + /* * The IDT and GDT 'descriptors' are a strange 48-bit object * only used by the lidt and lgdt instructions. They are not -- cgit v1.2.1 From 54dbc0c9ebefb38840c6b07fa6eabaeb96c921f5 Mon Sep 17 00:00:00 2001 From: "adurbin@google.com" Date: Tue, 26 Sep 2006 10:52:39 +0200 Subject: [PATCH] insert IOAPIC(s) and Local APIC into resource map This patch places the IOAPIC(s) and the Local APIC specified by ACPI tables into the resource map. The APICs will then be visible within /proc/iomem Signed-off-by: Aaron Durbin Signed-off-by: Andi Kleen --- arch/x86_64/kernel/apic.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 6472e321cad7..135ff25e6b44 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -45,6 +46,11 @@ int apic_calibrate_pmtmr __initdata; int disable_apic_timer __initdata; +static struct resource lapic_resource = { + .name = "Local APIC", + .flags = IORESOURCE_MEM | IORESOURCE_BUSY, +}; + /* * cpu_mask that denotes the CPUs that needs timer interrupt coming in as * IPIs in place of local APIC timers @@ -585,6 +591,40 @@ static int __init detect_init_APIC (void) return 0; } +#ifdef CONFIG_X86_IO_APIC +static struct resource * __init ioapic_setup_resources(void) +{ +#define IOAPIC_RESOURCE_NAME_SIZE 11 + unsigned long n; + struct resource *res; + char *mem; + int i; + + if (nr_ioapics <= 0) + return NULL; + + n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); + n *= nr_ioapics; + + res = alloc_bootmem(n); + + if (!res) + return NULL; + + memset(res, 0, n); + mem = (void *)&res[nr_ioapics]; + + for (i = 0; i < nr_ioapics; i++) { + res[i].name = mem; + res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; + snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i); + mem += IOAPIC_RESOURCE_NAME_SIZE; + } + + return res; +} +#endif + void __init init_apic_mappings(void) { unsigned long apic_phys; @@ -604,6 +644,11 @@ void __init init_apic_mappings(void) apic_mapped = 1; apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys); + /* Put local APIC into the resource map. */ + lapic_resource.start = apic_phys; + lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1; + insert_resource(&iomem_resource, &lapic_resource); + /* * Fetch the APIC ID of the BSP in case we have a * default configuration (or the MP table is broken). @@ -613,7 +658,9 @@ void __init init_apic_mappings(void) { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; int i; + struct resource *ioapic_res; + ioapic_res = ioapic_setup_resources(); for (i = 0; i < nr_ioapics; i++) { if (smp_found_config) { ioapic_phys = mp_ioapics[i].mpc_apicaddr; @@ -625,6 +672,13 @@ void __init init_apic_mappings(void) apic_printk(APIC_VERBOSE,"mapped IOAPIC to %016lx (%016lx)\n", __fix_to_virt(idx), ioapic_phys); idx++; + + if (ioapic_res) { + ioapic_res->start = ioapic_phys; + ioapic_res->end = ioapic_phys + (4 * 1024) - 1; + insert_resource(&iomem_resource, ioapic_res); + ioapic_res++; + } } } } -- cgit v1.2.1 From f0f4c3432e5e1087b3a8c0e6bd4113d3c37497ff Mon Sep 17 00:00:00 2001 From: "adurbin@google.com" Date: Tue, 26 Sep 2006 10:52:39 +0200 Subject: [PATCH] i386: add HPET(s) into resource map Add HPET(s) into resource map. This will allow for the HPET(s) to be visibile within /proc/iomem. Signed-off-by: Aaron Durbin Signed-off-by: Andi Kleen --- arch/i386/kernel/acpi/boot.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c index 8a6c5b412348..1aaea6ab8c46 100644 --- a/arch/i386/kernel/acpi/boot.c +++ b/arch/i386/kernel/acpi/boot.c @@ -30,6 +30,8 @@ #include #include #include +#include +#include #include #include @@ -646,6 +648,8 @@ static int __init acpi_parse_sbf(unsigned long phys_addr, unsigned long size) static int __init acpi_parse_hpet(unsigned long phys, unsigned long size) { struct acpi_table_hpet *hpet_tbl; + struct resource *hpet_res; + resource_size_t res_start; if (!phys || !size) return -EINVAL; @@ -661,12 +665,26 @@ static int __init acpi_parse_hpet(unsigned long phys, unsigned long size) "memory.\n"); return -1; } + +#define HPET_RESOURCE_NAME_SIZE 9 + hpet_res = alloc_bootmem(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE); + if (hpet_res) { + memset(hpet_res, 0, sizeof(*hpet_res)); + hpet_res->name = (void *)&hpet_res[1]; + hpet_res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + snprintf((char *)hpet_res->name, HPET_RESOURCE_NAME_SIZE, + "HPET %u", hpet_tbl->number); + hpet_res->end = (1 * 1024) - 1; + } + #ifdef CONFIG_X86_64 vxtime.hpet_address = hpet_tbl->addr.addrl | ((long)hpet_tbl->addr.addrh << 32); printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", hpet_tbl->id, vxtime.hpet_address); + + res_start = vxtime.hpet_address; #else /* X86 */ { extern unsigned long hpet_address; @@ -674,9 +692,17 @@ static int __init acpi_parse_hpet(unsigned long phys, unsigned long size) hpet_address = hpet_tbl->addr.addrl; printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", hpet_tbl->id, hpet_address); + + res_start = hpet_address; } #endif /* X86 */ + if (hpet_res) { + hpet_res->start = res_start; + hpet_res->end += res_start; + insert_resource(&iomem_resource, hpet_res); + } + return 0; } #else -- cgit v1.2.1 From 3022d734a54cbd2b65eea9a024564821101b4a9a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:39 +0200 Subject: [PATCH] Fix zeroing on exception in copy_*_user - Don't zero for __copy_from_user_inatomic following i386. This will prevent spurious zeros for parallel file system writers when one does a exception - The string instruction version didn't zero the output on exception. Oops. Also I cleaned up the code a bit while I was at it and added a minor optimization to the string instruction path. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/x8664_ksyms.c | 1 + arch/x86_64/lib/copy_user.S | 124 +++++++++++++++++++++++++-------------- include/asm-x86_64/uaccess.h | 6 +- 3 files changed, 83 insertions(+), 48 deletions(-) diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c index 370952c4ff22..c3454af5e3a2 100644 --- a/arch/x86_64/kernel/x8664_ksyms.c +++ b/arch/x86_64/kernel/x8664_ksyms.c @@ -29,6 +29,7 @@ EXPORT_SYMBOL(__put_user_8); EXPORT_SYMBOL(copy_user_generic); EXPORT_SYMBOL(copy_from_user); EXPORT_SYMBOL(copy_to_user); +EXPORT_SYMBOL(__copy_from_user_inatomic); EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); diff --git a/arch/x86_64/lib/copy_user.S b/arch/x86_64/lib/copy_user.S index 962f3a693c5e..70bebd310408 100644 --- a/arch/x86_64/lib/copy_user.S +++ b/arch/x86_64/lib/copy_user.S @@ -9,10 +9,29 @@ #define FIX_ALIGNMENT 1 - #include - #include - #include - #include +#include +#include +#include +#include + + .macro ALTERNATIVE_JUMP feature,orig,alt +0: + .byte 0xe9 /* 32bit jump */ + .long \orig-1f /* by default jump to orig */ +1: + .section .altinstr_replacement,"ax" +2: .byte 0xe9 /* near jump with 32bit immediate */ + .long \alt-1b /* offset */ /* or alternatively to alt */ + .previous + .section .altinstructions,"a" + .align 8 + .quad 0b + .quad 2b + .byte \feature /* when feature is set */ + .byte 5 + .byte 5 + .previous + .endm /* Standard copy_to_user with segment limit checking */ ENTRY(copy_to_user) @@ -23,25 +42,21 @@ ENTRY(copy_to_user) jc bad_to_user cmpq threadinfo_addr_limit(%rax),%rcx jae bad_to_user -2: - .byte 0xe9 /* 32bit jump */ - .long .Lcug-1f -1: + xorl %eax,%eax /* clear zero flag */ + ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string CFI_ENDPROC -ENDPROC(copy_to_user) - .section .altinstr_replacement,"ax" -3: .byte 0xe9 /* replacement jmp with 32 bit immediate */ - .long copy_user_generic_c-1b /* offset */ - .previous - .section .altinstructions,"a" - .align 8 - .quad 2b - .quad 3b - .byte X86_FEATURE_REP_GOOD - .byte 5 - .byte 5 - .previous +ENTRY(copy_user_generic) + CFI_STARTPROC + movl $1,%ecx /* set zero flag */ + ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string + CFI_ENDPROC + +ENTRY(__copy_from_user_inatomic) + CFI_STARTPROC + xorl %ecx,%ecx /* clear zero flag */ + ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string + CFI_ENDPROC /* Standard copy_from_user with segment limit checking */ ENTRY(copy_from_user) @@ -52,7 +67,8 @@ ENTRY(copy_from_user) jc bad_from_user cmpq threadinfo_addr_limit(%rax),%rcx jae bad_from_user - /* FALL THROUGH to copy_user_generic */ + movl $1,%ecx /* set zero flag */ + ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string CFI_ENDPROC ENDPROC(copy_from_user) @@ -73,37 +89,26 @@ END(bad_from_user) /* - * copy_user_generic - memory copy with exception handling. + * copy_user_generic_unrolled - memory copy with exception handling. + * This version is for CPUs like P4 that don't have efficient micro code for rep movsq * * Input: * rdi destination * rsi source * rdx count + * ecx zero flag -- if true zero destination on error * * Output: * eax uncopied bytes or 0 if successful. */ -ENTRY(copy_user_generic) +ENTRY(copy_user_generic_unrolled) CFI_STARTPROC - .byte 0x66,0x66,0x90 /* 5 byte nop for replacement jump */ - .byte 0x66,0x90 -1: - .section .altinstr_replacement,"ax" -2: .byte 0xe9 /* near jump with 32bit immediate */ - .long copy_user_generic_c-1b /* offset */ - .previous - .section .altinstructions,"a" - .align 8 - .quad copy_user_generic - .quad 2b - .byte X86_FEATURE_REP_GOOD - .byte 5 - .byte 5 - .previous -.Lcug: pushq %rbx CFI_ADJUST_CFA_OFFSET 8 CFI_REL_OFFSET rbx, 0 + pushq %rcx + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rcx, 0 xorl %eax,%eax /*zero for the exception handler */ #ifdef FIX_ALIGNMENT @@ -179,6 +184,9 @@ ENTRY(copy_user_generic) CFI_REMEMBER_STATE .Lende: + popq %rcx + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE rcx popq %rbx CFI_ADJUST_CFA_OFFSET -8 CFI_RESTORE rbx @@ -265,6 +273,8 @@ ENTRY(copy_user_generic) addl %ecx,%edx /* edx: bytes to zero, rdi: dest, eax:zero */ .Lzero_rest: + cmpl $0,(%rsp) + jz .Le_zero movq %rdx,%rcx .Le_byte: xorl %eax,%eax @@ -286,6 +296,7 @@ ENDPROC(copy_user_generic) /* rdi destination * rsi source * rdx count + * ecx zero flag * * Output: * eax uncopied bytes or 0 if successfull. @@ -296,25 +307,48 @@ ENDPROC(copy_user_generic) * And more would be dangerous because both Intel and AMD have * errata with rep movsq > 4GB. If someone feels the need to fix * this please consider this. - */ -copy_user_generic_c: + */ +ENTRY(copy_user_generic_string) CFI_STARTPROC + movl %ecx,%r8d /* save zero flag */ movl %edx,%ecx shrl $3,%ecx andl $7,%edx + jz 10f 1: rep movsq movl %edx,%ecx 2: rep movsb -4: movl %ecx,%eax +9: movl %ecx,%eax ret -3: lea (%rdx,%rcx,8),%rax + + /* multiple of 8 byte */ +10: rep + movsq + xor %eax,%eax ret + + /* exception handling */ +3: lea (%rdx,%rcx,8),%rax /* exception on quad loop */ + jmp 6f +5: movl %ecx,%eax /* exception on byte loop */ + /* eax: left over bytes */ +6: testl %r8d,%r8d /* zero flag set? */ + jz 7f + movl %eax,%ecx /* initialize x86 loop counter */ + push %rax + xorl %eax,%eax +8: rep + stosb /* zero the rest */ +11: pop %rax +7: ret CFI_ENDPROC END(copy_user_generic_c) .section __ex_table,"a" .quad 1b,3b - .quad 2b,4b + .quad 2b,5b + .quad 8b,11b + .quad 10b,3b .previous diff --git a/include/asm-x86_64/uaccess.h b/include/asm-x86_64/uaccess.h index 1e1fa003daa3..bc6812009bd7 100644 --- a/include/asm-x86_64/uaccess.h +++ b/include/asm-x86_64/uaccess.h @@ -238,6 +238,7 @@ do { \ /* Handles exceptions in both to and from, but doesn't do access_ok */ extern unsigned long copy_user_generic(void *to, const void *from, unsigned len); +extern unsigned long copy_user_generic_dontzero(void *to, const void *from, unsigned len); extern unsigned long copy_to_user(void __user *to, const void *from, unsigned len); extern unsigned long copy_from_user(void *to, const void __user *from, unsigned len); @@ -303,7 +304,6 @@ static __always_inline int __copy_to_user(void __user *dst, const void *src, uns } } - static __always_inline int __copy_in_user(void __user *dst, const void __user *src, unsigned size) { int ret = 0; @@ -352,7 +352,7 @@ long strlen_user(const char __user *str); unsigned long clear_user(void __user *mem, unsigned long len); unsigned long __clear_user(void __user *mem, unsigned long len); -#define __copy_to_user_inatomic __copy_to_user -#define __copy_from_user_inatomic __copy_from_user +extern long __copy_from_user_inatomic(void *dst, const void __user *src, unsigned size); +#define __copy_to_user_inatomic copy_user_generic #endif /* __X86_64_UACCESS_H */ -- cgit v1.2.1 From 95912008ba1fb9d0677c1ce5930aeb0e85ba5710 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:39 +0200 Subject: [PATCH] Add __must_check to copy_*_user Following i386. And also fix the two occurrences that caused warnings in arch/x86_64/* Signed-off-by: Andi Kleen --- arch/x86_64/ia32/ptrace32.c | 6 ++++-- include/asm-x86_64/i387.h | 4 ++-- include/asm-x86_64/uaccess.h | 46 ++++++++++++++++++++++++++------------------ 3 files changed, 33 insertions(+), 23 deletions(-) diff --git a/arch/x86_64/ia32/ptrace32.c b/arch/x86_64/ia32/ptrace32.c index 72bf92a9d375..d18198ed636b 100644 --- a/arch/x86_64/ia32/ptrace32.c +++ b/arch/x86_64/ia32/ptrace32.c @@ -375,8 +375,10 @@ asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data) ret = -EIO; if (!access_ok(VERIFY_READ, u, sizeof(*u))) break; - /* no checking to be bug-to-bug compatible with i386 */ - __copy_from_user(&child->thread.i387.fxsave, u, sizeof(*u)); + /* no checking to be bug-to-bug compatible with i386. */ + /* but silence warning */ + if (__copy_from_user(&child->thread.i387.fxsave, u, sizeof(*u))) + ; set_stopped_child_used_math(child); child->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; ret = 0; diff --git a/include/asm-x86_64/i387.h b/include/asm-x86_64/i387.h index 60c0f4853fdb..0217b74cc9fc 100644 --- a/include/asm-x86_64/i387.h +++ b/include/asm-x86_64/i387.h @@ -137,8 +137,8 @@ static inline int save_i387_checking(struct i387_fxsave_struct __user *fx) #else : [fx] "cdaSDb" (fx), "0" (0)); #endif - if (unlikely(err)) - __clear_user(fx, sizeof(struct i387_fxsave_struct)); + if (unlikely(err) && __clear_user(fx, sizeof(struct i387_fxsave_struct))) + err = -EFAULT; /* No need to clear here because the caller clears USED_MATH */ return err; } diff --git a/include/asm-x86_64/uaccess.h b/include/asm-x86_64/uaccess.h index bc6812009bd7..802a4a068ef6 100644 --- a/include/asm-x86_64/uaccess.h +++ b/include/asm-x86_64/uaccess.h @@ -237,14 +237,18 @@ do { \ */ /* Handles exceptions in both to and from, but doesn't do access_ok */ -extern unsigned long copy_user_generic(void *to, const void *from, unsigned len); -extern unsigned long copy_user_generic_dontzero(void *to, const void *from, unsigned len); - -extern unsigned long copy_to_user(void __user *to, const void *from, unsigned len); -extern unsigned long copy_from_user(void *to, const void __user *from, unsigned len); -extern unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len); - -static __always_inline int __copy_from_user(void *dst, const void __user *src, unsigned size) +__must_check unsigned long +copy_user_generic(void *to, const void *from, unsigned len); + +__must_check unsigned long +copy_to_user(void __user *to, const void *from, unsigned len); +__must_check unsigned long +copy_from_user(void *to, const void __user *from, unsigned len); +__must_check unsigned long +copy_in_user(void __user *to, const void __user *from, unsigned len); + +static __always_inline __must_check +int __copy_from_user(void *dst, const void __user *src, unsigned size) { int ret = 0; if (!__builtin_constant_p(size)) @@ -273,7 +277,8 @@ static __always_inline int __copy_from_user(void *dst, const void __user *src, u } } -static __always_inline int __copy_to_user(void __user *dst, const void *src, unsigned size) +static __always_inline __must_check +int __copy_to_user(void __user *dst, const void *src, unsigned size) { int ret = 0; if (!__builtin_constant_p(size)) @@ -304,7 +309,8 @@ static __always_inline int __copy_to_user(void __user *dst, const void *src, uns } } -static __always_inline int __copy_in_user(void __user *dst, const void __user *src, unsigned size) +static __always_inline __must_check +int __copy_in_user(void __user *dst, const void __user *src, unsigned size) { int ret = 0; if (!__builtin_constant_p(size)) @@ -344,15 +350,17 @@ static __always_inline int __copy_in_user(void __user *dst, const void __user *s } } -long strncpy_from_user(char *dst, const char __user *src, long count); -long __strncpy_from_user(char *dst, const char __user *src, long count); -long strnlen_user(const char __user *str, long n); -long __strnlen_user(const char __user *str, long n); -long strlen_user(const char __user *str); -unsigned long clear_user(void __user *mem, unsigned long len); -unsigned long __clear_user(void __user *mem, unsigned long len); - -extern long __copy_from_user_inatomic(void *dst, const void __user *src, unsigned size); +__must_check long +strncpy_from_user(char *dst, const char __user *src, long count); +__must_check long +__strncpy_from_user(char *dst, const char __user *src, long count); +__must_check long strnlen_user(const char __user *str, long n); +__must_check long __strnlen_user(const char __user *str, long n); +__must_check long strlen_user(const char __user *str); +__must_check unsigned long clear_user(void __user *mem, unsigned long len); +__must_check unsigned long __clear_user(void __user *mem, unsigned long len); + +__must_check long __copy_from_user_inatomic(void *dst, const void __user *src, unsigned size); #define __copy_to_user_inatomic copy_user_generic #endif /* __X86_64_UACCESS_H */ -- cgit v1.2.1 From 758333458aa719bfc26ec16eafd4ad3a9e96014d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:39 +0200 Subject: [PATCH] Check return value of copy_to_user in compat_sys_pselect7 Fix linux/fs/compat.c: In function compat_sys_pselect7 linux/fs/compat.c:1869: warning: ignoring return value of copy_to_user, declared with attribute warn_unused_result To make it easier to handle I changed to semantics to not try to write out a timespec if an error occurred. I hope that's ok. Cc: dwmw2@infradead.org Signed-off-by: Andi Kleen --- fs/compat.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/compat.c b/fs/compat.c index e31e9cf96647..ce982f6e8c80 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1855,7 +1855,7 @@ asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp, } while (!ret && !timeout && tsp && (ts.tv_sec || ts.tv_nsec)); - if (tsp && !(current->personality & STICKY_TIMEOUTS)) { + if (ret == 0 && tsp && !(current->personality & STICKY_TIMEOUTS)) { struct compat_timespec rts; rts.tv_sec = timeout / HZ; @@ -1866,7 +1866,8 @@ asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp, } if (compat_timespec_compare(&rts, &ts) >= 0) rts = ts; - copy_to_user(tsp, &rts, sizeof(rts)); + if (copy_to_user(tsp, &rts, sizeof(rts))) + ret = -EFAULT; } if (ret == -ERESTARTNOHAND) { -- cgit v1.2.1 From 26c13f2b5bbb03f798f8907db20296347e6c7ca6 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:40 +0200 Subject: [PATCH] Check return values of __copy_to_user in uname emulation Quietens some new warnings Signed-off-by: Andi Kleen --- arch/x86_64/ia32/sys_ia32.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c index 33cd0a4fe388..b0e82c7947dc 100644 --- a/arch/x86_64/ia32/sys_ia32.c +++ b/arch/x86_64/ia32/sys_ia32.c @@ -780,7 +780,7 @@ asmlinkage long sys32_mmap2(unsigned long addr, unsigned long len, asmlinkage long sys32_olduname(struct oldold_utsname __user * name) { - int error; + int err; if (!name) return -EFAULT; @@ -789,27 +789,31 @@ asmlinkage long sys32_olduname(struct oldold_utsname __user * name) down_read(&uts_sem); - error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN); - __put_user(0,name->sysname+__OLD_UTS_LEN); - __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN); - __put_user(0,name->nodename+__OLD_UTS_LEN); - __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN); - __put_user(0,name->release+__OLD_UTS_LEN); - __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN); - __put_user(0,name->version+__OLD_UTS_LEN); + err = __copy_to_user(&name->sysname,&system_utsname.sysname, + __OLD_UTS_LEN); + err |= __put_user(0,name->sysname+__OLD_UTS_LEN); + err |= __copy_to_user(&name->nodename,&system_utsname.nodename, + __OLD_UTS_LEN); + err |= __put_user(0,name->nodename+__OLD_UTS_LEN); + err |= __copy_to_user(&name->release,&system_utsname.release, + __OLD_UTS_LEN); + err |= __put_user(0,name->release+__OLD_UTS_LEN); + err |= __copy_to_user(&name->version,&system_utsname.version, + __OLD_UTS_LEN); + err |= __put_user(0,name->version+__OLD_UTS_LEN); { char *arch = "x86_64"; if (personality(current->personality) == PER_LINUX32) arch = "i686"; - __copy_to_user(&name->machine,arch,strlen(arch)+1); + err |= __copy_to_user(&name->machine,arch,strlen(arch)+1); } up_read(&uts_sem); - error = error ? -EFAULT : 0; + err = err ? -EFAULT : 0; - return error; + return err; } long sys32_uname(struct old_utsname __user * name) -- cgit v1.2.1 From 383d079bfdfcfccd6a720405a510fe325b3e6576 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:40 +0200 Subject: [PATCH] Fix some stylistic issues in uaccess.h - Replace some broken white space. - Replace __ keywords with standard names No functional changes. Signed-off-by: Andi Kleen --- include/asm-x86_64/uaccess.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/include/asm-x86_64/uaccess.h b/include/asm-x86_64/uaccess.h index 802a4a068ef6..e856570c0689 100644 --- a/include/asm-x86_64/uaccess.h +++ b/include/asm-x86_64/uaccess.h @@ -84,7 +84,7 @@ struct exception_table_entry */ #define __get_user_x(size,ret,x,ptr) \ - __asm__ __volatile__("call __get_user_" #size \ + asm volatile("call __get_user_" #size \ :"=a" (ret),"=d" (x) \ :"c" (ptr) \ :"r8") @@ -101,7 +101,7 @@ struct exception_table_entry case 8: __get_user_x(8,__ret_gu,__val_gu,ptr); break; \ default: __get_user_bad(); break; \ } \ - (x) = (__typeof__(*(ptr)))__val_gu; \ + (x) = (typeof(*(ptr)))__val_gu; \ __ret_gu; \ }) @@ -112,7 +112,7 @@ extern void __put_user_8(void); extern void __put_user_bad(void); #define __put_user_x(size,ret,x,ptr) \ - __asm__ __volatile__("call __put_user_" #size \ + asm volatile("call __put_user_" #size \ :"=a" (ret) \ :"c" (ptr),"d" (x) \ :"r8") @@ -139,7 +139,7 @@ extern void __put_user_bad(void); #define __put_user_check(x,ptr,size) \ ({ \ int __pu_err; \ - __typeof__(*(ptr)) __user *__pu_addr = (ptr); \ + typeof(*(ptr)) __user *__pu_addr = (ptr); \ switch (size) { \ case 1: __put_user_x(1,__pu_err,x,__pu_addr); break; \ case 2: __put_user_x(2,__pu_err,x,__pu_addr); break; \ @@ -173,7 +173,7 @@ struct __large_struct { unsigned long buf[100]; }; * aliasing issues. */ #define __put_user_asm(x, addr, err, itype, rtype, ltype, errno) \ - __asm__ __volatile__( \ + asm volatile( \ "1: mov"itype" %"rtype"1,%2\n" \ "2:\n" \ ".section .fixup,\"ax\"\n" \ @@ -193,7 +193,7 @@ struct __large_struct { unsigned long buf[100]; }; int __gu_err; \ unsigned long __gu_val; \ __get_user_size(__gu_val,(ptr),(size),__gu_err); \ - (x) = (__typeof__(*(ptr)))__gu_val; \ + (x) = (typeof(*(ptr)))__gu_val; \ __gu_err; \ }) @@ -217,7 +217,7 @@ do { \ } while (0) #define __get_user_asm(x, addr, err, itype, rtype, ltype, errno) \ - __asm__ __volatile__( \ + asm volatile( \ "1: mov"itype" %2,%"rtype"1\n" \ "2:\n" \ ".section .fixup,\"ax\"\n" \ @@ -250,7 +250,7 @@ copy_in_user(void __user *to, const void __user *from, unsigned len); static __always_inline __must_check int __copy_from_user(void *dst, const void __user *src, unsigned size) { - int ret = 0; + int ret = 0; if (!__builtin_constant_p(size)) return copy_user_generic(dst,(__force void *)src,size); switch (size) { @@ -280,7 +280,7 @@ int __copy_from_user(void *dst, const void __user *src, unsigned size) static __always_inline __must_check int __copy_to_user(void __user *dst, const void *src, unsigned size) { - int ret = 0; + int ret = 0; if (!__builtin_constant_p(size)) return copy_user_generic((__force void *)dst,src,size); switch (size) { @@ -312,7 +312,7 @@ int __copy_to_user(void __user *dst, const void *src, unsigned size) static __always_inline __must_check int __copy_in_user(void __user *dst, const void __user *src, unsigned size) { - int ret = 0; + int ret = 0; if (!__builtin_constant_p(size)) return copy_user_generic((__force void *)dst,(__force void *)src,size); switch (size) { -- cgit v1.2.1 From c1a9d41f4f103bfef2ed0bea1e95b3190e39e448 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:40 +0200 Subject: [PATCH] Reindent macros in pda.h Reindent the macros in x86-64 pda.h, making them much more readable. Follows Jeremy's i386 version of this. No functional changes Signed-off-by: Andi Kleen --- include/asm-x86_64/pda.h | 85 ++++++++++++++++++++++++++++++------------------ 1 file changed, 53 insertions(+), 32 deletions(-) diff --git a/include/asm-x86_64/pda.h b/include/asm-x86_64/pda.h index 5dadb201f769..9e3aaf74475d 100644 --- a/include/asm-x86_64/pda.h +++ b/include/asm-x86_64/pda.h @@ -41,46 +41,67 @@ extern struct x8664_pda boot_cpu_pda[]; */ extern void __bad_pda_field(void); -/* proxy_pda doesn't actually exist, but tell gcc it is accessed - for all PDA accesses so it gets read/write dependencies right. */ +/* + * proxy_pda doesn't actually exist, but tell gcc it is accessed for + * all PDA accesses so it gets read/write dependencies right. + */ extern struct x8664_pda _proxy_pda; #define pda_offset(field) offsetof(struct x8664_pda, field) -#define pda_to_op(op,field,val) do { \ - typedef typeof(_proxy_pda.field) T__; \ - if (0) { T__ tmp__; tmp__ = (val); } \ - switch (sizeof(_proxy_pda.field)) { \ -case 2: \ -asm(op "w %1,%%gs:%c2" : "+m" (_proxy_pda.field) : \ - "ri" ((T__)val),"i"(pda_offset(field))); break; \ -case 4: \ -asm(op "l %1,%%gs:%c2" : "+m" (_proxy_pda.field) : \ - "ri" ((T__)val),"i"(pda_offset(field))); break; \ -case 8: \ -asm(op "q %1,%%gs:%c2": "+m" (_proxy_pda.field) : \ - "ri" ((T__)val),"i"(pda_offset(field))); break; \ -default: __bad_pda_field(); \ - } \ +#define pda_to_op(op,field,val) do { \ + typedef typeof(_proxy_pda.field) T__; \ + if (0) { T__ tmp__; tmp__ = (val); } /* type checking */ \ + switch (sizeof(_proxy_pda.field)) { \ + case 2: \ + asm(op "w %1,%%gs:%c2" : \ + "+m" (_proxy_pda.field) : \ + "ri" ((T__)val), \ + "i"(pda_offset(field))); \ + break; \ + case 4: \ + asm(op "l %1,%%gs:%c2" : \ + "+m" (_proxy_pda.field) : \ + "ri" ((T__)val), \ + "i" (pda_offset(field))); \ + break; \ + case 8: \ + asm(op "q %1,%%gs:%c2": \ + "+m" (_proxy_pda.field) : \ + "ri" ((T__)val), \ + "i"(pda_offset(field))); \ + break; \ + default: \ + __bad_pda_field(); \ + } \ } while (0) -#define pda_from_op(op,field) ({ \ - typeof(_proxy_pda.field) ret__; \ - switch (sizeof(_proxy_pda.field)) { \ -case 2: \ -asm(op "w %%gs:%c1,%0":"=r" (ret__):\ - "i" (pda_offset(field)), "m" (_proxy_pda.field)); break;\ -case 4: \ -asm(op "l %%gs:%c1,%0":"=r" (ret__):\ - "i" (pda_offset(field)), "m" (_proxy_pda.field)); break;\ -case 8: \ -asm(op "q %%gs:%c1,%0":"=r" (ret__):\ - "i" (pda_offset(field)), "m" (_proxy_pda.field)); break;\ -default: __bad_pda_field(); \ - } \ +#define pda_from_op(op,field) ({ \ + typeof(_proxy_pda.field) ret__; \ + switch (sizeof(_proxy_pda.field)) { \ + case 2: \ + asm(op "w %%gs:%c1,%0" : \ + "=r" (ret__) : \ + "i" (pda_offset(field)), \ + "m" (_proxy_pda.field)); \ + break; \ + case 4: \ + asm(op "l %%gs:%c1,%0": \ + "=r" (ret__): \ + "i" (pda_offset(field)), \ + "m" (_proxy_pda.field)); \ + break; \ + case 8: \ + asm(op "q %%gs:%c1,%0": \ + "=r" (ret__) : \ + "i" (pda_offset(field)), \ + "m" (_proxy_pda.field)); \ + break; \ + default: \ + __bad_pda_field(); \ + } \ ret__; }) - #define read_pda(field) pda_from_op("mov",field) #define write_pda(field,val) pda_to_op("mov",field,val) #define add_pda(field,val) pda_to_op("add",field,val) -- cgit v1.2.1 From fd167e42b237e0688005b3dec380eb5a6e5f3585 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:40 +0200 Subject: [PATCH] Define __bad_pda_field as noreturn This quietens so warnings about uninitialized use of the return value of the pda read operations. Signed-off-by: Andi Kleen --- include/asm-x86_64/pda.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-x86_64/pda.h b/include/asm-x86_64/pda.h index 9e3aaf74475d..531f48a6c3af 100644 --- a/include/asm-x86_64/pda.h +++ b/include/asm-x86_64/pda.h @@ -39,7 +39,7 @@ extern struct x8664_pda boot_cpu_pda[]; * There is no fast way to get the base address of the PDA, all the accesses * have to mention %fs/%gs. So it needs to be done this Torvaldian way. */ -extern void __bad_pda_field(void); +extern void __bad_pda_field(void) __attribute__((noreturn)); /* * proxy_pda doesn't actually exist, but tell gcc it is accessed for -- cgit v1.2.1 From 73bb5117a448bdf0b56232ca28451fe4c534cb3a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:40 +0200 Subject: [PATCH] Remove unused asm-x86_64/mmx.h Signed-off-by: Andi Kleen --- include/asm-x86_64/mmx.h | 14 -------------- 1 file changed, 14 deletions(-) delete mode 100644 include/asm-x86_64/mmx.h diff --git a/include/asm-x86_64/mmx.h b/include/asm-x86_64/mmx.h deleted file mode 100644 index 46b71da99869..000000000000 --- a/include/asm-x86_64/mmx.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef _ASM_MMX_H -#define _ASM_MMX_H - -/* - * MMX 3Dnow! helper operations - */ - -#include - -extern void *_mmx_memcpy(void *to, const void *from, size_t size); -extern void mmx_clear_page(void *page); -extern void mmx_copy_page(void *to, void *from); - -#endif -- cgit v1.2.1 From b38337a624c4d3c2c3d9cdf27d952ca94181c6a8 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:40 +0200 Subject: [PATCH] Mark per cpu data initialization __initdata again Before 2.6.16 this was changed to work around code that accessed CPUs not in the possible map. But that code should be all fixed now, so mark it __initdata again. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/vmlinux.lds.S | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index 2802aa963fa3..d0564f1bcb0b 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -216,14 +216,12 @@ SECTIONS __initramfs_start = .; .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) } __initramfs_end = .; - /* temporary here to work around NR_CPUS. If you see this comment in 2.6.17+ - complain */ - . = ALIGN(4096); - __init_end = .; - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); __per_cpu_start = .; .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) } __per_cpu_end = .; + . = ALIGN(4096); + __init_end = .; . = ALIGN(4096); __nosave_begin = .; -- cgit v1.2.1 From 371c2f2783eb880df333d0e9ac35c78143e79d0a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 26 Sep 2006 10:52:40 +0200 Subject: [PATCH] i386: Remove experimental mark of kexec kexec has been marked experimental for a year now and all of the serious problems have been worked through. So it is time (if not past time) to remove the experimental mark. Signed-off-by: Eric W. Biederman Signed-off-by: Andi Kleen --- arch/i386/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index e34a4dbd6b92..9b6cddc8c51b 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -739,8 +739,7 @@ config SECCOMP source kernel/Kconfig.hz config KEXEC - bool "kexec system call (EXPERIMENTAL)" - depends on EXPERIMENTAL + bool "kexec system call" help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot -- cgit v1.2.1 From 1c9c0a6ca35e9325cea811d734d6ab7352be086b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 26 Sep 2006 10:52:40 +0200 Subject: [PATCH] Remove experimental mark of kexec kexec has been marked experimental for a year now and all of the serious problems have been worked through. So it is time (if not past time) to remove the experimental mark. Signed-off-by: Eric W. Biederman Signed-off-by: Andi Kleen --- arch/x86_64/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 0c61d0019dd7..1f4212605ef8 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -468,8 +468,7 @@ config X86_MCE_AMD the DRAM Error Threshold. config KEXEC - bool "kexec system call (EXPERIMENTAL)" - depends on EXPERIMENTAL + bool "kexec system call" help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot -- cgit v1.2.1 From a15da49debaf7f09460a886b0ecd08588410715e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:40 +0200 Subject: [PATCH] Fix idle notifiers Previously exit_idle would be called more often than enter_idle Now instead of using complicated tests just keep track of it using the per CPU variable as a flip flop. I moved the idle state into the PDA to make the access more efficient. Original bug report and an initial patch from Stephane Eranian, but redone by AK. Cc: Stephane Eranian Signed-off-by: Andi Kleen --- arch/x86_64/kernel/process.c | 15 +++++++++------ include/asm-x86_64/pda.h | 3 ++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index 885c318f76ab..458006ae19f3 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -80,25 +80,25 @@ void idle_notifier_unregister(struct notifier_block *n) } EXPORT_SYMBOL(idle_notifier_unregister); -enum idle_state { CPU_IDLE, CPU_NOT_IDLE }; -static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE; - void enter_idle(void) { - __get_cpu_var(idle_state) = CPU_IDLE; + write_pda(isidle, 1); atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); } static void __exit_idle(void) { - __get_cpu_var(idle_state) = CPU_NOT_IDLE; + if (read_pda(isidle) == 0) + return; + write_pda(isidle, 0); atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); } /* Called from interrupts to signify idle end */ void exit_idle(void) { - if (current->pid | read_pda(irqcount)) + /* idle loop has pid 0 */ + if (current->pid) return; __exit_idle(); } @@ -220,6 +220,9 @@ void cpu_idle (void) play_dead(); enter_idle(); idle(); + /* In many cases the interrupt that ended idle + has already called exit_idle. But some idle + loops can be woken up without interrupt. */ __exit_idle(); } diff --git a/include/asm-x86_64/pda.h b/include/asm-x86_64/pda.h index 531f48a6c3af..14996d962bac 100644 --- a/include/asm-x86_64/pda.h +++ b/include/asm-x86_64/pda.h @@ -25,7 +25,8 @@ struct x8664_pda { int nodenumber; /* number of current node */ unsigned int __softirq_pending; unsigned int __nmi_count; /* number of NMI on this CPUs */ - int mmu_state; + short mmu_state; + short isidle; struct mm_struct *active_mm; unsigned apic_timer_irqs; } ____cacheline_aligned_in_smp; -- cgit v1.2.1 From 5e544d618f0fb21011f36f28d5e3952b9dc109d2 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:40 +0200 Subject: [PATCH] i386/x86-64: PCI: split probing and initialization of type 1 config space access First probe if type1/2 accesses work, but then only initialize them at the end. This is useful for a later patch that needs this information inbetween. Signed-off-by: Andi Kleen --- arch/i386/pci/direct.c | 25 ++++++++++++++++--------- arch/i386/pci/init.c | 9 +++++++-- arch/i386/pci/mmconfig.c | 2 +- arch/i386/pci/pci.h | 6 ++++-- arch/x86_64/pci/mmconfig.c | 2 +- 5 files changed, 29 insertions(+), 15 deletions(-) diff --git a/arch/i386/pci/direct.c b/arch/i386/pci/direct.c index 5d81fb510375..5acf0b4743cf 100644 --- a/arch/i386/pci/direct.c +++ b/arch/i386/pci/direct.c @@ -254,7 +254,16 @@ static int __init pci_check_type2(void) return works; } -void __init pci_direct_init(void) +void __init pci_direct_init(int type) +{ + printk(KERN_INFO "PCI: Using configuration type %d\n", type); + if (type == 1) + raw_pci_ops = &pci_direct_conf1; + else + raw_pci_ops = &pci_direct_conf2; +} + +int __init pci_direct_probe(void) { struct resource *region, *region2; @@ -264,19 +273,16 @@ void __init pci_direct_init(void) if (!region) goto type2; - if (pci_check_type1()) { - printk(KERN_INFO "PCI: Using configuration type 1\n"); - raw_pci_ops = &pci_direct_conf1; - return; - } + if (pci_check_type1()) + return 1; release_resource(region); type2: if ((pci_probe & PCI_PROBE_CONF2) == 0) - return; + return 0; region = request_region(0xCF8, 4, "PCI conf2"); if (!region) - return; + return 0; region2 = request_region(0xC000, 0x1000, "PCI conf2"); if (!region2) goto fail2; @@ -284,10 +290,11 @@ void __init pci_direct_init(void) if (pci_check_type2()) { printk(KERN_INFO "PCI: Using configuration type 2\n"); raw_pci_ops = &pci_direct_conf2; - return; + return 2; } release_resource(region2); fail2: release_resource(region); + return 0; } diff --git a/arch/i386/pci/init.c b/arch/i386/pci/init.c index 51087a9d9172..d028e1b05c36 100644 --- a/arch/i386/pci/init.c +++ b/arch/i386/pci/init.c @@ -6,8 +6,13 @@ in the right sequence from here. */ static __init int pci_access_init(void) { + int type = 0; + +#ifdef CONFIG_PCI_DIRECT + type = pci_direct_probe(); +#endif #ifdef CONFIG_PCI_MMCONFIG - pci_mmcfg_init(); + pci_mmcfg_init(type); #endif if (raw_pci_ops) return 0; @@ -21,7 +26,7 @@ static __init int pci_access_init(void) * fails. */ #ifdef CONFIG_PCI_DIRECT - pci_direct_init(); + pci_direct_init(type); #endif return 0; } diff --git a/arch/i386/pci/mmconfig.c b/arch/i386/pci/mmconfig.c index 972180f738d9..44155c5e85d1 100644 --- a/arch/i386/pci/mmconfig.c +++ b/arch/i386/pci/mmconfig.c @@ -187,7 +187,7 @@ static __init void unreachable_devices(void) } } -void __init pci_mmcfg_init(void) +void __init pci_mmcfg_init(int type) { if ((pci_probe & PCI_PROBE_MMCONF) == 0) return; diff --git a/arch/i386/pci/pci.h b/arch/i386/pci/pci.h index bf4e79335388..8a7cf1f23684 100644 --- a/arch/i386/pci/pci.h +++ b/arch/i386/pci/pci.h @@ -81,7 +81,9 @@ extern int pci_conf1_write(unsigned int seg, unsigned int bus, extern int pci_conf1_read(unsigned int seg, unsigned int bus, unsigned int devfn, int reg, int len, u32 *value); -extern void pci_direct_init(void); +extern int pci_direct_probe(void); +extern void pci_direct_init(int type); extern void pci_pcbios_init(void); -extern void pci_mmcfg_init(void); +extern void pci_mmcfg_init(int type); extern void pcibios_sort(void); + diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c index dd78f4d18df1..d6b90d08476b 100644 --- a/arch/x86_64/pci/mmconfig.c +++ b/arch/x86_64/pci/mmconfig.c @@ -163,7 +163,7 @@ static __init void unreachable_devices(void) } } -void __init pci_mmcfg_init(void) +void __init pci_mmcfg_init(int type) { int i; -- cgit v1.2.1 From 9abd79280bbb9f56049f0168f412c3538cadb6eb Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:40 +0200 Subject: [PATCH] i386/x86-64: Only do MCFG e820 check when type 1 works Needs earlier patch to split type 1 probing from use. This patch should fix the x86 macs where type 1 PCI config space access doesn't work, but MCFG does. They also don't have a usable e820 table so the e820 sanity check failed. Instead assume now that if type 1 doesn't work then MCFG must work and don't do the e820 check. Signed-off-by: Andi Kleen --- arch/i386/pci/mmconfig.c | 4 +++- arch/x86_64/pci/mmconfig.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/i386/pci/mmconfig.c b/arch/i386/pci/mmconfig.c index 44155c5e85d1..2c4f585510c9 100644 --- a/arch/i386/pci/mmconfig.c +++ b/arch/i386/pci/mmconfig.c @@ -198,7 +198,9 @@ void __init pci_mmcfg_init(int type) (pci_mmcfg_config[0].base_address == 0)) return; - if (!e820_all_mapped(pci_mmcfg_config[0].base_address, + /* Only do this check when type 1 works. If it doesn't work + assume we run on a Mac and always use MCFG */ + if (type == 1 && !e820_all_mapped(pci_mmcfg_config[0].base_address, pci_mmcfg_config[0].base_address + MMCONFIG_APER_MIN, E820_RESERVED)) { printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %x is not E820-reserved\n", diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c index d6b90d08476b..8bea51030d66 100644 --- a/arch/x86_64/pci/mmconfig.c +++ b/arch/x86_64/pci/mmconfig.c @@ -176,7 +176,9 @@ void __init pci_mmcfg_init(int type) (pci_mmcfg_config[0].base_address == 0)) return; - if (!e820_all_mapped(pci_mmcfg_config[0].base_address, + /* Only do this check when type 1 works. If it doesn't work + assume we run on a Mac and always use MCFG */ + if (type == 1 && !e820_all_mapped(pci_mmcfg_config[0].base_address, pci_mmcfg_config[0].base_address + MMCONFIG_APER_MIN, E820_RESERVED)) { printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %x is not E820-reserved\n", -- cgit v1.2.1 From 56dd669a138c40ea6cdae487f233430d12372767 Mon Sep 17 00:00:00 2001 From: Aaron Durbin Date: Tue, 26 Sep 2006 10:52:40 +0200 Subject: [PATCH] Insert GART region into resource map Patch inserts the GART region into the iomem resource map. The GART will then be visible within /proc/iomem. It will also allow for other users utilizing the GART to subreserve the region (agp or IOMMU). Signed-off-by: Aaron Durbin Signed-off-by: Andi Kleen --- arch/x86_64/kernel/aperture.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c index 04dbf1662523..f851e1f9c82a 100644 --- a/arch/x86_64/kernel/aperture.c +++ b/arch/x86_64/kernel/aperture.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -33,6 +34,18 @@ int fallback_aper_force __initdata = 0; int fix_aperture __initdata = 1; +static struct resource gart_resource = { + .name = "GART", + .flags = IORESOURCE_MEM, +}; + +static void __init insert_aperture_resource(u32 aper_base, u32 aper_size) +{ + gart_resource.start = aper_base; + gart_resource.end = aper_base + aper_size - 1; + insert_resource(&iomem_resource, &gart_resource); +} + /* This code runs before the PCI subsystem is initialized, so just access the northbridge directly. */ @@ -62,6 +75,7 @@ static u32 __init allocate_aperture(void) } printk("Mapping aperture over %d KB of RAM @ %lx\n", aper_size >> 10, __pa(p)); + insert_aperture_resource((u32)__pa(p), aper_size); return (u32)__pa(p); } @@ -233,8 +247,13 @@ void __init iommu_hole_init(void) last_aper_base = aper_base; } - if (!fix && !fallback_aper_force) + if (!fix && !fallback_aper_force) { + if (last_aper_base) { + unsigned long n = (32 * 1024 * 1024) << last_aper_order; + insert_aperture_resource((u32)last_aper_base, n); + } return; + } if (!fallback_aper_force) aper_alloc = search_agp_bridge(&aper_order, &valid_agp); -- cgit v1.2.1 From 4c6e052adfe285ede5884e4e8c4d33af33932c13 Mon Sep 17 00:00:00 2001 From: Aaron Durbin Date: Tue, 26 Sep 2006 10:52:40 +0200 Subject: [PATCH] MMCONFIG and new Intel motherboards On Sat, Sep 09, 2006 at 04:14:29PM +0200, Andi Kleen wrote: > [patch] Looks reasonable, but probably not for 2.6.18 because this stuff > is already too fragile and it is probably too risky to do any big changes now > since not enough testing time is left. Can you please resubmit > it with proper description and signed-off-by line? I can queue it for .19 then > > -Andi Patch inserts PCI memory mapped config region(s) into the resource map. This will allow for the MMCCONFIG regions to be marked as busy in the iomem address space as well as the regions(s) showing up in /proc/iomem. Signed-off-by: Aaron Durbin Signed-off-by: Andi Kleen --- arch/x86_64/pci/mmconfig.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c index 8bea51030d66..8c0722b95cdc 100644 --- a/arch/x86_64/pci/mmconfig.c +++ b/arch/x86_64/pci/mmconfig.c @@ -163,6 +163,37 @@ static __init void unreachable_devices(void) } } +static __init void pci_mmcfg_insert_resources(void) +{ +#define PCI_MMCFG_RESOURCE_NAME_LEN 19 + int i; + struct resource *res; + char *names; + unsigned num_buses; + + res = kcalloc(PCI_MMCFG_RESOURCE_NAME_LEN + sizeof(*res), + pci_mmcfg_config_num, GFP_KERNEL); + + if (!res) { + printk(KERN_ERR "PCI: Unable to allocate MMCONFIG resources\n"); + return; + } + + names = (void *)&res[pci_mmcfg_config_num]; + for (i = 0; i < pci_mmcfg_config_num; i++, res++) { + num_buses = pci_mmcfg_config[i].end_bus_number - + pci_mmcfg_config[i].start_bus_number + 1; + res->name = names; + snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN, "PCI MMCONFIG %u", + pci_mmcfg_config[i].pci_segment_group_number); + res->start = pci_mmcfg_config[i].base_address; + res->end = res->start + (num_buses << 20) - 1; + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + insert_resource(&iomem_resource, res); + names += PCI_MMCFG_RESOURCE_NAME_LEN; + } +} + void __init pci_mmcfg_init(int type) { int i; @@ -206,6 +237,7 @@ void __init pci_mmcfg_init(int type) } unreachable_devices(); + pci_mmcfg_insert_resources(); raw_pci_ops = &pci_mmcfg; pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; -- cgit v1.2.1 From de09bddb9d6f96785be470c832b881e6d72d589f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:40 +0200 Subject: [PATCH] i386: Add MMCFG resources to i386 too Following earlier x86-64 patch Cc: gregkh@suse.de Signed-off-by: Andi Kleen --- arch/i386/pci/mmconfig.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/arch/i386/pci/mmconfig.c b/arch/i386/pci/mmconfig.c index 2c4f585510c9..05be8db58a8c 100644 --- a/arch/i386/pci/mmconfig.c +++ b/arch/i386/pci/mmconfig.c @@ -151,6 +151,38 @@ static struct pci_raw_ops pci_mmcfg = { .write = pci_mmcfg_write, }; + +static __init void pci_mmcfg_insert_resources(void) +{ +#define PCI_MMCFG_RESOURCE_NAME_LEN 19 + int i; + struct resource *res; + char *names; + unsigned num_buses; + + res = kcalloc(PCI_MMCFG_RESOURCE_NAME_LEN + sizeof(*res), + pci_mmcfg_config_num, GFP_KERNEL); + + if (!res) { + printk(KERN_ERR "PCI: Unable to allocate MMCONFIG resources\n"); + return; + } + + names = (void *)&res[pci_mmcfg_config_num]; + for (i = 0; i < pci_mmcfg_config_num; i++, res++) { + num_buses = pci_mmcfg_config[i].end_bus_number - + pci_mmcfg_config[i].start_bus_number + 1; + res->name = names; + snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN, "PCI MMCONFIG %u", + pci_mmcfg_config[i].pci_segment_group_number); + res->start = pci_mmcfg_config[i].base_address; + res->end = res->start + (num_buses << 20) - 1; + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + insert_resource(&iomem_resource, res); + names += PCI_MMCFG_RESOURCE_NAME_LEN; + } +} + /* K8 systems have some devices (typically in the builtin northbridge) that are only accessible using type1 Normally this can be expressed in the MCFG by not listing them @@ -187,6 +219,8 @@ static __init void unreachable_devices(void) } } + + void __init pci_mmcfg_init(int type) { if ((pci_probe & PCI_PROBE_MMCONF) == 0) @@ -214,4 +248,5 @@ void __init pci_mmcfg_init(int type) pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; unreachable_devices(); + pci_mmcfg_insert_resources(); } -- cgit v1.2.1 From 2817716ace8c397685bd702223cc5a8a42c7e997 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 26 Sep 2006 10:52:40 +0200 Subject: [PATCH] i386: Fix pack_descriptor() Fix pack_descriptor: 1. flags are bits 20-23 in the high word 2. limit's 4 msb are bits 16-19 in the high word These haven't mattered so far, because all users have had small limits and a flags setting of 0. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen =================================================================== --- include/asm-i386/desc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-i386/desc.h b/include/asm-i386/desc.h index 5db9e96e8dc1..5874ef119ffd 100644 --- a/include/asm-i386/desc.h +++ b/include/asm-i386/desc.h @@ -46,7 +46,7 @@ static inline void pack_descriptor(__u32 *a, __u32 *b, { *a = ((base & 0xffff) << 16) | (limit & 0xffff); *b = (base & 0xff000000) | ((base & 0xff0000) >> 16) | - ((type & 0xff) << 8) | ((flags & 0xf) << 12); + (limit & 0x000f0000) | ((type & 0xff) << 8) | ((flags & 0xf) << 20); } static inline void pack_gate(__u32 *a, __u32 *b, -- cgit v1.2.1 From f354b3a92af9b132b188b9c8ebbfb74de699926d Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Tue, 26 Sep 2006 10:52:41 +0200 Subject: [PATCH] i386: Split multi-line printk in oops output. Sometimes, bug reports come in where we've had an oops, and the only record we have is what the reporter saw on screen shortly before the system locked up completely. Unfortunatly, syslog only prints lines beginning with KERN_EMERG to the console, so some lines get lost. An example of this can be seen at https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=203723 Some of this information isn't vital to diagnosis, but some parts are useful, such as the tainted flag. Signed-off-by: Dave Jones Signed-off-by: Andi Kleen --- arch/i386/kernel/traps.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 5c0f4960c67d..c7adb076e811 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -351,8 +351,9 @@ void show_registers(struct pt_regs *regs) ss = regs->xss & 0xffff; } print_modules(); - printk(KERN_EMERG "CPU: %d\nEIP: %04x:[<%08lx>] %s VLI\n" - "EFLAGS: %08lx (%s %.*s) \n", + printk(KERN_EMERG "CPU: %d\n" + KERN_EMERG "EIP: %04x:[<%08lx>] %s VLI\n" + KERN_EMERG "EFLAGS: %08lx (%s %.*s)\n", smp_processor_id(), 0xffff & regs->xcs, regs->eip, print_tainted(), regs->eflags, system_utsname.release, (int)strcspn(system_utsname.version, " "), -- cgit v1.2.1 From 35d534a3ff4905c0a7bddbad0fc9df55967742c7 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Tue, 26 Sep 2006 10:52:41 +0200 Subject: [PATCH] x86: - restore i8259A eoi status on resume Got it. i8259A_resume calls init_8259A(0) unconditionally, even if auto_eoi has been set. Keep track of the current status and restore that on resume. This fixes it for AMD64 and i386. Signed-off-by: Matthew Garrett Signed-off-by: Andi Kleen --- arch/i386/kernel/i8259.c | 6 +++++- arch/x86_64/kernel/i8259.c | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c index d4756d154f47..ea5f4e7958d8 100644 --- a/arch/i386/kernel/i8259.c +++ b/arch/i386/kernel/i8259.c @@ -45,6 +45,8 @@ static void end_8259A_irq (unsigned int irq) #define shutdown_8259A_irq disable_8259A_irq +static int i8259A_auto_eoi; + static void mask_and_ack_8259A(unsigned int); unsigned int startup_8259A_irq(unsigned int irq) @@ -253,7 +255,7 @@ static void save_ELCR(char *trigger) static int i8259A_resume(struct sys_device *dev) { - init_8259A(0); + init_8259A(i8259A_auto_eoi); restore_ELCR(irq_trigger); return 0; } @@ -301,6 +303,8 @@ void init_8259A(int auto_eoi) { unsigned long flags; + i8259A_auto_eoi = auto_eoi; + spin_lock_irqsave(&i8259A_lock, flags); outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c index d2f5acde5749..2dd51f364ea2 100644 --- a/arch/x86_64/kernel/i8259.c +++ b/arch/x86_64/kernel/i8259.c @@ -123,6 +123,8 @@ void (*interrupt[NR_IRQS])(void) = { DEFINE_SPINLOCK(i8259A_lock); +static int i8259A_auto_eoi; + static void end_8259A_irq (unsigned int irq) { if (irq > 256) { @@ -336,6 +338,8 @@ void init_8259A(int auto_eoi) { unsigned long flags; + i8259A_auto_eoi = auto_eoi; + spin_lock_irqsave(&i8259A_lock, flags); outb(0xff, 0x21); /* mask all of 8259A-1 */ @@ -394,7 +398,7 @@ static void save_ELCR(char *trigger) static int i8259A_resume(struct sys_device *dev) { - init_8259A(0); + init_8259A(i8259A_auto_eoi); restore_ELCR(irq_trigger); return 0; } -- cgit v1.2.1 From 27fbe5b28a2ffef171c6005f304ea4f80fcdcc01 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:41 +0200 Subject: [PATCH] Use string instructions for Core2 copy/clear It is faster than using a unrolled loop for the use cases the kernel cares about (cached, sizes typically < 4K) Signed-off-by: Andi Kleen --- arch/x86_64/kernel/setup.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index e328e3eb8cbe..085f2e09b9dc 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -843,6 +843,8 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) if ((c->x86 == 0xf && c->x86_model >= 0x03) || (c->x86 == 0x6 && c->x86_model >= 0x0e)) set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); + if (c->x86 == 6) + set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); c->x86_max_cores = intel_num_cpu_cores(c); -- cgit v1.2.1 From 9ddab42d1e65dfbdb14588346d47fb142090ab40 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:41 +0200 Subject: [PATCH] Remove outdated comment in x86-64 mmconfig code Cc: gregkh@suse.de Signed-off-by: Andi Kleen --- arch/x86_64/pci/mmconfig.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c index 8c0722b95cdc..7732f4254d21 100644 --- a/arch/x86_64/pci/mmconfig.c +++ b/arch/x86_64/pci/mmconfig.c @@ -218,7 +218,6 @@ void __init pci_mmcfg_init(int type) return; } - /* RED-PEN i386 doesn't do _nocache right now */ pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) * pci_mmcfg_config_num, GFP_KERNEL); if (pci_mmcfg_virt == NULL) { printk("PCI: Can not allocate memory for mmconfig structures\n"); -- cgit v1.2.1 From 2049336f60d297c85ac977245b9326ec00396114 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:41 +0200 Subject: [PATCH] Don't synchronize time reading on single core AMD systems We do some additional CPU synchronization in gettimeofday et.al. to make sure the time stamps are always monotonic over multiple CPUs. But on single core systems that is not needed. So don't do it. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/setup.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 085f2e09b9dc..26524ce3b239 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -728,6 +728,12 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) /* Fix cpuid4 emulation for more */ num_cache_leaves = 3; + + /* When there is only one core no need to synchronize RDTSC */ + if (num_possible_cpus() == 1) + set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); + else + clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); } static void __cpuinit detect_ht(struct cpuinfo_x86 *c) -- cgit v1.2.1 From dd54a11004b2c9a1f136225f880e021a43b0eadc Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:41 +0200 Subject: [PATCH] Remove all traces of signal number conversion This was old code that was needed for iBCS and x86-64 never supported that. Pointed out by Albert Cahalan Signed-off-by: Andi Kleen --- arch/x86_64/ia32/ia32_signal.c | 10 +--------- arch/x86_64/kernel/signal.c | 5 ----- 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c index 549de439fb2d..2d20f8304b31 100644 --- a/arch/x86_64/ia32/ia32_signal.c +++ b/arch/x86_64/ia32/ia32_signal.c @@ -431,15 +431,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) goto give_sigsegv; - { - struct exec_domain *ed = current_thread_info()->exec_domain; - err |= __put_user((ed - && ed->signal_invmap - && sig < 32 - ? ed->signal_invmap[sig] - : sig), - &frame->sig); - } + err |= __put_user(sig, &frame->sig); if (err) goto give_sigsegv; diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c index 7f58bc9a056d..49ec324cd141 100644 --- a/arch/x86_64/kernel/signal.c +++ b/arch/x86_64/kernel/signal.c @@ -277,11 +277,6 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, #endif /* Set up registers for signal handler */ - { - struct exec_domain *ed = current_thread_info()->exec_domain; - if (unlikely(ed && ed->signal_invmap && sig < 32)) - sig = ed->signal_invmap[sig]; - } regs->rdi = sig; /* In case the signal handler was declared without prototypes */ regs->rax = 0; -- cgit v1.2.1 From 536e3ee4fed13d2d4bbf1b775174aba0cadf6aba Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:41 +0200 Subject: [PATCH] Initialize argument registers for 32bit signal handlers. In case the user space was compiled with -mregparm=3 Following i386. Pointed out by Albert Cahalan Signed-off-by: Andi Kleen --- arch/x86_64/ia32/ia32_signal.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c index 2d20f8304b31..ced15012a3d7 100644 --- a/arch/x86_64/ia32/ia32_signal.c +++ b/arch/x86_64/ia32/ia32_signal.c @@ -478,6 +478,11 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, regs->rsp = (unsigned long) frame; regs->rip = (unsigned long) ka->sa.sa_handler; + /* Make -mregparm=3 work */ + regs->rax = sig; + regs->rdx = 0; + regs->rcx = 0; + asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); -- cgit v1.2.1 From ab2e0b46cb9a197fab7d98e147cac7cd41a14047 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:41 +0200 Subject: [PATCH] Fix some broken white space in ia32_signal.c No functional changes Signed-off-by: Andi Kleen --- arch/x86_64/ia32/ia32_signal.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c index ced15012a3d7..a6ba9951e86c 100644 --- a/arch/x86_64/ia32/ia32_signal.c +++ b/arch/x86_64/ia32/ia32_signal.c @@ -490,9 +490,9 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, regs->ss = __USER32_DS; set_fs(USER_DS); - regs->eflags &= ~TF_MASK; - if (test_thread_flag(TIF_SINGLESTEP)) - ptrace_notify(SIGTRAP); + regs->eflags &= ~TF_MASK; + if (test_thread_flag(TIF_SINGLESTEP)) + ptrace_notify(SIGTRAP); #if DEBUG_SIG printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", @@ -587,8 +587,8 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, set_fs(USER_DS); regs->eflags &= ~TF_MASK; - if (test_thread_flag(TIF_SINGLESTEP)) - ptrace_notify(SIGTRAP); + if (test_thread_flag(TIF_SINGLESTEP)) + ptrace_notify(SIGTRAP); #if DEBUG_SIG printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", -- cgit v1.2.1 From adf1423698f00d00b267f7dca8231340ce7d65ef Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 26 Sep 2006 10:52:41 +0200 Subject: [PATCH] i386/x86-64: Work around gcc bug with noreturn functions in unwinder Current gcc generates calls not jumps to noreturn functions. When that happens the return address can point to the next function, which confuses the unwinder. This patch works around it by marking asynchronous exception frames in contrast normal call frames in the unwind information. Then teach the unwinder to decode this. For normal call frames the unwinder now subtracts one from the address which avoids this problem. The standard libgcc unwinder uses the same trick. It doesn't include adjustment of the printed address (i.e. for the original example, it'd still be kernel_math_error+0 that gets displayed, but the unwinder wouldn't get confused anymore. This only works with binutils 2.6.17+ and some versions of H.J.Lu's 2.6.16 unfortunately because earlier binutils don't support .cfi_signal_frame [AK: added automatic detection of the new binutils and wrote description] Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/i386/Makefile | 4 ++++ arch/i386/kernel/entry.S | 4 ++++ arch/x86_64/Makefile | 4 ++++ arch/x86_64/ia32/ia32entry.S | 4 ++++ arch/x86_64/kernel/entry.S | 4 ++++ include/asm-i386/dwarf2.h | 7 +++++++ include/asm-i386/unwind.h | 5 +++++ include/asm-x86_64/dwarf2.h | 6 ++++++ include/asm-x86_64/unwind.h | 5 +++++ kernel/unwind.c | 35 ++++++++++++++++++++++++++++------- scripts/Kbuild.include | 4 ++-- 11 files changed, 73 insertions(+), 9 deletions(-) diff --git a/arch/i386/Makefile b/arch/i386/Makefile index 508cdbeb3a09..7cc0b189b82b 100644 --- a/arch/i386/Makefile +++ b/arch/i386/Makefile @@ -50,6 +50,10 @@ CFLAGS += $(shell if [ $(call cc-version) -lt 0400 ] ; then echo $(call cc-op cflags-y += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,) AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,) +# is .cfi_signal_frame supported too? +cflags-y += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,) +AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,) + CFLAGS += $(cflags-y) # Default subarch .c files diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 284f2e908ad0..5a63d6fdb70e 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -183,18 +183,21 @@ VM_MASK = 0x00020000 #define RING0_INT_FRAME \ CFI_STARTPROC simple;\ + CFI_SIGNAL_FRAME;\ CFI_DEF_CFA esp, 3*4;\ /*CFI_OFFSET cs, -2*4;*/\ CFI_OFFSET eip, -3*4 #define RING0_EC_FRAME \ CFI_STARTPROC simple;\ + CFI_SIGNAL_FRAME;\ CFI_DEF_CFA esp, 4*4;\ /*CFI_OFFSET cs, -2*4;*/\ CFI_OFFSET eip, -3*4 #define RING0_PTREGS_FRAME \ CFI_STARTPROC simple;\ + CFI_SIGNAL_FRAME;\ CFI_DEF_CFA esp, OLDESP-EBX;\ /*CFI_OFFSET cs, CS-OLDESP;*/\ CFI_OFFSET eip, EIP-OLDESP;\ @@ -275,6 +278,7 @@ need_resched: # sysenter call handler stub ENTRY(sysenter_entry) CFI_STARTPROC simple + CFI_SIGNAL_FRAME CFI_DEF_CFA esp, 0 CFI_REGISTER esp, ebp movl TSS_sysenter_esp0(%esp),%esp diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile index 2b8d07c70106..1c0f18d4f887 100644 --- a/arch/x86_64/Makefile +++ b/arch/x86_64/Makefile @@ -58,6 +58,10 @@ cflags-y += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,) cflags-y += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,) AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,) +# is .cfi_signal_frame supported too? +cflags-y += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,) +AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,) + cflags-$(CONFIG_CC_STACKPROTECTOR) += $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh $(CC) -fstack-protector ) cflags-$(CONFIG_CC_STACKPROTECTOR_ALL) += $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh $(CC) -fstack-protector-all ) diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 32fd32bea07c..b4aa875e175b 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -71,6 +71,7 @@ */ ENTRY(ia32_sysenter_target) CFI_STARTPROC32 simple + CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,0 CFI_REGISTER rsp,rbp swapgs @@ -186,6 +187,7 @@ ENDPROC(ia32_sysenter_target) */ ENTRY(ia32_cstar_target) CFI_STARTPROC32 simple + CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,PDA_STACKOFFSET CFI_REGISTER rip,rcx /*CFI_REGISTER rflags,r11*/ @@ -293,6 +295,7 @@ ia32_badarg: ENTRY(ia32_syscall) CFI_STARTPROC simple + CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,SS+8-RIP /*CFI_REL_OFFSET ss,SS-RIP*/ CFI_REL_OFFSET rsp,RSP-RIP @@ -370,6 +373,7 @@ ENTRY(ia32_ptregs_common) popq %r11 CFI_ENDPROC CFI_STARTPROC32 simple + CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,SS+8-ARGOFFSET CFI_REL_OFFSET rax,RAX-ARGOFFSET CFI_REL_OFFSET rcx,RCX-ARGOFFSET diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index ea32688386fd..4cbc65290ae7 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -123,6 +123,7 @@ .macro CFI_DEFAULT_STACK start=1 .if \start CFI_STARTPROC simple + CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,SS+8 .else CFI_DEF_CFA_OFFSET SS+8 @@ -207,6 +208,7 @@ END(ret_from_fork) ENTRY(system_call) CFI_STARTPROC simple + CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,PDA_STACKOFFSET CFI_REGISTER rip,rcx /*CFI_REGISTER rflags,r11*/ @@ -324,6 +326,7 @@ END(system_call) */ ENTRY(int_ret_from_sys_call) CFI_STARTPROC simple + CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,SS+8-ARGOFFSET /*CFI_REL_OFFSET ss,SS-ARGOFFSET*/ CFI_REL_OFFSET rsp,RSP-ARGOFFSET @@ -484,6 +487,7 @@ END(stub_rt_sigreturn) */ .macro _frame ref CFI_STARTPROC simple + CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,SS+8-\ref /*CFI_REL_OFFSET ss,SS-\ref*/ CFI_REL_OFFSET rsp,RSP-\ref diff --git a/include/asm-i386/dwarf2.h b/include/asm-i386/dwarf2.h index 5d1a8db5a9b0..6d66398a307d 100644 --- a/include/asm-i386/dwarf2.h +++ b/include/asm-i386/dwarf2.h @@ -28,6 +28,12 @@ #define CFI_RESTORE_STATE .cfi_restore_state #define CFI_UNDEFINED .cfi_undefined +#ifdef CONFIG_AS_CFI_SIGNAL_FRAME +#define CFI_SIGNAL_FRAME .cfi_signal_frame +#else +#define CFI_SIGNAL_FRAME +#endif + #else /* Due to the structure of pre-exisiting code, don't use assembler line @@ -48,6 +54,7 @@ #define CFI_REMEMBER_STATE ignore #define CFI_RESTORE_STATE ignore #define CFI_UNDEFINED ignore +#define CFI_SIGNAL_FRAME ignore #endif diff --git a/include/asm-i386/unwind.h b/include/asm-i386/unwind.h index f0ac399bae3c..5031d693b89d 100644 --- a/include/asm-i386/unwind.h +++ b/include/asm-i386/unwind.h @@ -18,6 +18,7 @@ struct unwind_frame_info { struct pt_regs regs; struct task_struct *task; + unsigned call_frame:1; }; #define UNW_PC(frame) (frame)->regs.eip @@ -44,6 +45,10 @@ struct unwind_frame_info PTREGS_INFO(edi), \ PTREGS_INFO(eip) +#define UNW_DEFAULT_RA(raItem, dataAlign) \ + ((raItem).where == Memory && \ + !((raItem).value * (dataAlign) + 4)) + static inline void arch_unw_init_frame_info(struct unwind_frame_info *info, /*const*/ struct pt_regs *regs) { diff --git a/include/asm-x86_64/dwarf2.h b/include/asm-x86_64/dwarf2.h index 2b9368365fad..eedc08526b0b 100644 --- a/include/asm-x86_64/dwarf2.h +++ b/include/asm-x86_64/dwarf2.h @@ -28,6 +28,11 @@ #define CFI_REMEMBER_STATE .cfi_remember_state #define CFI_RESTORE_STATE .cfi_restore_state #define CFI_UNDEFINED .cfi_undefined +#ifdef CONFIG_AS_CFI_SIGNAL_FRAME +#define CFI_SIGNAL_FRAME .cfi_signal_frame +#else +#define CFI_SIGNAL_FRAME +#endif #else @@ -45,6 +50,7 @@ #define CFI_REMEMBER_STATE # #define CFI_RESTORE_STATE # #define CFI_UNDEFINED # +#define CFI_SIGNAL_FRAME # #endif diff --git a/include/asm-x86_64/unwind.h b/include/asm-x86_64/unwind.h index 1f6e9bfb569e..b8fa5cb7ff88 100644 --- a/include/asm-x86_64/unwind.h +++ b/include/asm-x86_64/unwind.h @@ -18,6 +18,7 @@ struct unwind_frame_info { struct pt_regs regs; struct task_struct *task; + unsigned call_frame:1; }; #define UNW_PC(frame) (frame)->regs.rip @@ -57,6 +58,10 @@ struct unwind_frame_info PTREGS_INFO(r15), \ PTREGS_INFO(rip) +#define UNW_DEFAULT_RA(raItem, dataAlign) \ + ((raItem).where == Memory && \ + !((raItem).value * (dataAlign) + 8)) + static inline void arch_unw_init_frame_info(struct unwind_frame_info *info, /*const*/ struct pt_regs *regs) { diff --git a/kernel/unwind.c b/kernel/unwind.c index f69c804c8e62..3430475fcd88 100644 --- a/kernel/unwind.c +++ b/kernel/unwind.c @@ -603,6 +603,7 @@ int unwind(struct unwind_frame_info *frame) #define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs]) const u32 *fde = NULL, *cie = NULL; const u8 *ptr = NULL, *end = NULL; + unsigned long pc = UNW_PC(frame) - frame->call_frame; unsigned long startLoc = 0, endLoc = 0, cfa; unsigned i; signed ptrType = -1; @@ -612,7 +613,7 @@ int unwind(struct unwind_frame_info *frame) if (UNW_PC(frame) == 0) return -EINVAL; - if ((table = find_table(UNW_PC(frame))) != NULL + if ((table = find_table(pc)) != NULL && !(table->size & (sizeof(*fde) - 1))) { unsigned long tableSize = table->size; @@ -647,7 +648,7 @@ int unwind(struct unwind_frame_info *frame) ptrType & DW_EH_PE_indirect ? ptrType : ptrType & (DW_EH_PE_FORM|DW_EH_PE_signed)); - if (UNW_PC(frame) >= startLoc && UNW_PC(frame) < endLoc) + if (pc >= startLoc && pc < endLoc) break; cie = NULL; } @@ -657,16 +658,28 @@ int unwind(struct unwind_frame_info *frame) state.cieEnd = ptr; /* keep here temporarily */ ptr = (const u8 *)(cie + 2); end = (const u8 *)(cie + 1) + *cie; + frame->call_frame = 1; if ((state.version = *ptr) != 1) cie = NULL; /* unsupported version */ else if (*++ptr) { /* check if augmentation size is first (and thus present) */ if (*ptr == 'z') { - /* check for ignorable (or already handled) - * nul-terminated augmentation string */ - while (++ptr < end && *ptr) - if (strchr("LPR", *ptr) == NULL) + while (++ptr < end && *ptr) { + switch(*ptr) { + /* check for ignorable (or already handled) + * nul-terminated augmentation string */ + case 'L': + case 'P': + case 'R': + continue; + case 'S': + frame->call_frame = 0; + continue; + default: break; + } + break; + } } if (ptr >= end || *ptr) cie = NULL; @@ -755,7 +768,7 @@ int unwind(struct unwind_frame_info *frame) state.org = startLoc; memcpy(&state.cfa, &badCFA, sizeof(state.cfa)); /* process instructions */ - if (!processCFI(ptr, end, UNW_PC(frame), ptrType, &state) + if (!processCFI(ptr, end, pc, ptrType, &state) || state.loc > endLoc || state.regs[retAddrReg].where == Nowhere || state.cfa.reg >= ARRAY_SIZE(reg_info) @@ -763,6 +776,11 @@ int unwind(struct unwind_frame_info *frame) || state.cfa.offs % sizeof(unsigned long)) return -EIO; /* update frame */ +#ifndef CONFIG_AS_CFI_SIGNAL_FRAME + if(frame->call_frame + && !UNW_DEFAULT_RA(state.regs[retAddrReg], state.dataAlign)) + frame->call_frame = 0; +#endif cfa = FRAME_REG(state.cfa.reg, unsigned long) + state.cfa.offs; startLoc = min((unsigned long)UNW_SP(frame), cfa); endLoc = max((unsigned long)UNW_SP(frame), cfa); @@ -866,6 +884,7 @@ int unwind_init_frame_info(struct unwind_frame_info *info, /*const*/ struct pt_regs *regs) { info->task = tsk; + info->call_frame = 0; arch_unw_init_frame_info(info, regs); return 0; @@ -879,6 +898,7 @@ int unwind_init_blocked(struct unwind_frame_info *info, struct task_struct *tsk) { info->task = tsk; + info->call_frame = 0; arch_unw_init_blocked(info); return 0; @@ -894,6 +914,7 @@ int unwind_init_running(struct unwind_frame_info *info, void *arg) { info->task = current; + info->call_frame = 0; return arch_unwind_init_running(info, callback, arg); } diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include index 7adef12a0c26..4f5ff19b992b 100644 --- a/scripts/Kbuild.include +++ b/scripts/Kbuild.include @@ -66,8 +66,8 @@ as-option = $(shell if $(CC) $(CFLAGS) $(1) -Wa,-Z -c -o /dev/null \ # as-instr # Usage: cflags-y += $(call as-instr, instr, option1, option2) -as-instr = $(shell if echo -e "$(1)" | $(AS) -Z -o astest$$$$.out \ - 2>&1 >/dev/null ; then echo "$(2)"; else echo "$(3)"; fi; \ +as-instr = $(shell if echo -e "$(1)" | $(AS) >/dev/null 2>&1 -W -Z -o astest$$$$.out ; \ + then echo "$(2)"; else echo "$(3)"; fi; \ rm -f astest$$$$.out) # cc-option -- cgit v1.2.1 From 658fdbef66e5e9be79b457edc2cbbb3add840aa9 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:41 +0200 Subject: [PATCH] Don't leak NT bit into next task SYSENTER can cause a NT to be set which might cause crashes on the IRET in the next task. Following similar i386 patch from Linus. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/entry.S | 4 ++++ arch/x86_64/kernel/setup64.c | 4 ++++ include/asm-x86_64/system.h | 5 +++-- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 4cbc65290ae7..2802524104f3 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -155,6 +155,10 @@ /* rdi: prev */ ENTRY(ret_from_fork) CFI_DEFAULT_STACK + push kernel_eflags(%rip) + CFI_ADJUST_CFA_OFFSET 4 + popf # reset kernel eflags + CFI_ADJUST_CFA_OFFSET -4 call schedule_tail GET_THREAD_INFO(%rcx) testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx) diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index 9332d2361e08..8c4b80fe71a1 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -180,6 +180,8 @@ void __cpuinit check_efer(void) } } +unsigned long kernel_eflags; + /* * cpu_init() initializes state that is per-CPU. Some data is already * initialized (naturally) in the bootstrap process, such as the GDT @@ -281,4 +283,6 @@ void __cpuinit cpu_init (void) set_debugreg(0UL, 7); fpu_init(); + + raw_local_save_flags(kernel_eflags); } diff --git a/include/asm-x86_64/system.h b/include/asm-x86_64/system.h index 6bf170bceae1..bd376bc8c4ab 100644 --- a/include/asm-x86_64/system.h +++ b/include/asm-x86_64/system.h @@ -14,12 +14,13 @@ #define __RESTORE(reg,offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t" /* frame pointer must be last for get_wchan */ -#define SAVE_CONTEXT "pushq %%rbp ; movq %%rsi,%%rbp\n\t" -#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp\n\t" +#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" +#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t" #define __EXTRA_CLOBBER \ ,"rcx","rbx","rdx","r8","r9","r10","r11","r12","r13","r14","r15" +/* Save restore flags to clear handle leaking NT */ #define switch_to(prev,next,last) \ asm volatile(SAVE_CONTEXT \ "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ -- cgit v1.2.1 From f157cbb1eb9ce3f33a401ec6d20eb3eb852351a3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:41 +0200 Subject: [PATCH] i386/x86-64: Make all early PCI scans dependent on CONFIG_PCI This is useful on systems with broken PCI bus. Affects various scans in x86-64 and i386's early ACPI quirk scan. Cc: gregkh@suse.de Cc: len.brown@intel.com Cc: Trammell Hudson Signed-off-by: Andi Kleen --- arch/i386/kernel/acpi/Makefile | 2 ++ arch/i386/kernel/setup.c | 2 ++ arch/x86_64/Kconfig | 3 ++- arch/x86_64/kernel/Makefile | 3 ++- arch/x86_64/kernel/setup.c | 2 ++ 5 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/i386/kernel/acpi/Makefile b/arch/i386/kernel/acpi/Makefile index 7e9ac99354f4..7f7be01f44e6 100644 --- a/arch/i386/kernel/acpi/Makefile +++ b/arch/i386/kernel/acpi/Makefile @@ -1,5 +1,7 @@ obj-$(CONFIG_ACPI) += boot.o +ifneq ($(CONFIG_PCI),) obj-$(CONFIG_X86_IO_APIC) += earlyquirk.o +endif obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o ifneq ($(CONFIG_ACPI_PROCESSOR),) diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index ea17567dbe72..7a99b1369fa2 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -1437,9 +1437,11 @@ void __init setup_arch(char **cmdline_p) acpi_boot_table_init(); #endif +#ifdef CONFIG_PCI #ifdef CONFIG_X86_IO_APIC check_acpi_pci(); /* Checks more than just ACPI actually */ #endif +#endif #ifdef CONFIG_ACPI acpi_boot_init(); diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 1f4212605ef8..c2c68b902347 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -105,6 +105,7 @@ config X86_PC config X86_VSMP bool "Support for ScaleMP vSMP" + depends on PCI help Support for ScaleMP vSMP systems. Say 'Y' here if this kernel is supposed to run on these EM64T-based machines. Only choose this option @@ -291,7 +292,7 @@ config NUMA config K8_NUMA bool "Old style AMD Opteron NUMA detection" - depends on NUMA + depends on NUMA && PCI default y help Enable K8 NUMA node topology detection. You should say Y here if diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index 000e67e8f028..2466fbd035ee 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -8,7 +8,7 @@ obj-y := process.o signal.o entry.o traps.o irq.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \ x8664_ksyms.o i387.o syscall.o vsyscall.o \ setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \ - pci-dma.o pci-nommu.o alternative.o early-quirks.o + pci-dma.o pci-nommu.o alternative.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_X86_MCE) += mce.o @@ -39,6 +39,7 @@ obj-$(CONFIG_K8_NB) += k8.o obj-$(CONFIG_AUDIT) += audit.o obj-$(CONFIG_MODULES) += module.o +obj-$(CONFIG_PCI) += early-quirks.o obj-y += topology.o obj-y += intel_cacheinfo.o diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 26524ce3b239..3d8309b1236e 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -493,7 +493,9 @@ void __init setup_arch(char **cmdline_p) paging_init(); +#ifdef CONFIG_PCI early_quirks(); +#endif /* * set this early, so we dont allocate cpu0 -- cgit v1.2.1 From 8f60774a116ced9b73ae3913d511687889efe725 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:41 +0200 Subject: [PATCH] x86: Move direct PCI scanning functions out of line Saves about 200 bytes of code space. Signed-off-by: Andi Kleen --- arch/i386/pci/Makefile | 2 +- arch/i386/pci/early.c | 44 +++++++++++++++++++++++++++++++++++++++++ arch/x86_64/pci/Makefile | 3 ++- include/asm-x86_64/pci-direct.h | 42 ++++----------------------------------- 4 files changed, 51 insertions(+), 40 deletions(-) create mode 100644 arch/i386/pci/early.c diff --git a/arch/i386/pci/Makefile b/arch/i386/pci/Makefile index 62ad75c57e6a..1594d2f55c8f 100644 --- a/arch/i386/pci/Makefile +++ b/arch/i386/pci/Makefile @@ -11,4 +11,4 @@ pci-y += legacy.o irq.o pci-$(CONFIG_X86_VISWS) := visws.o fixup.o pci-$(CONFIG_X86_NUMAQ) := numa.o irq.o -obj-y += $(pci-y) common.o +obj-y += $(pci-y) common.o early.o diff --git a/arch/i386/pci/early.c b/arch/i386/pci/early.c new file mode 100644 index 000000000000..b1f7f40d809b --- /dev/null +++ b/arch/i386/pci/early.c @@ -0,0 +1,44 @@ +#include +#include +#include + +/* Direct PCI access. This is used for PCI accesses in early boot before + the PCI subsystem works. */ + +#define PDprintk(x...) + +u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset) +{ + u32 v; + outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); + v = inl(0xcfc); + if (v != 0xffffffff) + PDprintk("%x reading 4 from %x: %x\n", slot, offset, v); + return v; +} + +u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset) +{ + u8 v; + outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); + v = inb(0xcfc + (offset&3)); + PDprintk("%x reading 1 from %x: %x\n", slot, offset, v); + return v; +} + +u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset) +{ + u16 v; + outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); + v = inw(0xcfc + (offset&2)); + PDprintk("%x reading 2 from %x: %x\n", slot, offset, v); + return v; +} + +void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset, + u32 val) +{ + PDprintk("%x writing to %x: %x\n", slot, offset, val); + outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); + outl(val, 0xcfc); +} diff --git a/arch/x86_64/pci/Makefile b/arch/x86_64/pci/Makefile index a3f6ad570179..1eb18f421edf 100644 --- a/arch/x86_64/pci/Makefile +++ b/arch/x86_64/pci/Makefile @@ -9,7 +9,7 @@ obj-y := i386.o obj-$(CONFIG_PCI_DIRECT)+= direct.o obj-y += fixup.o init.o obj-$(CONFIG_ACPI) += acpi.o -obj-y += legacy.o irq.o common.o +obj-y += legacy.o irq.o common.o early.o # mmconfig has a 64bit special obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o @@ -23,3 +23,4 @@ common-y += ../../i386/pci/common.o fixup-y += ../../i386/pci/fixup.o i386-y += ../../i386/pci/i386.o init-y += ../../i386/pci/init.o +early-y += ../../i386/pci/early.o diff --git a/include/asm-x86_64/pci-direct.h b/include/asm-x86_64/pci-direct.h index 036b6ca5b53b..9d916cdaa18e 100644 --- a/include/asm-x86_64/pci-direct.h +++ b/include/asm-x86_64/pci-direct.h @@ -2,47 +2,13 @@ #define ASM_PCI_DIRECT_H 1 #include -#include /* Direct PCI access. This is used for PCI accesses in early boot before the PCI subsystem works. */ -#define PDprintk(x...) - -static inline u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset) -{ - u32 v; - outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); - v = inl(0xcfc); - if (v != 0xffffffff) - PDprintk("%x reading 4 from %x: %x\n", slot, offset, v); - return v; -} - -static inline u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset) -{ - u8 v; - outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); - v = inb(0xcfc + (offset&3)); - PDprintk("%x reading 1 from %x: %x\n", slot, offset, v); - return v; -} - -static inline u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset) -{ - u16 v; - outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); - v = inw(0xcfc + (offset&2)); - PDprintk("%x reading 2 from %x: %x\n", slot, offset, v); - return v; -} - -static inline void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset, - u32 val) -{ - PDprintk("%x writing to %x: %x\n", slot, offset, val); - outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); - outl(val, 0xcfc); -} +extern u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset); +extern u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset); +extern u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset); +extern void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset, u32 val); #endif -- cgit v1.2.1 From 0637a70a5db98182d9ad3d6ae1ee30acf20afde9 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:41 +0200 Subject: [PATCH] x86: Allow disabling early pci scans with pci=noearly or disallowing conf1 Some buggy systems can machine check when config space accesses happen for some non existent devices. i386/x86-64 do some early device scans that might trigger this. Allow pci=noearly to disable this. Also when type 1 is disabling also don't do any early accesses which are always type1. This moves the pci= configuration parsing to be a early parameter. I don't think this can break anything because it only changes a single global that is only used by PCI. Cc: gregkh@suse.de Cc: Trammell Hudson Signed-off-by: Andi Kleen --- Documentation/kernel-parameters.txt | 6 +++++- arch/i386/kernel/acpi/earlyquirk.c | 6 +++++- arch/i386/pci/common.c | 4 ++++ arch/i386/pci/early.c | 8 ++++++++ arch/i386/pci/pci.h | 1 + arch/x86_64/kernel/aperture.c | 2 +- arch/x86_64/kernel/early-quirks.c | 4 ++++ arch/x86_64/kernel/pci-calgary.c | 3 +++ arch/x86_64/kernel/vsmp.c | 3 +++ arch/x86_64/mm/k8topology.c | 3 +++ drivers/pci/pci.c | 5 ++--- include/asm-x86_64/pci-direct.h | 2 ++ 12 files changed, 41 insertions(+), 6 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 71d05f481727..4ae99f6e9938 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1240,7 +1240,11 @@ running once the system is up. bootloader. This is currently used on IXP2000 systems where the bus has to be configured a certain way for adjunct CPUs. - + noearly [X86] Don't do any early type 1 scanning. + This might help on some broken boards which + machine check when some devices' config space + is read. But various workarounds are disabled + and some IOMMU drivers will not work. pcmv= [HW,PCMCIA] BadgePAD 4 pd. [PARIDE] diff --git a/arch/i386/kernel/acpi/earlyquirk.c b/arch/i386/kernel/acpi/earlyquirk.c index 1649a175a206..fe799b11ac0a 100644 --- a/arch/i386/kernel/acpi/earlyquirk.c +++ b/arch/i386/kernel/acpi/earlyquirk.c @@ -48,7 +48,11 @@ void __init check_acpi_pci(void) int num, slot, func; /* Assume the machine supports type 1. If not it will - always read ffffffff and should not have any side effect. */ + always read ffffffff and should not have any side effect. + Actually a few buggy systems can machine check. Allow the user + to disable it by command line option at least -AK */ + if (!early_pci_allowed()) + return; /* Poor man's PCI discovery */ for (num = 0; num < 32; num++) { diff --git a/arch/i386/pci/common.c b/arch/i386/pci/common.c index 0a362e3aeac5..68bce194e688 100644 --- a/arch/i386/pci/common.c +++ b/arch/i386/pci/common.c @@ -242,6 +242,10 @@ char * __devinit pcibios_setup(char *str) acpi_noirq_set(); return NULL; } + else if (!strcmp(str, "noearly")) { + pci_probe |= PCI_PROBE_NOEARLY; + return NULL; + } #ifndef CONFIG_X86_VISWS else if (!strcmp(str, "usepirqmask")) { pci_probe |= PCI_USE_PIRQ_MASK; diff --git a/arch/i386/pci/early.c b/arch/i386/pci/early.c index b1f7f40d809b..713d6c866cae 100644 --- a/arch/i386/pci/early.c +++ b/arch/i386/pci/early.c @@ -1,6 +1,8 @@ #include +#include #include #include +#include "pci.h" /* Direct PCI access. This is used for PCI accesses in early boot before the PCI subsystem works. */ @@ -42,3 +44,9 @@ void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset, outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); outl(val, 0xcfc); } + +int early_pci_allowed(void) +{ + return (pci_probe & (PCI_PROBE_CONF1|PCI_PROBE_NOEARLY)) == + PCI_PROBE_CONF1; +} diff --git a/arch/i386/pci/pci.h b/arch/i386/pci/pci.h index 8a7cf1f23684..1814f74569c6 100644 --- a/arch/i386/pci/pci.h +++ b/arch/i386/pci/pci.h @@ -17,6 +17,7 @@ #define PCI_PROBE_CONF2 0x0004 #define PCI_PROBE_MMCONF 0x0008 #define PCI_PROBE_MASK 0x000f +#define PCI_PROBE_NOEARLY 0x0010 #define PCI_NO_SORT 0x0100 #define PCI_BIOS_SORT 0x0200 diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c index f851e1f9c82a..b487396c4c5b 100644 --- a/arch/x86_64/kernel/aperture.c +++ b/arch/x86_64/kernel/aperture.c @@ -212,7 +212,7 @@ void __init iommu_hole_init(void) u64 aper_base, last_aper_base = 0; int valid_agp = 0; - if (iommu_aperture_disabled || !fix_aperture) + if (iommu_aperture_disabled || !fix_aperture || !early_pci_allowed()) return; printk("Checking aperture...\n"); diff --git a/arch/x86_64/kernel/early-quirks.c b/arch/x86_64/kernel/early-quirks.c index d637cff1c4b1..208e38a372c1 100644 --- a/arch/x86_64/kernel/early-quirks.c +++ b/arch/x86_64/kernel/early-quirks.c @@ -82,6 +82,10 @@ static struct chipset early_qrk[] = { void __init early_quirks(void) { int num, slot, func; + + if (!early_pci_allowed()) + return; + /* Poor man's PCI discovery */ for (num = 0; num < 32; num++) { for (slot = 0; slot < 32; slot++) { diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index 466588f95601..cfb09b07ae99 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -924,6 +924,9 @@ void __init detect_calgary(void) if (swiotlb || no_iommu || iommu_detected) return; + if (!early_pci_allowed()) + return; + specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE); for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { diff --git a/arch/x86_64/kernel/vsmp.c b/arch/x86_64/kernel/vsmp.c index 92f70c74965f..044e852bd25e 100644 --- a/arch/x86_64/kernel/vsmp.c +++ b/arch/x86_64/kernel/vsmp.c @@ -20,6 +20,9 @@ static int __init vsmp_init(void) void *address; unsigned int cap, ctl; + if (!early_pci_allowed()) + return 0; + /* Check if we are running on a ScaleMP vSMP box */ if ((read_pci_config_16(0, 0x1f, 0, PCI_VENDOR_ID) != PCI_VENDOR_ID_SCALEMP) || (read_pci_config_16(0, 0x1f, 0, PCI_DEVICE_ID) != PCI_DEVICE_ID_SCALEMP_VSMP_CTL)) diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c index 7c45c2d2b8b2..5cf594f9230d 100644 --- a/arch/x86_64/mm/k8topology.c +++ b/arch/x86_64/mm/k8topology.c @@ -54,6 +54,9 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) nodes_clear(nodes_parsed); + if (!early_pci_allowed()) + return -1; + nb = find_northbridge(); if (nb < 0) return nb; diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 9f79dd6d51ab..684deb6b03aa 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -953,13 +953,12 @@ static int __devinit pci_setup(char *str) } str = k; } - return 1; + return 0; } +early_param("pci", pci_setup); device_initcall(pci_init); -__setup("pci=", pci_setup); - #if defined(CONFIG_ISA) || defined(CONFIG_EISA) /* FIXME: Some boxes have multiple ISA bridges! */ struct pci_dev *isa_bridge; diff --git a/include/asm-x86_64/pci-direct.h b/include/asm-x86_64/pci-direct.h index 9d916cdaa18e..eba9cb471df3 100644 --- a/include/asm-x86_64/pci-direct.h +++ b/include/asm-x86_64/pci-direct.h @@ -11,4 +11,6 @@ extern u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset); extern u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset); extern void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset, u32 val); +extern int early_pci_allowed(void); + #endif -- cgit v1.2.1 From b89ebd0b0a65d5371aa9ad98e873c4616056ca68 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:42 +0200 Subject: [PATCH] Fix unwinder warning in traps.c Fix linux/arch/x86_64/kernel/traps.c: In function 'dump_trace': linux/arch/x86_64/kernel/traps.c:275: warning: cast to pointer from integer of different size with allnoconfig Cc: jbeulich@novell.com Signed-off-by: Andi Kleen --- include/asm-x86_64/unwind.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/asm-x86_64/unwind.h b/include/asm-x86_64/unwind.h index b8fa5cb7ff88..2e7ff10fd775 100644 --- a/include/asm-x86_64/unwind.h +++ b/include/asm-x86_64/unwind.h @@ -99,8 +99,8 @@ static inline int arch_unw_user_mode(const struct unwind_frame_info *info) #else -#define UNW_PC(frame) ((void)(frame), 0) -#define UNW_SP(frame) ((void)(frame), 0) +#define UNW_PC(frame) ((void)(frame), 0UL) +#define UNW_SP(frame) ((void)(frame), 0UL) static inline int arch_unw_user_mode(const void *info) { -- cgit v1.2.1 From 3b171672831b9633c2ed8fa94805255cd4d5af19 Mon Sep 17 00:00:00 2001 From: Dmitriy Zavin Date: Tue, 26 Sep 2006 10:52:42 +0200 Subject: [PATCH] Add 64bit jiffies compares (for use with get_jiffies_64) The current time_before/time_after macros will fail typechecks when passed u64 values (as returned by get_jiffies_64()). On 64bit systems, this will just result in a warning about mismatching types without explicit casts, but since unsigned long and u64 (unsigned long long) are of same size, it will still work. On 32bit systems, a long is 32bits, so the value from get_jiffies_64() will be truncated by the cast and thus lose all the precision gained by 64bit jiffies. Signed-off-by: Dmitriy Zavin Signed-off-by: Andi Kleen --- include/linux/jiffies.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 329ebcffa106..c8d5f207c3d4 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -115,6 +115,21 @@ static inline u64 get_jiffies_64(void) ((long)(a) - (long)(b) >= 0)) #define time_before_eq(a,b) time_after_eq(b,a) +/* Same as above, but does so with platform independent 64bit types. + * These must be used when utilizing jiffies_64 (i.e. return value of + * get_jiffies_64() */ +#define time_after64(a,b) \ + (typecheck(__u64, a) && \ + typecheck(__u64, b) && \ + ((__s64)(b) - (__s64)(a) < 0)) +#define time_before64(a,b) time_after64(b,a) + +#define time_after_eq64(a,b) \ + (typecheck(__u64, a) && \ + typecheck(__u64, b) && \ + ((__s64)(a) - (__s64)(b) >= 0)) +#define time_before_eq64(a,b) time_after_eq64(b,a) + /* * Have the 32 bit jiffies value wrap 5 minutes after boot * so jiffies wrap bugs show up earlier. -- cgit v1.2.1 From 15d5f8398311f565682959daaca30e3ca7aea600 Mon Sep 17 00:00:00 2001 From: Dmitriy Zavin Date: Tue, 26 Sep 2006 10:52:42 +0200 Subject: [PATCH] x86: Refactor thermal throttle processing Refactor the event processing (syslog messaging and rate limiting) into separate file therm_throt.c. This allows consistent reporting of CPU thermal throttle events. After ACK'ing the interrupt, if the event is current, the user (p4.c/mce_intel.c) calls therm_throt_process to log (and rate limit) the event. If that function returns 1, the user has the option to log things further (such as to mce_log in x86_64). AK: minor cleanup Signed-off-by: Dmitriy Zavin Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/mcheck/Makefile | 2 +- arch/i386/kernel/cpu/mcheck/p4.c | 23 ++++-------- arch/i386/kernel/cpu/mcheck/therm_throt.c | 58 +++++++++++++++++++++++++++++++ arch/x86_64/kernel/Makefile | 4 +-- arch/x86_64/kernel/mce.c | 27 ++++++++++++++ arch/x86_64/kernel/mce_intel.c | 27 ++++---------- include/asm-i386/therm_throt.h | 6 ++++ include/asm-x86_64/mce.h | 2 ++ include/asm-x86_64/therm_throt.h | 1 + 9 files changed, 109 insertions(+), 41 deletions(-) create mode 100644 arch/i386/kernel/cpu/mcheck/therm_throt.c create mode 100644 include/asm-i386/therm_throt.h create mode 100644 include/asm-x86_64/therm_throt.h diff --git a/arch/i386/kernel/cpu/mcheck/Makefile b/arch/i386/kernel/cpu/mcheck/Makefile index 30808f3d6715..f1ebe1c1c17a 100644 --- a/arch/i386/kernel/cpu/mcheck/Makefile +++ b/arch/i386/kernel/cpu/mcheck/Makefile @@ -1,2 +1,2 @@ -obj-y = mce.o k7.o p4.o p5.o p6.o winchip.o +obj-y = mce.o k7.o p4.o p5.o p6.o winchip.o therm_throt.o obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o diff --git a/arch/i386/kernel/cpu/mcheck/p4.c b/arch/i386/kernel/cpu/mcheck/p4.c index b95f1b3d53aa..d83a669d376f 100644 --- a/arch/i386/kernel/cpu/mcheck/p4.c +++ b/arch/i386/kernel/cpu/mcheck/p4.c @@ -13,6 +13,8 @@ #include #include +#include + #include "mce.h" /* as supported by the P4/Xeon family */ @@ -44,25 +46,12 @@ static void unexpected_thermal_interrupt(struct pt_regs *regs) /* P4/Xeon Thermal transition interrupt handler */ static void intel_thermal_interrupt(struct pt_regs *regs) { - u32 l, h; - unsigned int cpu = smp_processor_id(); - static unsigned long next[NR_CPUS]; + __u64 msr_val; ack_APIC_irq(); - if (time_after(next[cpu], jiffies)) - return; - - next[cpu] = jiffies + HZ*5; - rdmsr(MSR_IA32_THERM_STATUS, l, h); - if (l & 0x1) { - printk(KERN_EMERG "CPU%d: Temperature above threshold\n", cpu); - printk(KERN_EMERG "CPU%d: Running in modulated clock mode\n", - cpu); - add_taint(TAINT_MACHINE_CHECK); - } else { - printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu); - } + rdmsrl(MSR_IA32_THERM_STATUS, msr_val); + therm_throt_process(msr_val & 0x1); } /* Thermal interrupt handler for this CPU setup */ @@ -122,7 +111,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) rdmsr (MSR_IA32_MISC_ENABLE, l, h); wrmsr (MSR_IA32_MISC_ENABLE, l | (1<<3), h); - + l = apic_read (APIC_LVTTHMR); apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED); printk (KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); diff --git a/arch/i386/kernel/cpu/mcheck/therm_throt.c b/arch/i386/kernel/cpu/mcheck/therm_throt.c new file mode 100644 index 000000000000..85eba00d680e --- /dev/null +++ b/arch/i386/kernel/cpu/mcheck/therm_throt.c @@ -0,0 +1,58 @@ +/* + * linux/arch/i386/kerne/cpu/mcheck/therm_throt.c + * + * Thermal throttle event support code. + * + * Author: Dmitriy Zavin (dmitriyz@google.com) + * + * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c. + * + */ + +#include +#include +#include +#include +#include + +/* How long to wait between reporting thermal events */ +#define CHECK_INTERVAL (300 * HZ) + +static DEFINE_PER_CPU(unsigned long, next_check); + +/*** + * therm_throt_process - Process thermal throttling event + * @curr: Whether the condition is current or not (boolean), since the + * thermal interrupt normally gets called both when the thermal + * event begins and once the event has ended. + * + * This function is normally called by the thermal interrupt after the + * IRQ has been acknowledged. + * + * It will take care of rate limiting and printing messages to the syslog. + * + * Returns: 0 : Event should NOT be further logged, i.e. still in + * "timeout" from previous log message. + * 1 : Event should be logged further, and a message has been + * printed to the syslog. + */ +int therm_throt_process(int curr) +{ + unsigned int cpu = smp_processor_id(); + + if (time_before(jiffies, __get_cpu_var(next_check))) + return 0; + + __get_cpu_var(next_check) = jiffies + CHECK_INTERVAL; + + /* if we just entered the thermal event */ + if (curr) { + printk(KERN_CRIT "CPU%d: Temperature above threshold, " + "cpu clock throttled\n", cpu); + add_taint(TAINT_MACHINE_CHECK); + } else { + printk(KERN_CRIT "CPU%d: Temperature/speed normal\n", cpu); + } + + return 1; +} diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index 2466fbd035ee..3c7cbff04d3d 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -11,7 +11,7 @@ obj-y := process.o signal.o entry.o traps.o irq.o \ pci-dma.o pci-nommu.o alternative.o obj-$(CONFIG_STACKTRACE) += stacktrace.o -obj-$(CONFIG_X86_MCE) += mce.o +obj-$(CONFIG_X86_MCE) += mce.o therm_throt.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o obj-$(CONFIG_MTRR) += ../../i386/kernel/cpu/mtrr/ @@ -46,6 +46,7 @@ obj-y += intel_cacheinfo.o CFLAGS_vsyscall.o := $(PROFILING) -g0 +therm_throt-y += ../../i386/kernel/cpu/mcheck/therm_throt.o bootflag-y += ../../i386/kernel/bootflag.o cpuid-$(subst m,y,$(CONFIG_X86_CPUID)) += ../../i386/kernel/cpuid.o topology-y += ../../i386/kernel/topology.o @@ -55,4 +56,3 @@ quirks-y += ../../i386/kernel/quirks.o i8237-y += ../../i386/kernel/i8237.o msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o alternative-y += ../../i386/kernel/alternative.o - diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index 1a93c3738404..bbea88801d88 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -274,6 +274,33 @@ void do_machine_check(struct pt_regs * regs, long error_code) atomic_dec(&mce_entry); } +#ifdef CONFIG_X86_MCE_INTEL +/*** + * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog + * @cpu: The CPU on which the event occured. + * @status: Event status information + * + * This function should be called by the thermal interrupt after the + * event has been processed and the decision was made to log the event + * further. + * + * The status parameter will be saved to the 'status' field of 'struct mce' + * and historically has been the register value of the + * MSR_IA32_THERMAL_STATUS (Intel) msr. + */ +void mce_log_therm_throt_event(unsigned int cpu, __u64 status) +{ + struct mce m; + + memset(&m, 0, sizeof(m)); + m.cpu = cpu; + m.bank = MCE_THERMAL_BANK; + m.status = status; + rdtscll(m.tsc); + mce_log(&m); +} +#endif /* CONFIG_X86_MCE_INTEL */ + /* * Periodic polling timer for "silent" machine check errors. */ diff --git a/arch/x86_64/kernel/mce_intel.c b/arch/x86_64/kernel/mce_intel.c index 8f533d2c40cb..dec11219e278 100644 --- a/arch/x86_64/kernel/mce_intel.c +++ b/arch/x86_64/kernel/mce_intel.c @@ -11,36 +11,21 @@ #include #include #include - -static DEFINE_PER_CPU(unsigned long, next_check); +#include asmlinkage void smp_thermal_interrupt(void) { - struct mce m; + __u64 msr_val; ack_APIC_irq(); exit_idle(); irq_enter(); - if (time_before(jiffies, __get_cpu_var(next_check))) - goto done; - - __get_cpu_var(next_check) = jiffies + HZ*300; - memset(&m, 0, sizeof(m)); - m.cpu = smp_processor_id(); - m.bank = MCE_THERMAL_BANK; - rdtscll(m.tsc); - rdmsrl(MSR_IA32_THERM_STATUS, m.status); - if (m.status & 0x1) { - printk(KERN_EMERG - "CPU%d: Temperature above threshold, cpu clock throttled\n", m.cpu); - add_taint(TAINT_MACHINE_CHECK); - } else { - printk(KERN_EMERG "CPU%d: Temperature/speed normal\n", m.cpu); - } - mce_log(&m); -done: + rdmsrl(MSR_IA32_THERM_STATUS, msr_val); + if (therm_throt_process(msr_val & 1)) + mce_log_therm_throt_event(smp_processor_id(), msr_val); + irq_exit(); } diff --git a/include/asm-i386/therm_throt.h b/include/asm-i386/therm_throt.h new file mode 100644 index 000000000000..3c9c22cc10c9 --- /dev/null +++ b/include/asm-i386/therm_throt.h @@ -0,0 +1,6 @@ +#ifndef __ASM_I386_THERM_THROT_H__ +#define __ASM_I386_THERM_THROT_H__ 1 + +int therm_throt_process(int curr); + +#endif /* __ASM_I386_THERM_THROT_H__ */ diff --git a/include/asm-x86_64/mce.h b/include/asm-x86_64/mce.h index d13687dfd691..5a11146d6d9c 100644 --- a/include/asm-x86_64/mce.h +++ b/include/asm-x86_64/mce.h @@ -99,6 +99,8 @@ static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) } #endif +void mce_log_therm_throt_event(unsigned int cpu, __u64 status); + extern atomic_t mce_entry; #endif diff --git a/include/asm-x86_64/therm_throt.h b/include/asm-x86_64/therm_throt.h new file mode 100644 index 000000000000..5aac059007ba --- /dev/null +++ b/include/asm-x86_64/therm_throt.h @@ -0,0 +1 @@ +#include -- cgit v1.2.1 From 66aea9913cf435fe92ebb7bf869b4f15901ab993 Mon Sep 17 00:00:00 2001 From: Dmitriy Zavin Date: Tue, 26 Sep 2006 10:52:42 +0200 Subject: [PATCH] i386: Make the jiffies compares use the 64bit safe macros. Signed-off-by: Dmitriy Zavin Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/mcheck/therm_throt.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/i386/kernel/cpu/mcheck/therm_throt.c b/arch/i386/kernel/cpu/mcheck/therm_throt.c index 85eba00d680e..101f7ace00ce 100644 --- a/arch/i386/kernel/cpu/mcheck/therm_throt.c +++ b/arch/i386/kernel/cpu/mcheck/therm_throt.c @@ -18,7 +18,7 @@ /* How long to wait between reporting thermal events */ #define CHECK_INTERVAL (300 * HZ) -static DEFINE_PER_CPU(unsigned long, next_check); +static DEFINE_PER_CPU(__u64, next_check); /*** * therm_throt_process - Process thermal throttling event @@ -39,11 +39,12 @@ static DEFINE_PER_CPU(unsigned long, next_check); int therm_throt_process(int curr) { unsigned int cpu = smp_processor_id(); + __u64 tmp_jiffs = get_jiffies_64(); - if (time_before(jiffies, __get_cpu_var(next_check))) + if (time_before64(tmp_jiffs, __get_cpu_var(next_check))) return 0; - __get_cpu_var(next_check) = jiffies + CHECK_INTERVAL; + __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL; /* if we just entered the thermal event */ if (curr) { -- cgit v1.2.1 From 3222b36f46c22f46697a0a53fa8804153a32669f Mon Sep 17 00:00:00 2001 From: Dmitriy Zavin Date: Tue, 26 Sep 2006 10:52:42 +0200 Subject: [PATCH] x86: Add a cumulative thermal throttle event counter. The counter is exported to /sys that keeps track of the number of thermal events, such that the user knows how bad the thermal problem might be (since the logging to syslog and mcelog is rate limited). AK: Fixed cpu hotplug locking Signed-off-by: Dmitriy Zavin Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/mcheck/p4.c | 3 + arch/i386/kernel/cpu/mcheck/therm_throt.c | 133 ++++++++++++++++++++++++++++-- arch/x86_64/kernel/mce_intel.c | 3 + include/asm-i386/therm_throt.h | 3 + 4 files changed, 136 insertions(+), 6 deletions(-) diff --git a/arch/i386/kernel/cpu/mcheck/p4.c b/arch/i386/kernel/cpu/mcheck/p4.c index d83a669d376f..504434a46011 100644 --- a/arch/i386/kernel/cpu/mcheck/p4.c +++ b/arch/i386/kernel/cpu/mcheck/p4.c @@ -115,6 +115,9 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) l = apic_read (APIC_LVTTHMR); apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED); printk (KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); + + /* enable thermal throttle processing */ + atomic_set(&therm_throt_en, 1); return; } #endif /* CONFIG_X86_MCE_P4THERMAL */ diff --git a/arch/i386/kernel/cpu/mcheck/therm_throt.c b/arch/i386/kernel/cpu/mcheck/therm_throt.c index 101f7ace00ce..4f43047de406 100644 --- a/arch/i386/kernel/cpu/mcheck/therm_throt.c +++ b/arch/i386/kernel/cpu/mcheck/therm_throt.c @@ -1,15 +1,22 @@ /* * linux/arch/i386/kerne/cpu/mcheck/therm_throt.c * - * Thermal throttle event support code. + * Thermal throttle event support code (such as syslog messaging and rate + * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c). + * This allows consistent reporting of CPU thermal throttle events. + * + * Maintains a counter in /sys that keeps track of the number of thermal + * events, such that the user knows how bad the thermal problem might be + * (since the logging to syslog and mcelog is rate limited). * * Author: Dmitriy Zavin (dmitriyz@google.com) * * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c. - * + * Inspired by Ross Biro's and Al Borchers' counter code. */ #include +#include #include #include #include @@ -18,15 +25,53 @@ /* How long to wait between reporting thermal events */ #define CHECK_INTERVAL (300 * HZ) -static DEFINE_PER_CPU(__u64, next_check); +static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; +static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); +atomic_t therm_throt_en = ATOMIC_INIT(0); + +#ifdef CONFIG_SYSFS +#define define_therm_throt_sysdev_one_ro(_name) \ + static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) + +#define define_therm_throt_sysdev_show_func(name) \ +static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ + char *buf) \ +{ \ + unsigned int cpu = dev->id; \ + ssize_t ret; \ + \ + preempt_disable(); /* CPU hotplug */ \ + if (cpu_online(cpu)) \ + ret = sprintf(buf, "%lu\n", \ + per_cpu(thermal_throttle_##name, cpu)); \ + else \ + ret = 0; \ + preempt_enable(); \ + \ + return ret; \ +} + +define_therm_throt_sysdev_show_func(count); +define_therm_throt_sysdev_one_ro(count); + +static struct attribute *thermal_throttle_attrs[] = { + &attr_count.attr, + NULL +}; + +static struct attribute_group thermal_throttle_attr_group = { + .attrs = thermal_throttle_attrs, + .name = "thermal_throttle" +}; +#endif /* CONFIG_SYSFS */ /*** - * therm_throt_process - Process thermal throttling event + * therm_throt_process - Process thermal throttling event from interrupt * @curr: Whether the condition is current or not (boolean), since the * thermal interrupt normally gets called both when the thermal * event begins and once the event has ended. * - * This function is normally called by the thermal interrupt after the + * This function is called by the thermal interrupt after the * IRQ has been acknowledged. * * It will take care of rate limiting and printing messages to the syslog. @@ -41,6 +86,9 @@ int therm_throt_process(int curr) unsigned int cpu = smp_processor_id(); __u64 tmp_jiffs = get_jiffies_64(); + if (curr) + __get_cpu_var(thermal_throttle_count)++; + if (time_before64(tmp_jiffs, __get_cpu_var(next_check))) return 0; @@ -49,7 +97,9 @@ int therm_throt_process(int curr) /* if we just entered the thermal event */ if (curr) { printk(KERN_CRIT "CPU%d: Temperature above threshold, " - "cpu clock throttled\n", cpu); + "cpu clock throttled (total events = %lu)\n", cpu, + __get_cpu_var(thermal_throttle_count)); + add_taint(TAINT_MACHINE_CHECK); } else { printk(KERN_CRIT "CPU%d: Temperature/speed normal\n", cpu); @@ -57,3 +107,74 @@ int therm_throt_process(int curr) return 1; } + +#ifdef CONFIG_SYSFS +/* Add/Remove thermal_throttle interface for CPU device */ +static __cpuinit int thermal_throttle_add_dev(struct sys_device * sys_dev) +{ + sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group); + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU +static __cpuinit int thermal_throttle_remove_dev(struct sys_device * sys_dev) +{ + sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); + return 0; +} + +/* Mutex protecting device creation against CPU hotplug */ +static DEFINE_MUTEX(therm_cpu_lock); + +/* Get notified when a cpu comes on/off. Be hotplug friendly. */ +static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct sys_device *sys_dev; + + sys_dev = get_cpu_sysdev(cpu); + mutex_lock(&therm_cpu_lock); + switch (action) { + case CPU_ONLINE: + thermal_throttle_add_dev(sys_dev); + break; + case CPU_DEAD: + thermal_throttle_remove_dev(sys_dev); + break; + } + mutex_unlock(&therm_cpu_lock); + return NOTIFY_OK; +} + +static struct notifier_block thermal_throttle_cpu_notifier = +{ + .notifier_call = thermal_throttle_cpu_callback, +}; +#endif /* CONFIG_HOTPLUG_CPU */ + +static __init int thermal_throttle_init_device(void) +{ + unsigned int cpu = 0; + + if (!atomic_read(&therm_throt_en)) + return 0; + + register_hotcpu_notifier(&thermal_throttle_cpu_notifier); + +#ifdef CONFIG_HOTPLUG_CPU + mutex_lock(&therm_cpu_lock); +#endif + /* connect live CPUs to sysfs */ + for_each_online_cpu(cpu) + thermal_throttle_add_dev(get_cpu_sysdev(cpu)); +#ifdef CONFIG_HOTPLUG_CPU + mutex_unlock(&therm_cpu_lock); +#endif + + return 0; +} + +device_initcall(thermal_throttle_init_device); +#endif /* CONFIG_SYSFS */ diff --git a/arch/x86_64/kernel/mce_intel.c b/arch/x86_64/kernel/mce_intel.c index dec11219e278..6551505d8a2c 100644 --- a/arch/x86_64/kernel/mce_intel.c +++ b/arch/x86_64/kernel/mce_intel.c @@ -77,6 +77,9 @@ static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c) apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", cpu, tm2 ? "TM2" : "TM1"); + + /* enable thermal throttle processing */ + atomic_set(&therm_throt_en, 1); return; } diff --git a/include/asm-i386/therm_throt.h b/include/asm-i386/therm_throt.h index 3c9c22cc10c9..399bf6026b16 100644 --- a/include/asm-i386/therm_throt.h +++ b/include/asm-i386/therm_throt.h @@ -1,6 +1,9 @@ #ifndef __ASM_I386_THERM_THROT_H__ #define __ASM_I386_THERM_THROT_H__ 1 +#include + +extern atomic_t therm_throt_en; int therm_throt_process(int curr); #endif /* __ASM_I386_THERM_THROT_H__ */ -- cgit v1.2.1 From dcf10307c3fff2bca4b104ad8fe4e3aa8dcb2ad1 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Tue, 26 Sep 2006 10:52:42 +0200 Subject: [PATCH] i386/x86-64: New Intel feature flags Add supplemental SSE3 instructions flag, and Direct Cache Access flag. As described in "Intel Processor idenfication and the CPUID instruction AP485 Sept 2006" AK: also added for x86-64 Signed-off-by: Dave Jones Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/proc.c | 4 ++-- arch/x86_64/kernel/setup.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c index f54a15268ed7..76aac088a323 100644 --- a/arch/i386/kernel/cpu/proc.c +++ b/arch/i386/kernel/cpu/proc.c @@ -46,8 +46,8 @@ static int show_cpuinfo(struct seq_file *m, void *v) /* Intel-defined (#2) */ "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", - "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, + NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* VIA/Cyrix/Centaur-defined */ diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 3d8309b1236e..3c7ad267d8a7 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -1071,8 +1071,8 @@ static int show_cpuinfo(struct seq_file *m, void *v) /* Intel-defined (#2) */ "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", - "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, + NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* VIA/Cyrix/Centaur-defined */ -- cgit v1.2.1 From 3f75f42d7733e73aca5c78326489efd4189e0111 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 26 Sep 2006 10:52:42 +0200 Subject: [PATCH] Don't set calgary iommu as default y Most systems don't need it. Signed-off-by: Andi Kleen --- arch/x86_64/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index c2c68b902347..294bf77f7222 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -422,7 +422,6 @@ config IOMMU config CALGARY_IOMMU bool "IBM Calgary IOMMU support" - default y select SWIOTLB depends on PCI && EXPERIMENTAL help -- cgit v1.2.1