Merge git://git.kernel.org/pub/scm/linux/kernel/git/cmetcalf/linux-tile

Pull Tile arch updates from Chris Metcalf: "These changes bring in a bunch of new functionality that has been maintained internally at Tilera over the last year, plus other stray bits of work that I've taken into the tile tree from other folks. The changes include some PCI root complex work, interrupt-driven console support, support for performing fast-path unaligned data fixups by kernel-based JIT code generation, CONFIG_PREEMPT support, vDSO support for gettimeofday(), a serial driver for the tilegx on-chip UART, KGDB support, more optimized string routines, support for ftrace and kprobes, improved ASLR, and many bug fixes. We also remove support for the old TILE64 chip, which is no longer buildable" * git://git.kernel.org/pub/scm/linux/kernel/git/cmetcalf/linux-tile: (85 commits) tile: refresh tile defconfig files tile: rework <asm/cmpxchg.h> tile PCI RC: make default consistent DMA mask 32-bit tile: add null check for kzalloc in tile/kernel/setup.c tile: make __write_once a synonym for __read_mostly tile: remove support for TILE64 tile: use asm-generic/bitops/builtin-*.h tile: eliminate no-op "noatomichash" boot argument tile: use standard tile_bundle_bits type in traps.c tile: simplify code referencing hypervisor API addresses tile: change <asm/system.h> to <asm/switch_to.h> in comments tile: mark pcibios_init() as __init tile: check for correct compiler earlier in asm-offsets.c tile: use standard 'generic-y' model for <asm/hw_irq.h> tile: use asm-generic version of <asm/local64.h> tile PCI RC: add comment about "PCI hole" problem tile: remove DEBUG_EXTRA_FLAGS kernel config option tile: add virt_to_kpte() API and clean up and document behavior tile: support FRAME_POINTER tile: support reporting Tilera hypervisor statistics ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2013-09-06 11:14:33 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2013-09-06 11:14:33 -0700
commit: 4de9ad9bc08b4953fc03336ad38908496e2f8826 (patch)
tree: bd44add223061a58317034a0d6c9686d95d12fba /arch/tile
parent: 576c25eb5954035b64112188d9a2683144600f3d (diff)
parent: 06da6629e68ddc8ffe2933d33b3681f09104b3f1 (diff)
download: blackbird-op-linux-4de9ad9bc08b4953fc03336ad38908496e2f8826.tar.gz
blackbird-op-linux-4de9ad9bc08b4953fc03336ad38908496e2f8826.zip
148 files changed, 7824 insertions, 2884 deletions
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index 24565a7ffe6d..6e1ed55f6cfc 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -26,6 +26,7 @@ config TILE
 	select HAVE_SYSCALL_TRACEPOINTS
 	select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
 	select HAVE_DEBUG_STACKOVERFLOW
+	select ARCH_WANT_FRAME_POINTERS
 
 # FIXME: investigate whether we need/want these options.
 #	select HAVE_IOREMAP_PROT
@@ -64,6 +65,9 @@ config HUGETLB_SUPER_PAGES
 	depends on HUGETLB_PAGE && TILEGX
 	def_bool y
 
+config GENERIC_TIME_VSYSCALL
+	def_bool y
+
 # FIXME: tilegx can implement a more efficient rwsem.
 config RWSEM_GENERIC_SPINLOCK
 	def_bool y
@@ -112,10 +116,19 @@ config SMP
 config HVC_TILE
 	depends on TTY
 	select HVC_DRIVER
+	select HVC_IRQ if TILEGX
 	def_bool y
 
 config TILEGX
-	bool "Building with TILE-Gx (64-bit) compiler and toolchain"
+	bool "Building for TILE-Gx (64-bit) processor"
+	select HAVE_FUNCTION_TRACER
+	select HAVE_FUNCTION_TRACE_MCOUNT_TEST
+	select HAVE_FUNCTION_GRAPH_TRACER
+	select HAVE_DYNAMIC_FTRACE
+	select HAVE_FTRACE_MCOUNT_RECORD
+	select HAVE_KPROBES
+	select HAVE_KRETPROBES
+	select HAVE_ARCH_KGDB
 
 config TILEPRO
 	def_bool !TILEGX
@@ -194,7 +207,7 @@ config SYSVIPC_COMPAT
 	def_bool y
 	depends on COMPAT && SYSVIPC
 
-# We do not currently support disabling HIGHMEM on tile64 and tilepro.
+# We do not currently support disabling HIGHMEM on tilepro.
 config HIGHMEM
 	bool # "Support for more than 512 MB of RAM"
 	default !TILEGX
@@ -300,6 +313,8 @@ config PAGE_OFFSET
 
 source "mm/Kconfig"
 
+source "kernel/Kconfig.preempt"
+
 config CMDLINE_BOOL
 	bool "Built-in kernel command line"
 	default n
@@ -396,8 +411,20 @@ config NO_IOMEM
 config NO_IOPORT
 	def_bool !PCI
 
+config TILE_PCI_IO
+	bool "PCI I/O space support"
+	default n
+	depends on PCI
+	depends on TILEGX
+	---help---
+	  Enable PCI I/O space support on TILEGx. Since the PCI I/O space
+	  is used by few modern PCIe endpoint devices, its support is disabled
+	  by default to save the TRIO PIO Region resource for other purposes.
+
 source "drivers/pci/Kconfig"
 
+source "drivers/pci/pcie/Kconfig"
+
 config TILE_USB
 	tristate "Tilera USB host adapter support"
 	default y
diff --git a/arch/tile/Kconfig.debug b/arch/tile/Kconfig.debug
index 9165ea979e85..19734d3ab1e8 100644
--- a/arch/tile/Kconfig.debug
+++ b/arch/tile/Kconfig.debug
@@ -14,14 +14,12 @@ config EARLY_PRINTK
 	  with klogd/syslogd. You should normally N here,
 	  unless you want to debug such a crash.
 
-config DEBUG_EXTRA_FLAGS
-	string "Additional compiler arguments when building with '-g'"
-	depends on DEBUG_INFO
-	default ""
+config TILE_HVGLUE_TRACE
+	bool "Provide wrapper functions for hypervisor ABI calls"
+	default n
 	help
-	  Debug info can be large, and flags like
-	  `-femit-struct-debug-baseonly' can reduce the kernel file
-	  size and build time noticeably.  Such flags are often
-	  helpful if the main use of debug info is line number info.
+	  Provide wrapper functions for the hypervisor ABI calls
+	  defined in arch/tile/kernel/hvglue.S.  This allows tracing
+	  mechanisms, etc., to have visibility into those calls.
 
 endmenu
diff --git a/arch/tile/Makefile b/arch/tile/Makefile
index 3d15364c6071..4dc380a519d4 100644
--- a/arch/tile/Makefile
+++ b/arch/tile/Makefile
@@ -30,10 +30,6 @@ endif
 # In kernel modules, this causes load failures due to unsupported relocations.
 KBUILD_CFLAGS   += -fno-asynchronous-unwind-tables
 
-ifneq ($(CONFIG_DEBUG_EXTRA_FLAGS),"")
-KBUILD_CFLAGS   += $(CONFIG_DEBUG_EXTRA_FLAGS)
-endif
-
 LIBGCC_PATH     := \
   $(shell $(CC) $(KBUILD_CFLAGS) $(KCFLAGS) -print-libgcc-file-name)
 
diff --git a/arch/tile/configs/tilegx_defconfig b/arch/tile/configs/tilegx_defconfig
index 47684815e5c8..730e40d9cf62 100644
--- a/arch/tile/configs/tilegx_defconfig
+++ b/arch/tile/configs/tilegx_defconfig
@@ -1,16 +1,15 @@
 CONFIG_TILEGX=y
-CONFIG_EXPERIMENTAL=y
-# CONFIG_LOCALVERSION_AUTO is not set
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_FHANDLE=y
+CONFIG_AUDIT=y
+CONFIG_NO_HZ=y
 CONFIG_BSD_PROCESS_ACCT=y
 CONFIG_BSD_PROCESS_ACCT_V3=y
-CONFIG_FHANDLE=y
 CONFIG_TASKSTATS=y
 CONFIG_TASK_DELAY_ACCT=y
 CONFIG_TASK_XACCT=y
 CONFIG_TASK_IO_ACCOUNTING=y
-CONFIG_AUDIT=y
 CONFIG_LOG_BUF_SHIFT=19
 CONFIG_CGROUPS=y
 CONFIG_CGROUP_DEBUG=y
@@ -18,18 +17,18 @@ CONFIG_CGROUP_DEVICE=y
 CONFIG_CPUSETS=y
 CONFIG_CGROUP_CPUACCT=y
 CONFIG_RESOURCE_COUNTERS=y
-CONFIG_CGROUP_MEMCG=y
-CONFIG_CGROUP_MEMCG_SWAP=y
 CONFIG_CGROUP_SCHED=y
 CONFIG_RT_GROUP_SCHED=y
 CONFIG_BLK_CGROUP=y
 CONFIG_NAMESPACES=y
 CONFIG_RELAY=y
 CONFIG_BLK_DEV_INITRD=y
+CONFIG_RD_XZ=y
 CONFIG_SYSCTL_SYSCALL=y
 CONFIG_EMBEDDED=y
 # CONFIG_COMPAT_BRK is not set
 CONFIG_PROFILING=y
+CONFIG_KPROBES=y
 CONFIG_MODULES=y
 CONFIG_MODULE_FORCE_LOAD=y
 CONFIG_MODULE_UNLOAD=y
@@ -45,12 +44,12 @@ CONFIG_UNIXWARE_DISKLABEL=y
 CONFIG_SGI_PARTITION=y
 CONFIG_SUN_PARTITION=y
 CONFIG_KARMA_PARTITION=y
-CONFIG_EFI_PARTITION=y
 CONFIG_CFQ_GROUP_IOSCHED=y
 CONFIG_NR_CPUS=100
-CONFIG_NO_HZ=y
-CONFIG_HIGH_RES_TIMERS=y
 CONFIG_HZ_100=y
+# CONFIG_COMPACTION is not set
+CONFIG_PREEMPT_VOLUNTARY=y
+CONFIG_TILE_PCI_IO=y
 CONFIG_PCI_DEBUG=y
 # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
 CONFIG_BINFMT_MISC=y
@@ -108,150 +107,9 @@ CONFIG_IPV6_MULTIPLE_TABLES=y
 CONFIG_IPV6_MROUTE=y
 CONFIG_IPV6_PIMSM_V2=y
 CONFIG_NETLABEL=y
-CONFIG_NETFILTER=y
-CONFIG_NF_CONNTRACK=m
-CONFIG_NF_CONNTRACK_SECMARK=y
-CONFIG_NF_CONNTRACK_ZONES=y
-CONFIG_NF_CONNTRACK_EVENTS=y
-CONFIG_NF_CT_PROTO_DCCP=m
-CONFIG_NF_CT_PROTO_UDPLITE=m
-CONFIG_NF_CONNTRACK_AMANDA=m
-CONFIG_NF_CONNTRACK_FTP=m
-CONFIG_NF_CONNTRACK_H323=m
-CONFIG_NF_CONNTRACK_IRC=m
-CONFIG_NF_CONNTRACK_NETBIOS_NS=m
-CONFIG_NF_CONNTRACK_PPTP=m
-CONFIG_NF_CONNTRACK_SANE=m
-CONFIG_NF_CONNTRACK_SIP=m
-CONFIG_NF_CONNTRACK_TFTP=m
-CONFIG_NETFILTER_TPROXY=m
-CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
-CONFIG_NETFILTER_XT_TARGET_CONNMARK=m
-CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m
-CONFIG_NETFILTER_XT_TARGET_CT=m
-CONFIG_NETFILTER_XT_TARGET_DSCP=m
-CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m
-CONFIG_NETFILTER_XT_TARGET_MARK=m
-CONFIG_NETFILTER_XT_TARGET_NFLOG=m
-CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
-CONFIG_NETFILTER_XT_TARGET_TEE=m
-CONFIG_NETFILTER_XT_TARGET_TPROXY=m
-CONFIG_NETFILTER_XT_TARGET_TRACE=m
-CONFIG_NETFILTER_XT_TARGET_SECMARK=m
-CONFIG_NETFILTER_XT_TARGET_TCPMSS=m
-CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m
-CONFIG_NETFILTER_XT_MATCH_CLUSTER=m
-CONFIG_NETFILTER_XT_MATCH_COMMENT=m
-CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m
-CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
-CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
-CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
-CONFIG_NETFILTER_XT_MATCH_DSCP=m
-CONFIG_NETFILTER_XT_MATCH_ESP=m
-CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m
-CONFIG_NETFILTER_XT_MATCH_HELPER=m
-CONFIG_NETFILTER_XT_MATCH_IPRANGE=m
-CONFIG_NETFILTER_XT_MATCH_IPVS=m
-CONFIG_NETFILTER_XT_MATCH_LENGTH=m
-CONFIG_NETFILTER_XT_MATCH_LIMIT=m
-CONFIG_NETFILTER_XT_MATCH_MAC=m
-CONFIG_NETFILTER_XT_MATCH_MARK=m
-CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m
-CONFIG_NETFILTER_XT_MATCH_OSF=m
-CONFIG_NETFILTER_XT_MATCH_OWNER=m
-CONFIG_NETFILTER_XT_MATCH_POLICY=m
-CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m
-CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m
-CONFIG_NETFILTER_XT_MATCH_QUOTA=m
-CONFIG_NETFILTER_XT_MATCH_RATEEST=m
-CONFIG_NETFILTER_XT_MATCH_REALM=m
-CONFIG_NETFILTER_XT_MATCH_RECENT=m
-CONFIG_NETFILTER_XT_MATCH_SOCKET=m
-CONFIG_NETFILTER_XT_MATCH_STATE=m
-CONFIG_NETFILTER_XT_MATCH_STATISTIC=m
-CONFIG_NETFILTER_XT_MATCH_STRING=m
-CONFIG_NETFILTER_XT_MATCH_TCPMSS=m
-CONFIG_NETFILTER_XT_MATCH_TIME=m
-CONFIG_NETFILTER_XT_MATCH_U32=m
-CONFIG_IP_VS=m
-CONFIG_IP_VS_IPV6=y
-CONFIG_IP_VS_PROTO_TCP=y
-CONFIG_IP_VS_PROTO_UDP=y
-CONFIG_IP_VS_PROTO_ESP=y
-CONFIG_IP_VS_PROTO_AH=y
-CONFIG_IP_VS_PROTO_SCTP=y
-CONFIG_IP_VS_RR=m
-CONFIG_IP_VS_WRR=m
-CONFIG_IP_VS_LC=m
-CONFIG_IP_VS_WLC=m
-CONFIG_IP_VS_LBLC=m
-CONFIG_IP_VS_LBLCR=m
-CONFIG_IP_VS_SED=m
-CONFIG_IP_VS_NQ=m
-CONFIG_NF_CONNTRACK_IPV4=m
-# CONFIG_NF_CONNTRACK_PROC_COMPAT is not set
-CONFIG_IP_NF_QUEUE=m
-CONFIG_IP_NF_IPTABLES=y
-CONFIG_IP_NF_MATCH_AH=m
-CONFIG_IP_NF_MATCH_ECN=m
-CONFIG_IP_NF_MATCH_TTL=m
-CONFIG_IP_NF_FILTER=y
-CONFIG_IP_NF_TARGET_REJECT=y
-CONFIG_IP_NF_TARGET_LOG=m
-CONFIG_IP_NF_TARGET_ULOG=m
-CONFIG_IP_NF_MANGLE=m
-CONFIG_IP_NF_TARGET_ECN=m
-CONFIG_IP_NF_TARGET_TTL=m
-CONFIG_IP_NF_RAW=m
-CONFIG_IP_NF_SECURITY=m
-CONFIG_IP_NF_ARPTABLES=m
-CONFIG_IP_NF_ARPFILTER=m
-CONFIG_IP_NF_ARP_MANGLE=m
-CONFIG_NF_CONNTRACK_IPV6=m
-CONFIG_IP6_NF_QUEUE=m
-CONFIG_IP6_NF_IPTABLES=m
-CONFIG_IP6_NF_MATCH_AH=m
-CONFIG_IP6_NF_MATCH_EUI64=m
-CONFIG_IP6_NF_MATCH_FRAG=m
-CONFIG_IP6_NF_MATCH_OPTS=m
-CONFIG_IP6_NF_MATCH_HL=m
-CONFIG_IP6_NF_MATCH_IPV6HEADER=m
-CONFIG_IP6_NF_MATCH_MH=m
-CONFIG_IP6_NF_MATCH_RT=m
-CONFIG_IP6_NF_TARGET_HL=m
-CONFIG_IP6_NF_TARGET_LOG=m
-CONFIG_IP6_NF_FILTER=m
-CONFIG_IP6_NF_TARGET_REJECT=m
-CONFIG_IP6_NF_MANGLE=m
-CONFIG_IP6_NF_RAW=m
-CONFIG_IP6_NF_SECURITY=m
-CONFIG_BRIDGE_NF_EBTABLES=m
-CONFIG_BRIDGE_EBT_BROUTE=m
-CONFIG_BRIDGE_EBT_T_FILTER=m
-CONFIG_BRIDGE_EBT_T_NAT=m
-CONFIG_BRIDGE_EBT_802_3=m
-CONFIG_BRIDGE_EBT_AMONG=m
-CONFIG_BRIDGE_EBT_ARP=m
-CONFIG_BRIDGE_EBT_IP=m
-CONFIG_BRIDGE_EBT_IP6=m
-CONFIG_BRIDGE_EBT_LIMIT=m
-CONFIG_BRIDGE_EBT_MARK=m
-CONFIG_BRIDGE_EBT_PKTTYPE=m
-CONFIG_BRIDGE_EBT_STP=m
-CONFIG_BRIDGE_EBT_VLAN=m
-CONFIG_BRIDGE_EBT_ARPREPLY=m
-CONFIG_BRIDGE_EBT_DNAT=m
-CONFIG_BRIDGE_EBT_MARK_T=m
-CONFIG_BRIDGE_EBT_REDIRECT=m
-CONFIG_BRIDGE_EBT_SNAT=m
-CONFIG_BRIDGE_EBT_LOG=m
-CONFIG_BRIDGE_EBT_ULOG=m
-CONFIG_BRIDGE_EBT_NFLOG=m
 CONFIG_RDS=m
 CONFIG_RDS_TCP=m
 CONFIG_BRIDGE=m
-CONFIG_NET_DSA=y
 CONFIG_VLAN_8021Q=m
 CONFIG_VLAN_8021Q_GVRP=y
 CONFIG_PHONET=m
@@ -292,13 +150,13 @@ CONFIG_NET_ACT_POLICE=m
 CONFIG_NET_ACT_GACT=m
 CONFIG_GACT_PROB=y
 CONFIG_NET_ACT_MIRRED=m
-CONFIG_NET_ACT_IPT=m
 CONFIG_NET_ACT_NAT=m
 CONFIG_NET_ACT_PEDIT=m
 CONFIG_NET_ACT_SIMP=m
 CONFIG_NET_ACT_SKBEDIT=m
 CONFIG_NET_CLS_IND=y
 CONFIG_DCB=y
+CONFIG_DNS_RESOLVER=y
 # CONFIG_WIRELESS is not set
 CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
 CONFIG_DEVTMPFS=y
@@ -317,10 +175,12 @@ CONFIG_BLK_DEV_SD=y
 CONFIG_SCSI_CONSTANTS=y
 CONFIG_SCSI_LOGGING=y
 CONFIG_SCSI_SAS_ATA=y
+CONFIG_ISCSI_TCP=m
 CONFIG_SCSI_MVSAS=y
 # CONFIG_SCSI_MVSAS_DEBUG is not set
 CONFIG_SCSI_MVSAS_TASKLET=y
 CONFIG_ATA=y
+CONFIG_SATA_AHCI=y
 CONFIG_SATA_SIL24=y
 # CONFIG_ATA_SFF is not set
 CONFIG_MD=y
@@ -343,6 +203,12 @@ CONFIG_DM_MULTIPATH_QL=m
 CONFIG_DM_MULTIPATH_ST=m
 CONFIG_DM_DELAY=m
 CONFIG_DM_UEVENT=y
+CONFIG_TARGET_CORE=m
+CONFIG_TCM_IBLOCK=m
+CONFIG_TCM_FILEIO=m
+CONFIG_TCM_PSCSI=m
+CONFIG_LOOPBACK_TARGET=m
+CONFIG_ISCSI_TARGET=m
 CONFIG_FUSION=y
 CONFIG_FUSION_SAS=y
 CONFIG_NETDEVICES=y
@@ -359,42 +225,8 @@ CONFIG_VETH=m
 CONFIG_NET_DSA_MV88E6060=y
 CONFIG_NET_DSA_MV88E6131=y
 CONFIG_NET_DSA_MV88E6123_61_65=y
-# CONFIG_NET_VENDOR_3COM is not set
-# CONFIG_NET_VENDOR_ADAPTEC is not set
-# CONFIG_NET_VENDOR_ALTEON is not set
-# CONFIG_NET_VENDOR_AMD is not set
-# CONFIG_NET_VENDOR_ATHEROS is not set
-# CONFIG_NET_VENDOR_BROADCOM is not set
-# CONFIG_NET_VENDOR_BROCADE is not set
-# CONFIG_NET_VENDOR_CHELSIO is not set
-# CONFIG_NET_VENDOR_CISCO is not set
-# CONFIG_NET_VENDOR_DEC is not set
-# CONFIG_NET_VENDOR_DLINK is not set
-# CONFIG_NET_VENDOR_EMULEX is not set
-# CONFIG_NET_VENDOR_EXAR is not set
-# CONFIG_NET_VENDOR_HP is not set
-# CONFIG_NET_VENDOR_INTEL is not set
-# CONFIG_NET_VENDOR_MARVELL is not set
-# CONFIG_NET_VENDOR_MELLANOX is not set
-# CONFIG_NET_VENDOR_MICREL is not set
-# CONFIG_NET_VENDOR_MYRI is not set
-# CONFIG_NET_VENDOR_NATSEMI is not set
-# CONFIG_NET_VENDOR_NVIDIA is not set
-# CONFIG_NET_VENDOR_OKI is not set
-# CONFIG_NET_PACKET_ENGINE is not set
-# CONFIG_NET_VENDOR_QLOGIC is not set
-# CONFIG_NET_VENDOR_REALTEK is not set
-# CONFIG_NET_VENDOR_RDC is not set
-# CONFIG_NET_VENDOR_SEEQ is not set
-# CONFIG_NET_VENDOR_SILAN is not set
-# CONFIG_NET_VENDOR_SIS is not set
-# CONFIG_NET_VENDOR_SMSC is not set
-# CONFIG_NET_VENDOR_STMICRO is not set
-# CONFIG_NET_VENDOR_SUN is not set
-# CONFIG_NET_VENDOR_TEHUTI is not set
-# CONFIG_NET_VENDOR_TI is not set
-# CONFIG_TILE_NET is not set
-# CONFIG_NET_VENDOR_VIA is not set
+CONFIG_SKY2=y
+CONFIG_PTP_1588_CLOCK_TILEGX=y
 # CONFIG_WLAN is not set
 # CONFIG_INPUT_MOUSEDEV is not set
 # CONFIG_INPUT_KEYBOARD is not set
@@ -402,6 +234,7 @@ CONFIG_NET_DSA_MV88E6123_61_65=y
 # CONFIG_SERIO is not set
 # CONFIG_VT is not set
 # CONFIG_LEGACY_PTYS is not set
+CONFIG_SERIAL_TILEGX=y
 CONFIG_HW_RANDOM=y
 CONFIG_HW_RANDOM_TIMERIOMEM=m
 CONFIG_I2C=y
@@ -410,13 +243,16 @@ CONFIG_I2C_CHARDEV=y
 CONFIG_WATCHDOG=y
 CONFIG_WATCHDOG_NOWAYOUT=y
 # CONFIG_VGA_ARB is not set
-# CONFIG_HID_SUPPORT is not set
+CONFIG_DRM=m
+CONFIG_DRM_TDFX=m
+CONFIG_DRM_R128=m
+CONFIG_DRM_MGA=m
+CONFIG_DRM_VIA=m
+CONFIG_DRM_SAVAGE=m
 CONFIG_USB=y
-# CONFIG_USB_DEVICE_CLASS is not set
 CONFIG_USB_EHCI_HCD=y
 CONFIG_USB_OHCI_HCD=y
 CONFIG_USB_STORAGE=y
-CONFIG_USB_LIBUSUAL=y
 CONFIG_EDAC=y
 CONFIG_EDAC_MM_EDAC=y
 CONFIG_RTC_CLASS=y
@@ -464,9 +300,8 @@ CONFIG_ECRYPT_FS=m
 CONFIG_CRAMFS=m
 CONFIG_SQUASHFS=m
 CONFIG_NFS_FS=m
-CONFIG_NFS_V3=y
 CONFIG_NFS_V3_ACL=y
-CONFIG_NFS_V4=y
+CONFIG_NFS_V4=m
 CONFIG_NFS_V4_1=y
 CONFIG_NFS_FSCACHE=y
 CONFIG_NFSD=m
@@ -519,25 +354,28 @@ CONFIG_NLS_ISO8859_15=m
 CONFIG_NLS_KOI8_R=m
 CONFIG_NLS_KOI8_U=m
 CONFIG_NLS_UTF8=m
+CONFIG_DLM=m
 CONFIG_DLM_DEBUG=y
+CONFIG_DYNAMIC_DEBUG=y
+CONFIG_DEBUG_INFO=y
+CONFIG_DEBUG_INFO_REDUCED=y
 # CONFIG_ENABLE_WARN_DEPRECATED is not set
-CONFIG_MAGIC_SYSRQ=y
 CONFIG_STRIP_ASM_SYMS=y
 CONFIG_DEBUG_FS=y
 CONFIG_HEADERS_CHECK=y
+# CONFIG_FRAME_POINTER is not set
+CONFIG_DEBUG_FORCE_WEAK_PER_CPU=y
+CONFIG_DEBUG_VM=y
+CONFIG_DEBUG_MEMORY_INIT=y
+CONFIG_DEBUG_STACKOVERFLOW=y
 CONFIG_LOCKUP_DETECTOR=y
 CONFIG_SCHEDSTATS=y
 CONFIG_TIMER_STATS=y
-CONFIG_DEBUG_INFO=y
-CONFIG_DEBUG_INFO_REDUCED=y
-CONFIG_DEBUG_VM=y
-CONFIG_DEBUG_MEMORY_INIT=y
 CONFIG_DEBUG_LIST=y
 CONFIG_DEBUG_CREDENTIALS=y
-CONFIG_DEBUG_FORCE_WEAK_PER_CPU=y
-CONFIG_DYNAMIC_DEBUG=y
+CONFIG_RCU_CPU_STALL_TIMEOUT=60
 CONFIG_ASYNC_RAID6_TEST=m
-CONFIG_DEBUG_STACKOVERFLOW=y
+CONFIG_KGDB=y
 CONFIG_KEYS_DEBUG_PROC_KEYS=y
 CONFIG_SECURITY=y
 CONFIG_SECURITYFS=y
@@ -546,7 +384,6 @@ CONFIG_SECURITY_NETWORK_XFRM=y
 CONFIG_SECURITY_SELINUX=y
 CONFIG_SECURITY_SELINUX_BOOTPARAM=y
 CONFIG_SECURITY_SELINUX_DISABLE=y
-CONFIG_CRYPTO_NULL=m
 CONFIG_CRYPTO_PCRYPT=m
 CONFIG_CRYPTO_CRYPTD=m
 CONFIG_CRYPTO_TEST=m
@@ -559,14 +396,12 @@ CONFIG_CRYPTO_XTS=m
 CONFIG_CRYPTO_HMAC=y
 CONFIG_CRYPTO_XCBC=m
 CONFIG_CRYPTO_VMAC=m
-CONFIG_CRYPTO_CRC32C=y
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_RMD128=m
 CONFIG_CRYPTO_RMD160=m
 CONFIG_CRYPTO_RMD256=m
 CONFIG_CRYPTO_RMD320=m
 CONFIG_CRYPTO_SHA1=y
-CONFIG_CRYPTO_SHA256=m
 CONFIG_CRYPTO_SHA512=m
 CONFIG_CRYPTO_TGR192=m
 CONFIG_CRYPTO_WP512=m
diff --git a/arch/tile/configs/tilepro_defconfig b/arch/tile/configs/tilepro_defconfig
index dd2b8f0c631f..80fc32ed0491 100644
--- a/arch/tile/configs/tilepro_defconfig
+++ b/arch/tile/configs/tilepro_defconfig
@@ -1,15 +1,14 @@
-CONFIG_EXPERIMENTAL=y
-# CONFIG_LOCALVERSION_AUTO is not set
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_AUDIT=y
+CONFIG_NO_HZ=y
+CONFIG_HIGH_RES_TIMERS=y
 CONFIG_BSD_PROCESS_ACCT=y
 CONFIG_BSD_PROCESS_ACCT_V3=y
-CONFIG_FHANDLE=y
 CONFIG_TASKSTATS=y
 CONFIG_TASK_DELAY_ACCT=y
 CONFIG_TASK_XACCT=y
 CONFIG_TASK_IO_ACCOUNTING=y
-CONFIG_AUDIT=y
 CONFIG_LOG_BUF_SHIFT=19
 CONFIG_CGROUPS=y
 CONFIG_CGROUP_DEBUG=y
@@ -17,14 +16,13 @@ CONFIG_CGROUP_DEVICE=y
 CONFIG_CPUSETS=y
 CONFIG_CGROUP_CPUACCT=y
 CONFIG_RESOURCE_COUNTERS=y
-CONFIG_CGROUP_MEMCG=y
-CONFIG_CGROUP_MEMCG_SWAP=y
 CONFIG_CGROUP_SCHED=y
 CONFIG_RT_GROUP_SCHED=y
 CONFIG_BLK_CGROUP=y
 CONFIG_NAMESPACES=y
 CONFIG_RELAY=y
 CONFIG_BLK_DEV_INITRD=y
+CONFIG_RD_XZ=y
 CONFIG_SYSCTL_SYSCALL=y
 CONFIG_EMBEDDED=y
 # CONFIG_COMPAT_BRK is not set
@@ -44,11 +42,10 @@ CONFIG_UNIXWARE_DISKLABEL=y
 CONFIG_SGI_PARTITION=y
 CONFIG_SUN_PARTITION=y
 CONFIG_KARMA_PARTITION=y
-CONFIG_EFI_PARTITION=y
 CONFIG_CFQ_GROUP_IOSCHED=y
-CONFIG_NO_HZ=y
-CONFIG_HIGH_RES_TIMERS=y
 CONFIG_HZ_100=y
+# CONFIG_COMPACTION is not set
+CONFIG_PREEMPT_VOLUNTARY=y
 CONFIG_PCI_DEBUG=y
 # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
 CONFIG_BINFMT_MISC=y
@@ -122,16 +119,15 @@ CONFIG_NF_CONNTRACK_PPTP=m
 CONFIG_NF_CONNTRACK_SANE=m
 CONFIG_NF_CONNTRACK_SIP=m
 CONFIG_NF_CONNTRACK_TFTP=m
-CONFIG_NETFILTER_TPROXY=m
 CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
 CONFIG_NETFILTER_XT_TARGET_CONNMARK=m
 CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m
-CONFIG_NETFILTER_XT_TARGET_CT=m
 CONFIG_NETFILTER_XT_TARGET_DSCP=m
 CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m
 CONFIG_NETFILTER_XT_TARGET_MARK=m
 CONFIG_NETFILTER_XT_TARGET_NFLOG=m
 CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
+CONFIG_NETFILTER_XT_TARGET_NOTRACK=m
 CONFIG_NETFILTER_XT_TARGET_TEE=m
 CONFIG_NETFILTER_XT_TARGET_TPROXY=m
 CONFIG_NETFILTER_XT_TARGET_TRACE=m
@@ -189,14 +185,12 @@ CONFIG_IP_VS_SED=m
 CONFIG_IP_VS_NQ=m
 CONFIG_NF_CONNTRACK_IPV4=m
 # CONFIG_NF_CONNTRACK_PROC_COMPAT is not set
-CONFIG_IP_NF_QUEUE=m
 CONFIG_IP_NF_IPTABLES=y
 CONFIG_IP_NF_MATCH_AH=m
 CONFIG_IP_NF_MATCH_ECN=m
 CONFIG_IP_NF_MATCH_TTL=m
 CONFIG_IP_NF_FILTER=y
 CONFIG_IP_NF_TARGET_REJECT=y
-CONFIG_IP_NF_TARGET_LOG=m
 CONFIG_IP_NF_TARGET_ULOG=m
 CONFIG_IP_NF_MANGLE=m
 CONFIG_IP_NF_TARGET_ECN=m
@@ -207,8 +201,6 @@ CONFIG_IP_NF_ARPTABLES=m
 CONFIG_IP_NF_ARPFILTER=m
 CONFIG_IP_NF_ARP_MANGLE=m
 CONFIG_NF_CONNTRACK_IPV6=m
-CONFIG_IP6_NF_QUEUE=m
-CONFIG_IP6_NF_IPTABLES=m
 CONFIG_IP6_NF_MATCH_AH=m
 CONFIG_IP6_NF_MATCH_EUI64=m
 CONFIG_IP6_NF_MATCH_FRAG=m
@@ -218,7 +210,6 @@ CONFIG_IP6_NF_MATCH_IPV6HEADER=m
 CONFIG_IP6_NF_MATCH_MH=m
 CONFIG_IP6_NF_MATCH_RT=m
 CONFIG_IP6_NF_TARGET_HL=m
-CONFIG_IP6_NF_TARGET_LOG=m
 CONFIG_IP6_NF_FILTER=m
 CONFIG_IP6_NF_TARGET_REJECT=m
 CONFIG_IP6_NF_MANGLE=m
@@ -249,7 +240,6 @@ CONFIG_BRIDGE_EBT_NFLOG=m
 CONFIG_RDS=m
 CONFIG_RDS_TCP=m
 CONFIG_BRIDGE=m
-CONFIG_NET_DSA=y
 CONFIG_VLAN_8021Q=m
 CONFIG_VLAN_8021Q_GVRP=y
 CONFIG_PHONET=m
@@ -297,6 +287,7 @@ CONFIG_NET_ACT_SIMP=m
 CONFIG_NET_ACT_SKBEDIT=m
 CONFIG_NET_CLS_IND=y
 CONFIG_DCB=y
+CONFIG_DNS_RESOLVER=y
 # CONFIG_WIRELESS is not set
 CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
 CONFIG_DEVTMPFS=y
@@ -354,40 +345,7 @@ CONFIG_NET_DSA_MV88E6060=y
 CONFIG_NET_DSA_MV88E6131=y
 CONFIG_NET_DSA_MV88E6123_61_65=y
 # CONFIG_NET_VENDOR_3COM is not set
-# CONFIG_NET_VENDOR_ADAPTEC is not set
-# CONFIG_NET_VENDOR_ALTEON is not set
-# CONFIG_NET_VENDOR_AMD is not set
-# CONFIG_NET_VENDOR_ATHEROS is not set
-# CONFIG_NET_VENDOR_BROADCOM is not set
-# CONFIG_NET_VENDOR_BROCADE is not set
-# CONFIG_NET_VENDOR_CHELSIO is not set
-# CONFIG_NET_VENDOR_CISCO is not set
-# CONFIG_NET_VENDOR_DEC is not set
-# CONFIG_NET_VENDOR_DLINK is not set
-# CONFIG_NET_VENDOR_EMULEX is not set
-# CONFIG_NET_VENDOR_EXAR is not set
-# CONFIG_NET_VENDOR_HP is not set
-# CONFIG_NET_VENDOR_INTEL is not set
-# CONFIG_NET_VENDOR_MARVELL is not set
-# CONFIG_NET_VENDOR_MELLANOX is not set
-# CONFIG_NET_VENDOR_MICREL is not set
-# CONFIG_NET_VENDOR_MYRI is not set
-# CONFIG_NET_VENDOR_NATSEMI is not set
-# CONFIG_NET_VENDOR_NVIDIA is not set
-# CONFIG_NET_VENDOR_OKI is not set
-# CONFIG_NET_PACKET_ENGINE is not set
-# CONFIG_NET_VENDOR_QLOGIC is not set
-# CONFIG_NET_VENDOR_REALTEK is not set
-# CONFIG_NET_VENDOR_RDC is not set
-# CONFIG_NET_VENDOR_SEEQ is not set
-# CONFIG_NET_VENDOR_SILAN is not set
-# CONFIG_NET_VENDOR_SIS is not set
-# CONFIG_NET_VENDOR_SMSC is not set
-# CONFIG_NET_VENDOR_STMICRO is not set
-# CONFIG_NET_VENDOR_SUN is not set
-# CONFIG_NET_VENDOR_TEHUTI is not set
-# CONFIG_NET_VENDOR_TI is not set
-# CONFIG_NET_VENDOR_VIA is not set
+CONFIG_E1000E=y
 # CONFIG_WLAN is not set
 # CONFIG_INPUT_MOUSEDEV is not set
 # CONFIG_INPUT_KEYBOARD is not set
@@ -403,7 +361,6 @@ CONFIG_I2C_CHARDEV=y
 CONFIG_WATCHDOG=y
 CONFIG_WATCHDOG_NOWAYOUT=y
 # CONFIG_VGA_ARB is not set
-# CONFIG_HID_SUPPORT is not set
 # CONFIG_USB_SUPPORT is not set
 CONFIG_EDAC=y
 CONFIG_EDAC_MM_EDAC=y
@@ -448,13 +405,13 @@ CONFIG_PROC_KCORE=y
 CONFIG_TMPFS=y
 CONFIG_TMPFS_POSIX_ACL=y
 CONFIG_HUGETLBFS=y
+CONFIG_CONFIGFS_FS=m
 CONFIG_ECRYPT_FS=m
 CONFIG_CRAMFS=m
 CONFIG_SQUASHFS=m
 CONFIG_NFS_FS=m
-CONFIG_NFS_V3=y
 CONFIG_NFS_V3_ACL=y
-CONFIG_NFS_V4=y
+CONFIG_NFS_V4=m
 CONFIG_NFS_V4_1=y
 CONFIG_NFS_FSCACHE=y
 CONFIG_NFSD=m
@@ -508,26 +465,29 @@ CONFIG_NLS_ISO8859_15=m
 CONFIG_NLS_KOI8_R=m
 CONFIG_NLS_KOI8_U=m
 CONFIG_NLS_UTF8=m
+CONFIG_DLM=m
 CONFIG_DLM_DEBUG=y
+CONFIG_DYNAMIC_DEBUG=y
+CONFIG_DEBUG_INFO=y
+CONFIG_DEBUG_INFO_REDUCED=y
 # CONFIG_ENABLE_WARN_DEPRECATED is not set
 CONFIG_FRAME_WARN=2048
-CONFIG_MAGIC_SYSRQ=y
 CONFIG_STRIP_ASM_SYMS=y
 CONFIG_DEBUG_FS=y
 CONFIG_HEADERS_CHECK=y
+# CONFIG_FRAME_POINTER is not set
+CONFIG_DEBUG_FORCE_WEAK_PER_CPU=y
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_DEBUG_VM=y
+CONFIG_DEBUG_MEMORY_INIT=y
+CONFIG_DEBUG_STACKOVERFLOW=y
 CONFIG_LOCKUP_DETECTOR=y
 CONFIG_SCHEDSTATS=y
 CONFIG_TIMER_STATS=y
-CONFIG_DEBUG_INFO=y
-CONFIG_DEBUG_INFO_REDUCED=y
-CONFIG_DEBUG_VM=y
-CONFIG_DEBUG_MEMORY_INIT=y
 CONFIG_DEBUG_LIST=y
 CONFIG_DEBUG_CREDENTIALS=y
-CONFIG_DEBUG_FORCE_WEAK_PER_CPU=y
-CONFIG_DYNAMIC_DEBUG=y
+CONFIG_RCU_CPU_STALL_TIMEOUT=60
 CONFIG_ASYNC_RAID6_TEST=m
-CONFIG_DEBUG_STACKOVERFLOW=y
 CONFIG_KEYS_DEBUG_PROC_KEYS=y
 CONFIG_SECURITY=y
 CONFIG_SECURITYFS=y
@@ -536,7 +496,6 @@ CONFIG_SECURITY_NETWORK_XFRM=y
 CONFIG_SECURITY_SELINUX=y
 CONFIG_SECURITY_SELINUX_BOOTPARAM=y
 CONFIG_SECURITY_SELINUX_DISABLE=y
-CONFIG_CRYPTO_NULL=m
 CONFIG_CRYPTO_PCRYPT=m
 CONFIG_CRYPTO_CRYPTD=m
 CONFIG_CRYPTO_TEST=m
@@ -549,14 +508,12 @@ CONFIG_CRYPTO_XTS=m
 CONFIG_CRYPTO_HMAC=y
 CONFIG_CRYPTO_XCBC=m
 CONFIG_CRYPTO_VMAC=m
-CONFIG_CRYPTO_CRC32C=y
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_RMD128=m
 CONFIG_CRYPTO_RMD160=m
 CONFIG_CRYPTO_RMD256=m
 CONFIG_CRYPTO_RMD320=m
 CONFIG_CRYPTO_SHA1=y
-CONFIG_CRYPTO_SHA256=m
 CONFIG_CRYPTO_SHA512=m
 CONFIG_CRYPTO_TGR192=m
 CONFIG_CRYPTO_WP512=m
diff --git a/arch/tile/gxio/Kconfig b/arch/tile/gxio/Kconfig
index d221f8d6de8b..d4e10d58071b 100644
--- a/arch/tile/gxio/Kconfig
+++ b/arch/tile/gxio/Kconfig
@@ -26,3 +26,8 @@ config TILE_GXIO_TRIO
 config TILE_GXIO_USB_HOST
 	bool
 	select TILE_GXIO
+
+# Support direct access to the TILE-Gx UART hardware from kernel space.
+config TILE_GXIO_UART
+	bool
+	select TILE_GXIO
diff --git a/arch/tile/gxio/Makefile b/arch/tile/gxio/Makefile
index 8684bcaa74ea..26ae2c727467 100644
--- a/arch/tile/gxio/Makefile
+++ b/arch/tile/gxio/Makefile
@@ -6,4 +6,5 @@ obj-$(CONFIG_TILE_GXIO) += iorpc_globals.o kiorpc.o
 obj-$(CONFIG_TILE_GXIO_DMA) += dma_queue.o
 obj-$(CONFIG_TILE_GXIO_MPIPE) += mpipe.o iorpc_mpipe.o iorpc_mpipe_info.o
 obj-$(CONFIG_TILE_GXIO_TRIO) += trio.o iorpc_trio.o
+obj-$(CONFIG_TILE_GXIO_UART) += uart.o iorpc_uart.o
 obj-$(CONFIG_TILE_GXIO_USB_HOST) += usb_host.o iorpc_usb_host.o
diff --git a/arch/tile/gxio/iorpc_trio.c b/arch/tile/gxio/iorpc_trio.c
index cef4b2209cda..da6e18e049c3 100644
--- a/arch/tile/gxio/iorpc_trio.c
+++ b/arch/tile/gxio/iorpc_trio.c
@@ -61,6 +61,29 @@ int gxio_trio_alloc_memory_maps(gxio_trio_context_t * context,
 
 EXPORT_SYMBOL(gxio_trio_alloc_memory_maps);
 
+struct alloc_scatter_queues_param {
+	unsigned int count;
+	unsigned int first;
+	unsigned int flags;
+};
+
+int gxio_trio_alloc_scatter_queues(gxio_trio_context_t * context,
+				   unsigned int count, unsigned int first,
+				   unsigned int flags)
+{
+	struct alloc_scatter_queues_param temp;
+	struct alloc_scatter_queues_param *params = &temp;
+
+	params->count = count;
+	params->first = first;
+	params->flags = flags;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params),
+			     GXIO_TRIO_OP_ALLOC_SCATTER_QUEUES);
+}
+
+EXPORT_SYMBOL(gxio_trio_alloc_scatter_queues);
 
 struct alloc_pio_regions_param {
 	unsigned int count;
diff --git a/arch/tile/gxio/iorpc_uart.c b/arch/tile/gxio/iorpc_uart.c
new file mode 100644
index 000000000000..b9a6d6193d73
--- /dev/null
+++ b/arch/tile/gxio/iorpc_uart.c
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* This file is machine-generated; DO NOT EDIT! */
+#include "gxio/iorpc_uart.h"
+
+struct cfg_interrupt_param {
+	union iorpc_interrupt interrupt;
+};
+
+int gxio_uart_cfg_interrupt(gxio_uart_context_t *context, int inter_x,
+			    int inter_y, int inter_ipi, int inter_event)
+{
+	struct cfg_interrupt_param temp;
+	struct cfg_interrupt_param *params = &temp;
+
+	params->interrupt.kernel.x = inter_x;
+	params->interrupt.kernel.y = inter_y;
+	params->interrupt.kernel.ipi = inter_ipi;
+	params->interrupt.kernel.event = inter_event;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params), GXIO_UART_OP_CFG_INTERRUPT);
+}
+
+EXPORT_SYMBOL(gxio_uart_cfg_interrupt);
+
+struct get_mmio_base_param {
+	HV_PTE base;
+};
+
+int gxio_uart_get_mmio_base(gxio_uart_context_t *context, HV_PTE *base)
+{
+	int __result;
+	struct get_mmio_base_param temp;
+	struct get_mmio_base_param *params = &temp;
+
+	__result =
+	    hv_dev_pread(context->fd, 0, (HV_VirtAddr) params, sizeof(*params),
+			 GXIO_UART_OP_GET_MMIO_BASE);
+	*base = params->base;
+
+	return __result;
+}
+
+EXPORT_SYMBOL(gxio_uart_get_mmio_base);
+
+struct check_mmio_offset_param {
+	unsigned long offset;
+	unsigned long size;
+};
+
+int gxio_uart_check_mmio_offset(gxio_uart_context_t *context,
+				unsigned long offset, unsigned long size)
+{
+	struct check_mmio_offset_param temp;
+	struct check_mmio_offset_param *params = &temp;
+
+	params->offset = offset;
+	params->size = size;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params), GXIO_UART_OP_CHECK_MMIO_OFFSET);
+}
+
+EXPORT_SYMBOL(gxio_uart_check_mmio_offset);
diff --git a/arch/tile/gxio/uart.c b/arch/tile/gxio/uart.c
new file mode 100644
index 000000000000..ba585175ef88
--- /dev/null
+++ b/arch/tile/gxio/uart.c
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/*
+ * Implementation of UART gxio calls.
+ */
+
+#include <linux/io.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+
+#include <gxio/uart.h>
+#include <gxio/iorpc_globals.h>
+#include <gxio/iorpc_uart.h>
+#include <gxio/kiorpc.h>
+
+int gxio_uart_init(gxio_uart_context_t *context, int uart_index)
+{
+	char file[32];
+	int fd;
+
+	snprintf(file, sizeof(file), "uart/%d/iorpc", uart_index);
+	fd = hv_dev_open((HV_VirtAddr) file, 0);
+	if (fd < 0) {
+		if (fd >= GXIO_ERR_MIN && fd <= GXIO_ERR_MAX)
+			return fd;
+		else
+			return -ENODEV;
+	}
+
+	context->fd = fd;
+
+	/* Map in the MMIO space. */
+	context->mmio_base = (void __force *)
+		iorpc_ioremap(fd, HV_UART_MMIO_OFFSET, HV_UART_MMIO_SIZE);
+
+	if (context->mmio_base == NULL) {
+		hv_dev_close(context->fd);
+		context->fd = -1;
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(gxio_uart_init);
+
+int gxio_uart_destroy(gxio_uart_context_t *context)
+{
+	iounmap((void __force __iomem *)(context->mmio_base));
+	hv_dev_close(context->fd);
+
+	context->mmio_base = NULL;
+	context->fd = -1;
+
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(gxio_uart_destroy);
+
+/* UART register write wrapper. */
+void gxio_uart_write(gxio_uart_context_t *context, uint64_t offset,
+		     uint64_t word)
+{
+	__gxio_mmio_write(context->mmio_base + offset, word);
+}
+
+EXPORT_SYMBOL_GPL(gxio_uart_write);
+
+/* UART register read wrapper. */
+uint64_t gxio_uart_read(gxio_uart_context_t *context, uint64_t offset)
+{
+	return __gxio_mmio_read(context->mmio_base + offset);
+}
+
+EXPORT_SYMBOL_GPL(gxio_uart_read);
diff --git a/arch/tile/include/arch/trio.h b/arch/tile/include/arch/trio.h
index d3000a871a21..c0ddedcae085 100644
--- a/arch/tile/include/arch/trio.h
+++ b/arch/tile/include/arch/trio.h
@@ -23,6 +23,45 @@
 #ifndef __ASSEMBLER__
 
 /*
+ * Map SQ Doorbell Format.
+ * This describes the format of the write-only doorbell register that exists
+ * in the last 8-bytes of the MAP_SQ_BASE/LIM range.  This register is only
+ * writable from PCIe space.  Writes to this register will not be written to
+ * Tile memory space and thus no IO VA translation is required if the last
+ * page of the BASE/LIM range is not otherwise written.
+ */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /*
+     * When written with a 1, the associated MAP_SQ region's doorbell
+     * interrupt will be triggered once all previous writes are visible to
+     * Tile software.
+     */
+    uint_reg_t doorbell   : 1;
+    /*
+     * When written with a 1, the descriptor at the head of the associated
+     * MAP_SQ's FIFO will be dequeued.
+     */
+    uint_reg_t pop        : 1;
+    /* Reserved. */
+    uint_reg_t __reserved : 62;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t __reserved : 62;
+    uint_reg_t pop        : 1;
+    uint_reg_t doorbell   : 1;
+#endif
+  };
+
+  uint_reg_t word;
+} TRIO_MAP_SQ_DOORBELL_FMT_t;
+
+
+/*
  * Tile PIO Region Configuration - CFG Address Format.
  * This register describes the address format for PIO accesses when the
  * associated region is setup with TYPE=CFG.
diff --git a/arch/tile/include/arch/uart.h b/arch/tile/include/arch/uart.h
new file mode 100644
index 000000000000..07966970adad
--- /dev/null
+++ b/arch/tile/include/arch/uart.h
@@ -0,0 +1,300 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* Machine-generated file; do not edit. */
+
+#ifndef __ARCH_UART_H__
+#define __ARCH_UART_H__
+
+#include <arch/abi.h>
+#include <arch/uart_def.h>
+
+#ifndef __ASSEMBLER__
+
+/* Divisor. */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /*
+     * Baud Rate Divisor.  Desired_baud_rate = REF_CLK frequency / (baud *
+     * 16).
+     *                       Note: REF_CLK is always 125 MHz, the default
+     * divisor = 68, baud rate = 125M/(68*16) = 115200 baud.
+     */
+    uint_reg_t divisor    : 12;
+    /* Reserved. */
+    uint_reg_t __reserved : 52;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t __reserved : 52;
+    uint_reg_t divisor    : 12;
+#endif
+  };
+
+  uint_reg_t word;
+} UART_DIVISOR_t;
+
+/* FIFO Count. */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /*
+     * n: n active entries in the receive FIFO (max is 2**8). Each entry has
+     * 8 bits.
+     * 0: no active entry in the receive FIFO (that is empty).
+     */
+    uint_reg_t rfifo_count  : 9;
+    /* Reserved. */
+    uint_reg_t __reserved_0 : 7;
+    /*
+     * n: n active entries in the transmit FIFO (max is 2**8). Each entry has
+     * 8 bits.
+     * 0: no active entry in the transmit FIFO (that is empty).
+     */
+    uint_reg_t tfifo_count  : 9;
+    /* Reserved. */
+    uint_reg_t __reserved_1 : 7;
+    /*
+     * n: n active entries in the write FIFO (max is 2**2). Each entry has 8
+     * bits.
+     * 0: no active entry in the write FIFO (that is empty).
+     */
+    uint_reg_t wfifo_count  : 3;
+    /* Reserved. */
+    uint_reg_t __reserved_2 : 29;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t __reserved_2 : 29;
+    uint_reg_t wfifo_count  : 3;
+    uint_reg_t __reserved_1 : 7;
+    uint_reg_t tfifo_count  : 9;
+    uint_reg_t __reserved_0 : 7;
+    uint_reg_t rfifo_count  : 9;
+#endif
+  };
+
+  uint_reg_t word;
+} UART_FIFO_COUNT_t;
+
+/* FLAG. */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /* Reserved. */
+    uint_reg_t __reserved_0 : 1;
+    /* 1: receive FIFO is empty */
+    uint_reg_t rfifo_empty  : 1;
+    /* 1: write FIFO is empty. */
+    uint_reg_t wfifo_empty  : 1;
+    /* 1: transmit FIFO is empty. */
+    uint_reg_t tfifo_empty  : 1;
+    /* 1: receive FIFO is full. */
+    uint_reg_t rfifo_full   : 1;
+    /* 1: write FIFO is full. */
+    uint_reg_t wfifo_full   : 1;
+    /* 1: transmit FIFO is full. */
+    uint_reg_t tfifo_full   : 1;
+    /* Reserved. */
+    uint_reg_t __reserved_1 : 57;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t __reserved_1 : 57;
+    uint_reg_t tfifo_full   : 1;
+    uint_reg_t wfifo_full   : 1;
+    uint_reg_t rfifo_full   : 1;
+    uint_reg_t tfifo_empty  : 1;
+    uint_reg_t wfifo_empty  : 1;
+    uint_reg_t rfifo_empty  : 1;
+    uint_reg_t __reserved_0 : 1;
+#endif
+  };
+
+  uint_reg_t word;
+} UART_FLAG_t;
+
+/*
+ * Interrupt Vector Mask.
+ * Each bit in this register corresponds to a specific interrupt. When set,
+ * the associated interrupt will not be dispatched.
+ */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /* Read data FIFO read and no data available */
+    uint_reg_t rdat_err       : 1;
+    /* Write FIFO was written but it was full */
+    uint_reg_t wdat_err       : 1;
+    /* Stop bit not found when current data was received */
+    uint_reg_t frame_err      : 1;
+    /* Parity error was detected when current data was received */
+    uint_reg_t parity_err     : 1;
+    /* Data was received but the receive FIFO was full */
+    uint_reg_t rfifo_overflow : 1;
+    /*
+     * An almost full event is reached when data is to be written to the
+     * receive FIFO, and the receive FIFO has more than or equal to
+     * BUFFER_THRESHOLD.RFIFO_AFULL bytes.
+     */
+    uint_reg_t rfifo_afull    : 1;
+    /* Reserved. */
+    uint_reg_t __reserved_0   : 1;
+    /* An entry in the transmit FIFO was popped */
+    uint_reg_t tfifo_re       : 1;
+    /* An entry has been pushed into the receive FIFO */
+    uint_reg_t rfifo_we       : 1;
+    /* An entry of the write FIFO has been popped */
+    uint_reg_t wfifo_re       : 1;
+    /* Rshim read receive FIFO in protocol mode */
+    uint_reg_t rfifo_err      : 1;
+    /*
+     * An almost empty event is reached when data is to be read from the
+     * transmit FIFO, and the transmit FIFO has less than or equal to
+     * BUFFER_THRESHOLD.TFIFO_AEMPTY bytes.
+     */
+    uint_reg_t tfifo_aempty   : 1;
+    /* Reserved. */
+    uint_reg_t __reserved_1   : 52;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t __reserved_1   : 52;
+    uint_reg_t tfifo_aempty   : 1;
+    uint_reg_t rfifo_err      : 1;
+    uint_reg_t wfifo_re       : 1;
+    uint_reg_t rfifo_we       : 1;
+    uint_reg_t tfifo_re       : 1;
+    uint_reg_t __reserved_0   : 1;
+    uint_reg_t rfifo_afull    : 1;
+    uint_reg_t rfifo_overflow : 1;
+    uint_reg_t parity_err     : 1;
+    uint_reg_t frame_err      : 1;
+    uint_reg_t wdat_err       : 1;
+    uint_reg_t rdat_err       : 1;
+#endif
+  };
+
+  uint_reg_t word;
+} UART_INTERRUPT_MASK_t;
+
+/*
+ * Interrupt vector, write-one-to-clear.
+ * Each bit in this register corresponds to a specific interrupt. Hardware
+ * sets the bit when the associated condition has occurred. Writing a 1
+ * clears the status bit.
+ */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /* Read data FIFO read and no data available */
+    uint_reg_t rdat_err       : 1;
+    /* Write FIFO was written but it was full */
+    uint_reg_t wdat_err       : 1;
+    /* Stop bit not found when current data was received */
+    uint_reg_t frame_err      : 1;
+    /* Parity error was detected when current data was received */
+    uint_reg_t parity_err     : 1;
+    /* Data was received but the receive FIFO was full */
+    uint_reg_t rfifo_overflow : 1;
+    /*
+     * Data was received and the receive FIFO is now almost full (more than
+     * BUFFER_THRESHOLD.RFIFO_AFULL bytes in it)
+     */
+    uint_reg_t rfifo_afull    : 1;
+    /* Reserved. */
+    uint_reg_t __reserved_0   : 1;
+    /* An entry in the transmit FIFO was popped */
+    uint_reg_t tfifo_re       : 1;
+    /* An entry has been pushed into the receive FIFO */
+    uint_reg_t rfifo_we       : 1;
+    /* An entry of the write FIFO has been popped */
+    uint_reg_t wfifo_re       : 1;
+    /* Rshim read receive FIFO in protocol mode */
+    uint_reg_t rfifo_err      : 1;
+    /*
+     * Data was read from the transmit FIFO and now it is almost empty (less
+     * than or equal to BUFFER_THRESHOLD.TFIFO_AEMPTY bytes in it).
+     */
+    uint_reg_t tfifo_aempty   : 1;
+    /* Reserved. */
+    uint_reg_t __reserved_1   : 52;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t __reserved_1   : 52;
+    uint_reg_t tfifo_aempty   : 1;
+    uint_reg_t rfifo_err      : 1;
+    uint_reg_t wfifo_re       : 1;
+    uint_reg_t rfifo_we       : 1;
+    uint_reg_t tfifo_re       : 1;
+    uint_reg_t __reserved_0   : 1;
+    uint_reg_t rfifo_afull    : 1;
+    uint_reg_t rfifo_overflow : 1;
+    uint_reg_t parity_err     : 1;
+    uint_reg_t frame_err      : 1;
+    uint_reg_t wdat_err       : 1;
+    uint_reg_t rdat_err       : 1;
+#endif
+  };
+
+  uint_reg_t word;
+} UART_INTERRUPT_STATUS_t;
+
+/* Type. */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /* Number of stop bits, rx and tx */
+    uint_reg_t sbits        : 1;
+    /* Reserved. */
+    uint_reg_t __reserved_0 : 1;
+    /* Data word size, rx and tx */
+    uint_reg_t dbits        : 1;
+    /* Reserved. */
+    uint_reg_t __reserved_1 : 1;
+    /* Parity selection, rx and tx */
+    uint_reg_t ptype        : 3;
+    /* Reserved. */
+    uint_reg_t __reserved_2 : 57;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t __reserved_2 : 57;
+    uint_reg_t ptype        : 3;
+    uint_reg_t __reserved_1 : 1;
+    uint_reg_t dbits        : 1;
+    uint_reg_t __reserved_0 : 1;
+    uint_reg_t sbits        : 1;
+#endif
+  };
+
+  uint_reg_t word;
+} UART_TYPE_t;
+#endif /* !defined(__ASSEMBLER__) */
+
+#endif /* !defined(__ARCH_UART_H__) */
diff --git a/arch/tile/include/arch/uart_def.h b/arch/tile/include/arch/uart_def.h
new file mode 100644
index 000000000000..42bcaf535379
--- /dev/null
+++ b/arch/tile/include/arch/uart_def.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* Machine-generated file; do not edit. */
+
+#ifndef __ARCH_UART_DEF_H__
+#define __ARCH_UART_DEF_H__
+#define UART_DIVISOR 0x0158
+#define UART_FIFO_COUNT 0x0110
+#define UART_FLAG 0x0108
+#define UART_INTERRUPT_MASK 0x0208
+#define UART_INTERRUPT_MASK__RDAT_ERR_SHIFT 0
+#define UART_INTERRUPT_MASK__RDAT_ERR_WIDTH 1
+#define UART_INTERRUPT_MASK__RDAT_ERR_RESET_VAL 1
+#define UART_INTERRUPT_MASK__RDAT_ERR_RMASK 0x1
+#define UART_INTERRUPT_MASK__RDAT_ERR_MASK  0x1
+#define UART_INTERRUPT_MASK__RDAT_ERR_FIELD 0,0
+#define UART_INTERRUPT_MASK__WDAT_ERR_SHIFT 1
+#define UART_INTERRUPT_MASK__WDAT_ERR_WIDTH 1
+#define UART_INTERRUPT_MASK__WDAT_ERR_RESET_VAL 1
+#define UART_INTERRUPT_MASK__WDAT_ERR_RMASK 0x1
+#define UART_INTERRUPT_MASK__WDAT_ERR_MASK  0x2
+#define UART_INTERRUPT_MASK__WDAT_ERR_FIELD 1,1
+#define UART_INTERRUPT_MASK__FRAME_ERR_SHIFT 2
+#define UART_INTERRUPT_MASK__FRAME_ERR_WIDTH 1
+#define UART_INTERRUPT_MASK__FRAME_ERR_RESET_VAL 1
+#define UART_INTERRUPT_MASK__FRAME_ERR_RMASK 0x1
+#define UART_INTERRUPT_MASK__FRAME_ERR_MASK  0x4
+#define UART_INTERRUPT_MASK__FRAME_ERR_FIELD 2,2
+#define UART_INTERRUPT_MASK__PARITY_ERR_SHIFT 3
+#define UART_INTERRUPT_MASK__PARITY_ERR_WIDTH 1
+#define UART_INTERRUPT_MASK__PARITY_ERR_RESET_VAL 1
+#define UART_INTERRUPT_MASK__PARITY_ERR_RMASK 0x1
+#define UART_INTERRUPT_MASK__PARITY_ERR_MASK  0x8
+#define UART_INTERRUPT_MASK__PARITY_ERR_FIELD 3,3
+#define UART_INTERRUPT_MASK__RFIFO_OVERFLOW_SHIFT 4
+#define UART_INTERRUPT_MASK__RFIFO_OVERFLOW_WIDTH 1
+#define UART_INTERRUPT_MASK__RFIFO_OVERFLOW_RESET_VAL 1
+#define UART_INTERRUPT_MASK__RFIFO_OVERFLOW_RMASK 0x1
+#define UART_INTERRUPT_MASK__RFIFO_OVERFLOW_MASK  0x10
+#define UART_INTERRUPT_MASK__RFIFO_OVERFLOW_FIELD 4,4
+#define UART_INTERRUPT_MASK__RFIFO_AFULL_SHIFT 5
+#define UART_INTERRUPT_MASK__RFIFO_AFULL_WIDTH 1
+#define UART_INTERRUPT_MASK__RFIFO_AFULL_RESET_VAL 1
+#define UART_INTERRUPT_MASK__RFIFO_AFULL_RMASK 0x1
+#define UART_INTERRUPT_MASK__RFIFO_AFULL_MASK  0x20
+#define UART_INTERRUPT_MASK__RFIFO_AFULL_FIELD 5,5
+#define UART_INTERRUPT_MASK__TFIFO_RE_SHIFT 7
+#define UART_INTERRUPT_MASK__TFIFO_RE_WIDTH 1
+#define UART_INTERRUPT_MASK__TFIFO_RE_RESET_VAL 1
+#define UART_INTERRUPT_MASK__TFIFO_RE_RMASK 0x1
+#define UART_INTERRUPT_MASK__TFIFO_RE_MASK  0x80
+#define UART_INTERRUPT_MASK__TFIFO_RE_FIELD 7,7
+#define UART_INTERRUPT_MASK__RFIFO_WE_SHIFT 8
+#define UART_INTERRUPT_MASK__RFIFO_WE_WIDTH 1
+#define UART_INTERRUPT_MASK__RFIFO_WE_RESET_VAL 1
+#define UART_INTERRUPT_MASK__RFIFO_WE_RMASK 0x1
+#define UART_INTERRUPT_MASK__RFIFO_WE_MASK  0x100
+#define UART_INTERRUPT_MASK__RFIFO_WE_FIELD 8,8
+#define UART_INTERRUPT_MASK__WFIFO_RE_SHIFT 9
+#define UART_INTERRUPT_MASK__WFIFO_RE_WIDTH 1
+#define UART_INTERRUPT_MASK__WFIFO_RE_RESET_VAL 1
+#define UART_INTERRUPT_MASK__WFIFO_RE_RMASK 0x1
+#define UART_INTERRUPT_MASK__WFIFO_RE_MASK  0x200
+#define UART_INTERRUPT_MASK__WFIFO_RE_FIELD 9,9
+#define UART_INTERRUPT_MASK__RFIFO_ERR_SHIFT 10
+#define UART_INTERRUPT_MASK__RFIFO_ERR_WIDTH 1
+#define UART_INTERRUPT_MASK__RFIFO_ERR_RESET_VAL 1
+#define UART_INTERRUPT_MASK__RFIFO_ERR_RMASK 0x1
+#define UART_INTERRUPT_MASK__RFIFO_ERR_MASK  0x400
+#define UART_INTERRUPT_MASK__RFIFO_ERR_FIELD 10,10
+#define UART_INTERRUPT_MASK__TFIFO_AEMPTY_SHIFT 11
+#define UART_INTERRUPT_MASK__TFIFO_AEMPTY_WIDTH 1
+#define UART_INTERRUPT_MASK__TFIFO_AEMPTY_RESET_VAL 1
+#define UART_INTERRUPT_MASK__TFIFO_AEMPTY_RMASK 0x1
+#define UART_INTERRUPT_MASK__TFIFO_AEMPTY_MASK  0x800
+#define UART_INTERRUPT_MASK__TFIFO_AEMPTY_FIELD 11,11
+#define UART_INTERRUPT_STATUS 0x0200
+#define UART_RECEIVE_DATA 0x0148
+#define UART_TRANSMIT_DATA 0x0140
+#define UART_TYPE 0x0160
+#define UART_TYPE__SBITS_SHIFT 0
+#define UART_TYPE__SBITS_WIDTH 1
+#define UART_TYPE__SBITS_RESET_VAL 1
+#define UART_TYPE__SBITS_RMASK 0x1
+#define UART_TYPE__SBITS_MASK  0x1
+#define UART_TYPE__SBITS_FIELD 0,0
+#define UART_TYPE__SBITS_VAL_ONE_SBITS 0x0
+#define UART_TYPE__SBITS_VAL_TWO_SBITS 0x1
+#define UART_TYPE__DBITS_SHIFT 2
+#define UART_TYPE__DBITS_WIDTH 1
+#define UART_TYPE__DBITS_RESET_VAL 0
+#define UART_TYPE__DBITS_RMASK 0x1
+#define UART_TYPE__DBITS_MASK  0x4
+#define UART_TYPE__DBITS_FIELD 2,2
+#define UART_TYPE__DBITS_VAL_EIGHT_DBITS 0x0
+#define UART_TYPE__DBITS_VAL_SEVEN_DBITS 0x1
+#define UART_TYPE__PTYPE_SHIFT 4
+#define UART_TYPE__PTYPE_WIDTH 3
+#define UART_TYPE__PTYPE_RESET_VAL 3
+#define UART_TYPE__PTYPE_RMASK 0x7
+#define UART_TYPE__PTYPE_MASK  0x70
+#define UART_TYPE__PTYPE_FIELD 4,6
+#define UART_TYPE__PTYPE_VAL_NONE 0x0
+#define UART_TYPE__PTYPE_VAL_MARK 0x1
+#define UART_TYPE__PTYPE_VAL_SPACE 0x2
+#define UART_TYPE__PTYPE_VAL_EVEN 0x3
+#define UART_TYPE__PTYPE_VAL_ODD 0x4
+#endif /* !defined(__ARCH_UART_DEF_H__) */
diff --git a/arch/tile/include/asm/Kbuild b/arch/tile/include/asm/Kbuild
index b17b9b8e53cd..664d6ad23f80 100644
--- a/arch/tile/include/asm/Kbuild
+++ b/arch/tile/include/asm/Kbuild
@@ -11,12 +11,13 @@ generic-y += errno.h
 generic-y += exec.h
 generic-y += fb.h
 generic-y += fcntl.h
+generic-y += hw_irq.h
 generic-y += ioctl.h
 generic-y += ioctls.h
 generic-y += ipcbuf.h
 generic-y += irq_regs.h
-generic-y += kdebug.h
 generic-y += local.h
+generic-y += local64.h
 generic-y += msgbuf.h
 generic-y += mutex.h
 generic-y += param.h
diff --git a/arch/tile/include/asm/atomic.h b/arch/tile/include/asm/atomic.h
index e71387ab20ca..d385eaadece7 100644
--- a/arch/tile/include/asm/atomic.h
+++ b/arch/tile/include/asm/atomic.h
@@ -114,6 +114,32 @@ static inline int atomic_read(const atomic_t *v)
 #define atomic_inc_and_test(v)		(atomic_inc_return(v) == 0)
 
 /**
+ * atomic_xchg - atomically exchange contents of memory with a new value
+ * @v: pointer of type atomic_t
+ * @i: integer value to store in memory
+ *
+ * Atomically sets @v to @i and returns old @v
+ */
+static inline int atomic_xchg(atomic_t *v, int n)
+{
+	return xchg(&v->counter, n);
+}
+
+/**
+ * atomic_cmpxchg - atomically exchange contents of memory if it matches
+ * @v: pointer of type atomic_t
+ * @o: old value that memory should have
+ * @n: new value to write to memory if it matches
+ *
+ * Atomically checks if @v holds @o and replaces it with @n if so.
+ * Returns the old value at @v.
+ */
+static inline int atomic_cmpxchg(atomic_t *v, int o, int n)
+{
+	return cmpxchg(&v->counter, o, n);
+}
+
+/**
  * atomic_add_negative - add and test if negative
  * @v: pointer of type atomic_t
  * @i: integer value to add
@@ -133,6 +159,32 @@ static inline int atomic_read(const atomic_t *v)
 
 #ifndef __ASSEMBLY__
 
+/**
+ * atomic64_xchg - atomically exchange contents of memory with a new value
+ * @v: pointer of type atomic64_t
+ * @i: integer value to store in memory
+ *
+ * Atomically sets @v to @i and returns old @v
+ */
+static inline u64 atomic64_xchg(atomic64_t *v, u64 n)
+{
+	return xchg64(&v->counter, n);
+}
+
+/**
+ * atomic64_cmpxchg - atomically exchange contents of memory if it matches
+ * @v: pointer of type atomic64_t
+ * @o: old value that memory should have
+ * @n: new value to write to memory if it matches
+ *
+ * Atomically checks if @v holds @o and replaces it with @n if so.
+ * Returns the old value at @v.
+ */
+static inline u64 atomic64_cmpxchg(atomic64_t *v, u64 o, u64 n)
+{
+	return cmpxchg64(&v->counter, o, n);
+}
+
 static inline long long atomic64_dec_if_positive(atomic64_t *v)
 {
 	long long c, old, dec;
diff --git a/arch/tile/include/asm/atomic_32.h b/arch/tile/include/asm/atomic_32.h
index e7fb5cfb9597..0d0395b1b152 100644
--- a/arch/tile/include/asm/atomic_32.h
+++ b/arch/tile/include/asm/atomic_32.h
@@ -22,40 +22,6 @@
 
 #ifndef __ASSEMBLY__
 
-/* Tile-specific routines to support <linux/atomic.h>. */
-int _atomic_xchg(atomic_t *v, int n);
-int _atomic_xchg_add(atomic_t *v, int i);
-int _atomic_xchg_add_unless(atomic_t *v, int a, int u);
-int _atomic_cmpxchg(atomic_t *v, int o, int n);
-
-/**
- * atomic_xchg - atomically exchange contents of memory with a new value
- * @v: pointer of type atomic_t
- * @i: integer value to store in memory
- *
- * Atomically sets @v to @i and returns old @v
- */
-static inline int atomic_xchg(atomic_t *v, int n)
-{
-	smp_mb();  /* barrier for proper semantics */
-	return _atomic_xchg(v, n);
-}
-
-/**
- * atomic_cmpxchg - atomically exchange contents of memory if it matches
- * @v: pointer of type atomic_t
- * @o: old value that memory should have
- * @n: new value to write to memory if it matches
- *
- * Atomically checks if @v holds @o and replaces it with @n if so.
- * Returns the old value at @v.
- */
-static inline int atomic_cmpxchg(atomic_t *v, int o, int n)
-{
-	smp_mb();  /* barrier for proper semantics */
-	return _atomic_cmpxchg(v, o, n);
-}
-
 /**
  * atomic_add - add integer to atomic variable
  * @i: integer value to add
@@ -65,7 +31,7 @@ static inline int atomic_cmpxchg(atomic_t *v, int o, int n)
  */
 static inline void atomic_add(int i, atomic_t *v)
 {
-	_atomic_xchg_add(v, i);
+	_atomic_xchg_add(&v->counter, i);
 }
 
 /**
@@ -78,7 +44,7 @@ static inline void atomic_add(int i, atomic_t *v)
 static inline int atomic_add_return(int i, atomic_t *v)
 {
 	smp_mb();  /* barrier for proper semantics */
-	return _atomic_xchg_add(v, i) + i;
+	return _atomic_xchg_add(&v->counter, i) + i;
 }
 
 /**
@@ -93,7 +59,7 @@ static inline int atomic_add_return(int i, atomic_t *v)
 static inline int __atomic_add_unless(atomic_t *v, int a, int u)
 {
 	smp_mb();  /* barrier for proper semantics */
-	return _atomic_xchg_add_unless(v, a, u);
+	return _atomic_xchg_add_unless(&v->counter, a, u);
 }
 
 /**
@@ -108,7 +74,7 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u)
  */
 static inline void atomic_set(atomic_t *v, int n)
 {
-	_atomic_xchg(v, n);
+	_atomic_xchg(&v->counter, n);
 }
 
 /* A 64bit atomic type */
@@ -119,11 +85,6 @@ typedef struct {
 
 #define ATOMIC64_INIT(val) { (val) }
 
-u64 _atomic64_xchg(atomic64_t *v, u64 n);
-u64 _atomic64_xchg_add(atomic64_t *v, u64 i);
-u64 _atomic64_xchg_add_unless(atomic64_t *v, u64 a, u64 u);
-u64 _atomic64_cmpxchg(atomic64_t *v, u64 o, u64 n);
-
 /**
  * atomic64_read - read atomic variable
  * @v: pointer of type atomic64_t
@@ -137,35 +98,7 @@ static inline u64 atomic64_read(const atomic64_t *v)
 	 * Casting away const is safe since the atomic support routines
 	 * do not write to memory if the value has not been modified.
 	 */
-	return _atomic64_xchg_add((atomic64_t *)v, 0);
-}
-
-/**
- * atomic64_xchg - atomically exchange contents of memory with a new value
- * @v: pointer of type atomic64_t
- * @i: integer value to store in memory
- *
- * Atomically sets @v to @i and returns old @v
- */
-static inline u64 atomic64_xchg(atomic64_t *v, u64 n)
-{
-	smp_mb();  /* barrier for proper semantics */
-	return _atomic64_xchg(v, n);
-}
-
-/**
- * atomic64_cmpxchg - atomically exchange contents of memory if it matches
- * @v: pointer of type atomic64_t
- * @o: old value that memory should have
- * @n: new value to write to memory if it matches
- *
- * Atomically checks if @v holds @o and replaces it with @n if so.
- * Returns the old value at @v.
- */
-static inline u64 atomic64_cmpxchg(atomic64_t *v, u64 o, u64 n)
-{
-	smp_mb();  /* barrier for proper semantics */
-	return _atomic64_cmpxchg(v, o, n);
+	return _atomic64_xchg_add((u64 *)&v->counter, 0);
 }
 
 /**
@@ -177,7 +110,7 @@ static inline u64 atomic64_cmpxchg(atomic64_t *v, u64 o, u64 n)
  */
 static inline void atomic64_add(u64 i, atomic64_t *v)
 {
-	_atomic64_xchg_add(v, i);
+	_atomic64_xchg_add(&v->counter, i);
 }
 
 /**
@@ -190,7 +123,7 @@ static inline void atomic64_add(u64 i, atomic64_t *v)
 static inline u64 atomic64_add_return(u64 i, atomic64_t *v)
 {
 	smp_mb();  /* barrier for proper semantics */
-	return _atomic64_xchg_add(v, i) + i;
+	return _atomic64_xchg_add(&v->counter, i) + i;
 }
 
 /**
@@ -205,7 +138,7 @@ static inline u64 atomic64_add_return(u64 i, atomic64_t *v)
 static inline u64 atomic64_add_unless(atomic64_t *v, u64 a, u64 u)
 {
 	smp_mb();  /* barrier for proper semantics */
-	return _atomic64_xchg_add_unless(v, a, u) != u;
+	return _atomic64_xchg_add_unless(&v->counter, a, u) != u;
 }
 
 /**
@@ -220,7 +153,7 @@ static inline u64 atomic64_add_unless(atomic64_t *v, u64 a, u64 u)
  */
 static inline void atomic64_set(atomic64_t *v, u64 n)
 {
-	_atomic64_xchg(v, n);
+	_atomic64_xchg(&v->counter, n);
 }
 
 #define atomic64_add_negative(a, v)	(atomic64_add_return((a), (v)) < 0)
@@ -252,21 +185,6 @@ static inline void atomic64_set(atomic64_t *v, u64 n)
  * Internal definitions only beyond this point.
  */
 
-#define ATOMIC_LOCKS_FOUND_VIA_TABLE() \
-  (!CHIP_HAS_CBOX_HOME_MAP() && defined(CONFIG_SMP))
-
-#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
-
-/* Number of entries in atomic_lock_ptr[]. */
-#define ATOMIC_HASH_L1_SHIFT 6
-#define ATOMIC_HASH_L1_SIZE (1 << ATOMIC_HASH_L1_SHIFT)
-
-/* Number of locks in each struct pointed to by atomic_lock_ptr[]. */
-#define ATOMIC_HASH_L2_SHIFT (CHIP_L2_LOG_LINE_SIZE() - 2)
-#define ATOMIC_HASH_L2_SIZE (1 << ATOMIC_HASH_L2_SHIFT)
-
-#else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
-
 /*
  * Number of atomic locks in atomic_locks[]. Must be a power of two.
  * There is no reason for more than PAGE_SIZE / 8 entries, since that
@@ -281,8 +199,6 @@ static inline void atomic64_set(atomic64_t *v, u64 n)
 extern int atomic_locks[];
 #endif
 
-#endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
-
 /*
  * All the code that may fault while holding an atomic lock must
  * place the pointer to the lock in ATOMIC_LOCK_REG so the fault code
diff --git a/arch/tile/include/asm/atomic_64.h b/arch/tile/include/asm/atomic_64.h
index f4500c688ffa..ad220eed05fc 100644
--- a/arch/tile/include/asm/atomic_64.h
+++ b/arch/tile/include/asm/atomic_64.h
@@ -32,25 +32,6 @@
  * on any routine which updates memory and returns a value.
  */
 
-static inline int atomic_cmpxchg(atomic_t *v, int o, int n)
-{
-	int val;
-	__insn_mtspr(SPR_CMPEXCH_VALUE, o);
-	smp_mb();  /* barrier for proper semantics */
-	val = __insn_cmpexch4((void *)&v->counter, n);
-	smp_mb();  /* barrier for proper semantics */
-	return val;
-}
-
-static inline int atomic_xchg(atomic_t *v, int n)
-{
-	int val;
-	smp_mb();  /* barrier for proper semantics */
-	val = __insn_exch4((void *)&v->counter, n);
-	smp_mb();  /* barrier for proper semantics */
-	return val;
-}
-
 static inline void atomic_add(int i, atomic_t *v)
 {
 	__insn_fetchadd4((void *)&v->counter, i);
@@ -72,7 +53,7 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u)
 		if (oldval == u)
 			break;
 		guess = oldval;
-		oldval = atomic_cmpxchg(v, guess, guess + a);
+		oldval = cmpxchg(&v->counter, guess, guess + a);
 	} while (guess != oldval);
 	return oldval;
 }
@@ -84,25 +65,6 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u)
 #define atomic64_read(v)		((v)->counter)
 #define atomic64_set(v, i) ((v)->counter = (i))
 
-static inline long atomic64_cmpxchg(atomic64_t *v, long o, long n)
-{
-	long val;
-	smp_mb();  /* barrier for proper semantics */
-	__insn_mtspr(SPR_CMPEXCH_VALUE, o);
-	val = __insn_cmpexch((void *)&v->counter, n);
-	smp_mb();  /* barrier for proper semantics */
-	return val;
-}
-
-static inline long atomic64_xchg(atomic64_t *v, long n)
-{
-	long val;
-	smp_mb();  /* barrier for proper semantics */
-	val = __insn_exch((void *)&v->counter, n);
-	smp_mb();  /* barrier for proper semantics */
-	return val;
-}
-
 static inline void atomic64_add(long i, atomic64_t *v)
 {
 	__insn_fetchadd((void *)&v->counter, i);
@@ -124,7 +86,7 @@ static inline long atomic64_add_unless(atomic64_t *v, long a, long u)
 		if (oldval == u)
 			break;
 		guess = oldval;
-		oldval = atomic64_cmpxchg(v, guess, guess + a);
+		oldval = cmpxchg(&v->counter, guess, guess + a);
 	} while (guess != oldval);
 	return oldval != u;
 }
diff --git a/arch/tile/include/asm/barrier.h b/arch/tile/include/asm/barrier.h
index 990a217a0b72..a9a73da5865d 100644
--- a/arch/tile/include/asm/barrier.h
+++ b/arch/tile/include/asm/barrier.h
@@ -77,7 +77,6 @@
 
 #define __sync()	__insn_mf()
 
-#if !CHIP_HAS_MF_WAITS_FOR_VICTIMS()
 #include <hv/syscall_public.h>
 /*
  * Issue an uncacheable load to each memory controller, then
@@ -96,7 +95,6 @@ static inline void __mb_incoherent(void)
 		       "r20", "r21", "r22", "r23", "r24",
 		       "r25", "r26", "r27", "r28", "r29");
 }
-#endif
 
 /* Fence to guarantee visibility of stores to incoherent memory. */
 static inline void
@@ -104,7 +102,6 @@ mb_incoherent(void)
 {
 	__insn_mf();
 
-#if !CHIP_HAS_MF_WAITS_FOR_VICTIMS()
 	{
 #if CHIP_HAS_TILE_WRITE_PENDING()
 		const unsigned long WRITE_TIMEOUT_CYCLES = 400;
@@ -116,7 +113,6 @@ mb_incoherent(void)
 #endif /* CHIP_HAS_TILE_WRITE_PENDING() */
 		(void) __mb_incoherent();
 	}
-#endif /* CHIP_HAS_MF_WAITS_FOR_VICTIMS() */
 }
 
 #define fast_wmb()	__sync()
diff --git a/arch/tile/include/asm/bitops.h b/arch/tile/include/asm/bitops.h
index bd186c4eaa50..d5a206865036 100644
--- a/arch/tile/include/asm/bitops.h
+++ b/arch/tile/include/asm/bitops.h
@@ -29,17 +29,6 @@
 #endif
 
 /**
- * __ffs - find first set bit in word
- * @word: The word to search
- *
- * Undefined if no set bit exists, so code should check against 0 first.
- */
-static inline unsigned long __ffs(unsigned long word)
-{
-	return __builtin_ctzl(word);
-}
-
-/**
  * ffz - find first zero bit in word
  * @word: The word to search
  *
@@ -50,33 +39,6 @@ static inline unsigned long ffz(unsigned long word)
 	return __builtin_ctzl(~word);
 }
 
-/**
- * __fls - find last set bit in word
- * @word: The word to search
- *
- * Undefined if no set bit exists, so code should check against 0 first.
- */
-static inline unsigned long __fls(unsigned long word)
-{
-	return (sizeof(word) * 8) - 1 - __builtin_clzl(word);
-}
-
-/**
- * ffs - find first set bit in word
- * @x: the word to search
- *
- * This is defined the same way as the libc and compiler builtin ffs
- * routines, therefore differs in spirit from the other bitops.
- *
- * ffs(value) returns 0 if value is 0 or the position of the first
- * set bit if value is nonzero. The first (least significant) bit
- * is at position 1.
- */
-static inline int ffs(int x)
-{
-	return __builtin_ffs(x);
-}
-
 static inline int fls64(__u64 w)
 {
 	return (sizeof(__u64) * 8) - __builtin_clzll(w);
@@ -118,6 +80,9 @@ static inline unsigned long __arch_hweight64(__u64 w)
 	return __builtin_popcountll(w);
 }
 
+#include <asm-generic/bitops/builtin-__ffs.h>
+#include <asm-generic/bitops/builtin-__fls.h>
+#include <asm-generic/bitops/builtin-ffs.h>
 #include <asm-generic/bitops/const_hweight.h>
 #include <asm-generic/bitops/lock.h>
 #include <asm-generic/bitops/find.h>
diff --git a/arch/tile/include/asm/bitops_32.h b/arch/tile/include/asm/bitops_32.h
index ddc4c1efde43..386865ad2f55 100644
--- a/arch/tile/include/asm/bitops_32.h
+++ b/arch/tile/include/asm/bitops_32.h
@@ -16,7 +16,7 @@
 #define _ASM_TILE_BITOPS_32_H
 
 #include <linux/compiler.h>
-#include <linux/atomic.h>
+#include <asm/barrier.h>
 
 /* Tile-specific routines to support <asm/bitops.h>. */
 unsigned long _atomic_or(volatile unsigned long *p, unsigned long mask);
diff --git a/arch/tile/include/asm/bitops_64.h b/arch/tile/include/asm/bitops_64.h
index 60b87ee54fb8..ad34cd056085 100644
--- a/arch/tile/include/asm/bitops_64.h
+++ b/arch/tile/include/asm/bitops_64.h
@@ -16,7 +16,7 @@
 #define _ASM_TILE_BITOPS_64_H
 
 #include <linux/compiler.h>
-#include <linux/atomic.h>
+#include <asm/cmpxchg.h>
 
 /* See <asm/bitops.h> for API comments. */
 
@@ -44,8 +44,7 @@ static inline void change_bit(unsigned nr, volatile unsigned long *addr)
 	oldval = *addr;
 	do {
 		guess = oldval;
-		oldval = atomic64_cmpxchg((atomic64_t *)addr,
-					  guess, guess ^ mask);
+		oldval = cmpxchg(addr, guess, guess ^ mask);
 	} while (guess != oldval);
 }
 
@@ -90,8 +89,7 @@ static inline int test_and_change_bit(unsigned nr,
 	oldval = *addr;
 	do {
 		guess = oldval;
-		oldval = atomic64_cmpxchg((atomic64_t *)addr,
-					  guess, guess ^ mask);
+		oldval = cmpxchg(addr, guess, guess ^ mask);
 	} while (guess != oldval);
 	return (oldval & mask) != 0;
 }
diff --git a/arch/tile/include/asm/cache.h b/arch/tile/include/asm/cache.h
index a9a529964e07..6160761d5f61 100644
--- a/arch/tile/include/asm/cache.h
+++ b/arch/tile/include/asm/cache.h
@@ -49,9 +49,16 @@
 #define __read_mostly __attribute__((__section__(".data..read_mostly")))
 
 /*
- * Attribute for data that is kept read/write coherent until the end of
- * initialization, then bumped to read/only incoherent for performance.
+ * Originally we used small TLB pages for kernel data and grouped some
+ * things together as "write once", enforcing the property at the end
+ * of initialization by making those pages read-only and non-coherent.
+ * This allowed better cache utilization since cache inclusion did not
+ * need to be maintained.  However, to do this requires an extra TLB
+ * entry, which on balance is more of a performance hit than the
+ * non-coherence is a performance gain, so we now just make "read
+ * mostly" and "write once" be synonyms.  We keep the attribute
+ * separate in case we change our minds at a future date.
  */
-#define __write_once __attribute__((__section__(".w1data")))
+#define __write_once __read_mostly
 
 #endif /* _ASM_TILE_CACHE_H */
diff --git a/arch/tile/include/asm/cacheflush.h b/arch/tile/include/asm/cacheflush.h
index 0fc63c488edf..92ee4c8a4f76 100644
--- a/arch/tile/include/asm/cacheflush.h
+++ b/arch/tile/include/asm/cacheflush.h
@@ -75,23 +75,6 @@ static inline void copy_to_user_page(struct vm_area_struct *vma,
 #define copy_from_user_page(vma, page, vaddr, dst, src, len) \
 	memcpy((dst), (src), (len))
 
-/*
- * Invalidate a VA range; pads to L2 cacheline boundaries.
- *
- * Note that on TILE64, __inv_buffer() actually flushes modified
- * cache lines in addition to invalidating them, i.e., it's the
- * same as __finv_buffer().
- */
-static inline void __inv_buffer(void *buffer, size_t size)
-{
-	char *next = (char *)((long)buffer & -L2_CACHE_BYTES);
-	char *finish = (char *)L2_CACHE_ALIGN((long)buffer + size);
-	while (next < finish) {
-		__insn_inv(next);
-		next += CHIP_INV_STRIDE();
-	}
-}
-
 /* Flush a VA range; pads to L2 cacheline boundaries. */
 static inline void __flush_buffer(void *buffer, size_t size)
 {
@@ -115,13 +98,6 @@ static inline void __finv_buffer(void *buffer, size_t size)
 }
 
 
-/* Invalidate a VA range and wait for it to be complete. */
-static inline void inv_buffer(void *buffer, size_t size)
-{
-	__inv_buffer(buffer, size);
-	mb();
-}
-
 /*
  * Flush a locally-homecached VA range and wait for the evicted
  * cachelines to hit memory.
@@ -142,6 +118,26 @@ static inline void finv_buffer_local(void *buffer, size_t size)
 	mb_incoherent();
 }
 
+#ifdef __tilepro__
+/* Invalidate a VA range; pads to L2 cacheline boundaries. */
+static inline void __inv_buffer(void *buffer, size_t size)
+{
+	char *next = (char *)((long)buffer & -L2_CACHE_BYTES);
+	char *finish = (char *)L2_CACHE_ALIGN((long)buffer + size);
+	while (next < finish) {
+		__insn_inv(next);
+		next += CHIP_INV_STRIDE();
+	}
+}
+
+/* Invalidate a VA range and wait for it to be complete. */
+static inline void inv_buffer(void *buffer, size_t size)
+{
+	__inv_buffer(buffer, size);
+	mb();
+}
+#endif
+
 /*
  * Flush and invalidate a VA range that is homed remotely, waiting
  * until the memory controller holds the flushed values.  If "hfh" is
diff --git a/arch/tile/include/asm/cmpxchg.h b/arch/tile/include/asm/cmpxchg.h
index 276f067e3640..4001d5eab4bb 100644
--- a/arch/tile/include/asm/cmpxchg.h
+++ b/arch/tile/include/asm/cmpxchg.h
@@ -20,53 +20,108 @@
 
 #ifndef __ASSEMBLY__
 
-/* Nonexistent functions intended to cause link errors. */
-extern unsigned long __xchg_called_with_bad_pointer(void);
-extern unsigned long __cmpxchg_called_with_bad_pointer(void);
+#include <asm/barrier.h>
 
-#define xchg(ptr, x)							\
+/* Nonexistent functions intended to cause compile errors. */
+extern void __xchg_called_with_bad_pointer(void)
+	__compiletime_error("Bad argument size for xchg");
+extern void __cmpxchg_called_with_bad_pointer(void)
+	__compiletime_error("Bad argument size for cmpxchg");
+
+#ifndef __tilegx__
+
+/* Note the _atomic_xxx() routines include a final mb(). */
+int _atomic_xchg(int *ptr, int n);
+int _atomic_xchg_add(int *v, int i);
+int _atomic_xchg_add_unless(int *v, int a, int u);
+int _atomic_cmpxchg(int *ptr, int o, int n);
+u64 _atomic64_xchg(u64 *v, u64 n);
+u64 _atomic64_xchg_add(u64 *v, u64 i);
+u64 _atomic64_xchg_add_unless(u64 *v, u64 a, u64 u);
+u64 _atomic64_cmpxchg(u64 *v, u64 o, u64 n);
+
+#define xchg(ptr, n)							\
+	({								\
+		if (sizeof(*(ptr)) != 4)				\
+			__xchg_called_with_bad_pointer();		\
+		smp_mb();						\
+		(typeof(*(ptr)))_atomic_xchg((int *)(ptr), (int)(n));	\
+	})
+
+#define cmpxchg(ptr, o, n)						\
+	({								\
+		if (sizeof(*(ptr)) != 4)				\
+			__cmpxchg_called_with_bad_pointer();		\
+		smp_mb();						\
+		(typeof(*(ptr)))_atomic_cmpxchg((int *)ptr, (int)o, (int)n); \
+	})
+
+#define xchg64(ptr, n)							\
+	({								\
+		if (sizeof(*(ptr)) != 8)				\
+			__xchg_called_with_bad_pointer();		\
+		smp_mb();						\
+		(typeof(*(ptr)))_atomic64_xchg((u64 *)(ptr), (u64)(n));	\
+	})
+
+#define cmpxchg64(ptr, o, n)						\
+	({								\
+		if (sizeof(*(ptr)) != 8)				\
+			__cmpxchg_called_with_bad_pointer();		\
+		smp_mb();						\
+		(typeof(*(ptr)))_atomic64_cmpxchg((u64 *)ptr, (u64)o, (u64)n); \
+	})
+
+#else
+
+#define xchg(ptr, n)							\
 	({								\
 		typeof(*(ptr)) __x;					\
+		smp_mb();						\
 		switch (sizeof(*(ptr))) {				\
 		case 4:							\
-			__x = (typeof(__x))(typeof(__x-__x))atomic_xchg( \
-				(atomic_t *)(ptr),			\
-				(u32)(typeof((x)-(x)))(x));		\
+			__x = (typeof(__x))(unsigned long)		\
+				__insn_exch4((ptr), (u32)(unsigned long)(n)); \
 			break;						\
 		case 8:							\
-			__x = (typeof(__x))(typeof(__x-__x))atomic64_xchg( \
-				(atomic64_t *)(ptr),			\
-				(u64)(typeof((x)-(x)))(x));		\
+			__x = (typeof(__x))			\
+				__insn_exch((ptr), (unsigned long)(n));	\
 			break;						\
 		default:						\
 			__xchg_called_with_bad_pointer();		\
+			break;						\
 		}							\
+		smp_mb();						\
 		__x;							\
 	})
 
 #define cmpxchg(ptr, o, n)						\
 	({								\
 		typeof(*(ptr)) __x;					\
+		__insn_mtspr(SPR_CMPEXCH_VALUE, (unsigned long)(o));	\
+		smp_mb();						\
 		switch (sizeof(*(ptr))) {				\
 		case 4:							\
-			__x = (typeof(__x))(typeof(__x-__x))atomic_cmpxchg( \
-				(atomic_t *)(ptr),			\
-				(u32)(typeof((o)-(o)))(o),		\
-				(u32)(typeof((n)-(n)))(n));		\
+			__x = (typeof(__x))(unsigned long)		\
+				__insn_cmpexch4((ptr), (u32)(unsigned long)(n)); \
 			break;						\
 		case 8:							\
-			__x = (typeof(__x))(typeof(__x-__x))atomic64_cmpxchg( \
-				(atomic64_t *)(ptr),			\
-				(u64)(typeof((o)-(o)))(o),		\
-				(u64)(typeof((n)-(n)))(n));		\
+			__x = (typeof(__x))__insn_cmpexch((ptr), (u64)(n)); \
 			break;						\
 		default:						\
 			__cmpxchg_called_with_bad_pointer();		\
+			break;						\
 		}							\
+		smp_mb();						\
 		__x;							\
 	})
 
-#define tas(ptr) (xchg((ptr), 1))
+#define xchg64 xchg
+#define cmpxchg64 cmpxchg
+
+#endif
+
+#define tas(ptr) xchg((ptr), 1)
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/arch/tile/include/asm/device.h b/arch/tile/include/asm/device.h
index 5182705bd056..6ab8bf146d4c 100644
--- a/arch/tile/include/asm/device.h
+++ b/arch/tile/include/asm/device.h
@@ -23,7 +23,10 @@ struct dev_archdata {
 	/* Offset of the DMA address from the PA. */
 	dma_addr_t		dma_offset;
 
-	/* Highest DMA address that can be generated by this device. */
+	/*
+	 * Highest DMA address that can be generated by devices that
+	 * have limited DMA capability, i.e. non 64-bit capable.
+	 */
 	dma_addr_t		max_direct_dma_addr;
 };
 
diff --git a/arch/tile/include/asm/dma-mapping.h b/arch/tile/include/asm/dma-mapping.h
index f2ff191376b4..1eae359d8315 100644
--- a/arch/tile/include/asm/dma-mapping.h
+++ b/arch/tile/include/asm/dma-mapping.h
@@ -20,9 +20,14 @@
 #include <linux/cache.h>
 #include <linux/io.h>
 
+#ifdef __tilegx__
+#define ARCH_HAS_DMA_GET_REQUIRED_MASK
+#endif
+
 extern struct dma_map_ops *tile_dma_map_ops;
 extern struct dma_map_ops *gx_pci_dma_map_ops;
 extern struct dma_map_ops *gx_legacy_pci_dma_map_ops;
+extern struct dma_map_ops *gx_hybrid_pci_dma_map_ops;
 
 static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 {
@@ -44,12 +49,12 @@ static inline void set_dma_offset(struct device *dev, dma_addr_t off)
 
 static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
 {
-	return paddr + get_dma_offset(dev);
+	return paddr;
 }
 
 static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
 {
-	return daddr - get_dma_offset(dev);
+	return daddr;
 }
 
 static inline void dma_mark_clean(void *addr, size_t size) {}
@@ -87,11 +92,19 @@ dma_set_mask(struct device *dev, u64 mask)
 {
 	struct dma_map_ops *dma_ops = get_dma_ops(dev);
 
-	/* Handle legacy PCI devices with limited memory addressability. */
-	if ((dma_ops == gx_pci_dma_map_ops) && (mask <= DMA_BIT_MASK(32))) {
-		set_dma_ops(dev, gx_legacy_pci_dma_map_ops);
-		set_dma_offset(dev, 0);
-		if (mask > dev->archdata.max_direct_dma_addr)
+	/*
+	 * For PCI devices with 64-bit DMA addressing capability, promote
+	 * the dma_ops to hybrid, with the consistent memory DMA space limited
+	 * to 32-bit. For 32-bit capable devices, limit the streaming DMA
+	 * address range to max_direct_dma_addr.
+	 */
+	if (dma_ops == gx_pci_dma_map_ops ||
+	    dma_ops == gx_hybrid_pci_dma_map_ops ||
+	    dma_ops == gx_legacy_pci_dma_map_ops) {
+		if (mask == DMA_BIT_MASK(64) &&
+		    dma_ops == gx_legacy_pci_dma_map_ops)
+			set_dma_ops(dev, gx_hybrid_pci_dma_map_ops);
+		else if (mask > dev->archdata.max_direct_dma_addr)
 			mask = dev->archdata.max_direct_dma_addr;
 	}
 
diff --git a/arch/tile/include/asm/elf.h b/arch/tile/include/asm/elf.h
index ff8a93408823..41d9878a9686 100644
--- a/arch/tile/include/asm/elf.h
+++ b/arch/tile/include/asm/elf.h
@@ -30,7 +30,6 @@ typedef unsigned long elf_greg_t;
 #define ELF_NGREG (sizeof(struct pt_regs) / sizeof(elf_greg_t))
 typedef elf_greg_t elf_gregset_t[ELF_NGREG];
 
-#define EM_TILE64  187
 #define EM_TILEPRO 188
 #define EM_TILEGX  191
 
@@ -132,6 +131,15 @@ extern int dump_task_regs(struct task_struct *, elf_gregset_t *);
 struct linux_binprm;
 extern int arch_setup_additional_pages(struct linux_binprm *bprm,
 				       int executable_stack);
+#define ARCH_DLINFO \
+do { \
+	NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_BASE); \
+} while (0)
+
+struct mm_struct;
+extern unsigned long arch_randomize_brk(struct mm_struct *mm);
+#define arch_randomize_brk arch_randomize_brk
+
 #ifdef CONFIG_COMPAT
 
 #define COMPAT_ELF_PLATFORM "tilegx-m32"
diff --git a/arch/tile/include/asm/fixmap.h b/arch/tile/include/asm/fixmap.h
index e16dbf929cb5..c6b9c1b38fd1 100644
--- a/arch/tile/include/asm/fixmap.h
+++ b/arch/tile/include/asm/fixmap.h
@@ -78,14 +78,6 @@ enum fixed_addresses {
 #endif
 };
 
-extern void __set_fixmap(enum fixed_addresses idx,
-			 unsigned long phys, pgprot_t flags);
-
-#define set_fixmap(idx, phys) \
-		__set_fixmap(idx, phys, PAGE_KERNEL)
-#define clear_fixmap(idx) \
-		__set_fixmap(idx, 0, __pgprot(0))
-
 #define __FIXADDR_SIZE	(__end_of_permanent_fixed_addresses << PAGE_SHIFT)
 #define __FIXADDR_BOOT_SIZE	(__end_of_fixed_addresses << PAGE_SHIFT)
 #define FIXADDR_START		(FIXADDR_TOP + PAGE_SIZE - __FIXADDR_SIZE)
diff --git a/arch/tile/include/asm/ftrace.h b/arch/tile/include/asm/ftrace.h
index 461459b06d98..13a9bb81a8ab 100644
--- a/arch/tile/include/asm/ftrace.h
+++ b/arch/tile/include/asm/ftrace.h
@@ -15,6 +15,26 @@
 #ifndef _ASM_TILE_FTRACE_H
 #define _ASM_TILE_FTRACE_H
 
-/* empty */
+#ifdef CONFIG_FUNCTION_TRACER
+
+#define MCOUNT_ADDR ((unsigned long)(__mcount))
+#define MCOUNT_INSN_SIZE 8		/* sizeof mcount call */
+
+#ifndef __ASSEMBLY__
+extern void __mcount(void);
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+static inline unsigned long ftrace_call_adjust(unsigned long addr)
+{
+	return addr;
+}
+
+struct dyn_arch_ftrace {
+};
+#endif /*  CONFIG_DYNAMIC_FTRACE */
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* CONFIG_FUNCTION_TRACER */
 
 #endif /* _ASM_TILE_FTRACE_H */
diff --git a/arch/tile/include/asm/futex.h b/arch/tile/include/asm/futex.h
index 5909ac3d7218..1a6ef1b69cb1 100644
--- a/arch/tile/include/asm/futex.h
+++ b/arch/tile/include/asm/futex.h
@@ -43,6 +43,7 @@
 	    ".pushsection .fixup,\"ax\"\n"			\
 	    "0: { movei %0, %5; j 9f }\n"			\
 	    ".section __ex_table,\"a\"\n"			\
+	    ".align 8\n"					\
 	    ".quad 1b, 0b\n"					\
 	    ".popsection\n"					\
 	    "9:"						\
diff --git a/arch/tile/include/asm/homecache.h b/arch/tile/include/asm/homecache.h
index 7b7771328642..7ddd1b8d6910 100644
--- a/arch/tile/include/asm/homecache.h
+++ b/arch/tile/include/asm/homecache.h
@@ -33,8 +33,7 @@ struct zone;
 
 /*
  * Is this page immutable (unwritable) and thus able to be cached more
- * widely than would otherwise be possible?  On tile64 this means we
- * mark the PTE to cache locally; on tilepro it means we have "nc" set.
+ * widely than would otherwise be possible?  This means we have "nc" set.
  */
 #define PAGE_HOME_IMMUTABLE -2
 
@@ -44,16 +43,8 @@ struct zone;
  */
 #define PAGE_HOME_INCOHERENT -3
 
-#if CHIP_HAS_CBOX_HOME_MAP()
 /* Home for the page is distributed via hash-for-home. */
 #define PAGE_HOME_HASH -4
-#endif
-
-/* Homing is unknown or unspecified.  Not valid for page_home(). */
-#define PAGE_HOME_UNKNOWN -5
-
-/* Home on the current cpu.  Not valid for page_home(). */
-#define PAGE_HOME_HERE -6
 
 /* Support wrapper to use instead of explicit hv_flush_remote(). */
 extern void flush_remote(unsigned long cache_pfn, unsigned long cache_length,
diff --git a/arch/tile/include/asm/io.h b/arch/tile/include/asm/io.h
index 31672918064c..9fe434969fab 100644
--- a/arch/tile/include/asm/io.h
+++ b/arch/tile/include/asm/io.h
@@ -19,7 +19,8 @@
 #include <linux/bug.h>
 #include <asm/page.h>
 
-#define IO_SPACE_LIMIT 0xfffffffful
+/* Maximum PCI I/O space address supported. */
+#define IO_SPACE_LIMIT 0xffffffff
 
 /*
  * Convert a physical pointer to a virtual kernel pointer for /dev/mem
@@ -254,7 +255,7 @@ static inline void writeq(u64 val, unsigned long addr)
 
 static inline void memset_io(volatile void *dst, int val, size_t len)
 {
-	int x;
+	size_t x;
 	BUG_ON((unsigned long)dst & 0x3);
 	val = (val & 0xff) * 0x01010101;
 	for (x = 0; x < len; x += 4)
@@ -264,7 +265,7 @@ static inline void memset_io(volatile void *dst, int val, size_t len)
 static inline void memcpy_fromio(void *dst, const volatile void __iomem *src,
 				 size_t len)
 {
-	int x;
+	size_t x;
 	BUG_ON((unsigned long)src & 0x3);
 	for (x = 0; x < len; x += 4)
 		*(u32 *)(dst + x) = readl(src + x);
@@ -273,7 +274,7 @@ static inline void memcpy_fromio(void *dst, const volatile void __iomem *src,
 static inline void memcpy_toio(volatile void __iomem *dst, const void *src,
 				size_t len)
 {
-	int x;
+	size_t x;
 	BUG_ON((unsigned long)dst & 0x3);
 	for (x = 0; x < len; x += 4)
 		writel(*(u32 *)(src + x), dst + x);
@@ -281,8 +282,108 @@ static inline void memcpy_toio(volatile void __iomem *dst, const void *src,
 
 #endif
 
+#if CHIP_HAS_MMIO() && defined(CONFIG_TILE_PCI_IO)
+
+static inline u8 inb(unsigned long addr)
+{
+	return readb((volatile void __iomem *) addr);
+}
+
+static inline u16 inw(unsigned long addr)
+{
+	return readw((volatile void __iomem *) addr);
+}
+
+static inline u32 inl(unsigned long addr)
+{
+	return readl((volatile void __iomem *) addr);
+}
+
+static inline void outb(u8 b, unsigned long addr)
+{
+	writeb(b, (volatile void __iomem *) addr);
+}
+
+static inline void outw(u16 b, unsigned long addr)
+{
+	writew(b, (volatile void __iomem *) addr);
+}
+
+static inline void outl(u32 b, unsigned long addr)
+{
+	writel(b, (volatile void __iomem *) addr);
+}
+
+static inline void insb(unsigned long addr, void *buffer, int count)
+{
+	if (count) {
+		u8 *buf = buffer;
+		do {
+			u8 x = inb(addr);
+			*buf++ = x;
+		} while (--count);
+	}
+}
+
+static inline void insw(unsigned long addr, void *buffer, int count)
+{
+	if (count) {
+		u16 *buf = buffer;
+		do {
+			u16 x = inw(addr);
+			*buf++ = x;
+		} while (--count);
+	}
+}
+
+static inline void insl(unsigned long addr, void *buffer, int count)
+{
+	if (count) {
+		u32 *buf = buffer;
+		do {
+			u32 x = inl(addr);
+			*buf++ = x;
+		} while (--count);
+	}
+}
+
+static inline void outsb(unsigned long addr, const void *buffer, int count)
+{
+	if (count) {
+		const u8 *buf = buffer;
+		do {
+			outb(*buf++, addr);
+		} while (--count);
+	}
+}
+
+static inline void outsw(unsigned long addr, const void *buffer, int count)
+{
+	if (count) {
+		const u16 *buf = buffer;
+		do {
+			outw(*buf++, addr);
+		} while (--count);
+	}
+}
+
+static inline void outsl(unsigned long addr, const void *buffer, int count)
+{
+	if (count) {
+		const u32 *buf = buffer;
+		do {
+			outl(*buf++, addr);
+		} while (--count);
+	}
+}
+
+extern void __iomem *ioport_map(unsigned long port, unsigned int len);
+extern void ioport_unmap(void __iomem *addr);
+
+#else
+
 /*
- * The Tile architecture does not support IOPORT, even with PCI.
+ * The TilePro architecture does not support IOPORT, even with PCI.
  * Unfortunately we can't yet simply not declare these methods,
  * since some generic code that compiles into the kernel, but
  * we never run, uses them unconditionally.
@@ -290,7 +391,12 @@ static inline void memcpy_toio(volatile void __iomem *dst, const void *src,
 
 static inline long ioport_panic(void)
 {
+#ifdef __tilegx__
+	panic("PCI IO space support is disabled. Configure the kernel with"
+	      " CONFIG_TILE_PCI_IO to enable it");
+#else
 	panic("inb/outb and friends do not exist on tile");
+#endif
 	return 0;
 }
 
@@ -335,13 +441,6 @@ static inline void outl(u32 b, unsigned long addr)
 	ioport_panic();
 }
 
-#define inb_p(addr)	inb(addr)
-#define inw_p(addr)	inw(addr)
-#define inl_p(addr)	inl(addr)
-#define outb_p(x, addr)	outb((x), (addr))
-#define outw_p(x, addr)	outw((x), (addr))
-#define outl_p(x, addr)	outl((x), (addr))
-
 static inline void insb(unsigned long addr, void *buffer, int count)
 {
 	ioport_panic();
@@ -372,6 +471,15 @@ static inline void outsl(unsigned long addr, const void *buffer, int count)
 	ioport_panic();
 }
 
+#endif /* CHIP_HAS_MMIO() && defined(CONFIG_TILE_PCI_IO) */
+
+#define inb_p(addr)	inb(addr)
+#define inw_p(addr)	inw(addr)
+#define inl_p(addr)	inl(addr)
+#define outb_p(x, addr)	outb((x), (addr))
+#define outw_p(x, addr)	outw((x), (addr))
+#define outl_p(x, addr)	outl((x), (addr))
+
 #define ioread16be(addr)	be16_to_cpu(ioread16(addr))
 #define ioread32be(addr)	be32_to_cpu(ioread32(addr))
 #define iowrite16be(v, addr)	iowrite16(be16_to_cpu(v), (addr))
diff --git a/arch/tile/include/asm/irqflags.h b/arch/tile/include/asm/irqflags.h
index c96f9bbb760d..71af5747874d 100644
--- a/arch/tile/include/asm/irqflags.h
+++ b/arch/tile/include/asm/irqflags.h
@@ -124,6 +124,12 @@
 DECLARE_PER_CPU(unsigned long long, interrupts_enabled_mask);
 #define INITIAL_INTERRUPTS_ENABLED (1ULL << INT_MEM_ERROR)
 
+#ifdef CONFIG_DEBUG_PREEMPT
+/* Due to inclusion issues, we can't rely on <linux/smp.h> here. */
+extern unsigned int debug_smp_processor_id(void);
+# define smp_processor_id() debug_smp_processor_id()
+#endif
+
 /* Disable interrupts. */
 #define arch_local_irq_disable() \
 	interrupt_mask_set_mask(LINUX_MASKABLE_INTERRUPTS)
@@ -132,9 +138,18 @@ DECLARE_PER_CPU(unsigned long long, interrupts_enabled_mask);
 #define arch_local_irq_disable_all() \
 	interrupt_mask_set_mask(-1ULL)
 
+/*
+ * Read the set of maskable interrupts.
+ * We avoid the preemption warning here via __this_cpu_ptr since even
+ * if irqs are already enabled, it's harmless to read the wrong cpu's
+ * enabled mask.
+ */
+#define arch_local_irqs_enabled() \
+	(*__this_cpu_ptr(&interrupts_enabled_mask))
+
 /* Re-enable all maskable interrupts. */
 #define arch_local_irq_enable() \
-	interrupt_mask_reset_mask(__get_cpu_var(interrupts_enabled_mask))
+	interrupt_mask_reset_mask(arch_local_irqs_enabled())
 
 /* Disable or enable interrupts based on flag argument. */
 #define arch_local_irq_restore(disabled) do { \
@@ -161,7 +176,7 @@ DECLARE_PER_CPU(unsigned long long, interrupts_enabled_mask);
 
 /* Prevent the given interrupt from being enabled next time we enable irqs. */
 #define arch_local_irq_mask(interrupt) \
-	(__get_cpu_var(interrupts_enabled_mask) &= ~(1ULL << (interrupt)))
+	this_cpu_and(interrupts_enabled_mask, ~(1ULL << (interrupt)))
 
 /* Prevent the given interrupt from being enabled immediately. */
 #define arch_local_irq_mask_now(interrupt) do { \
@@ -171,7 +186,7 @@ DECLARE_PER_CPU(unsigned long long, interrupts_enabled_mask);
 
 /* Allow the given interrupt to be enabled next time we enable irqs. */
 #define arch_local_irq_unmask(interrupt) \
-	(__get_cpu_var(interrupts_enabled_mask) |= (1ULL << (interrupt)))
+	this_cpu_or(interrupts_enabled_mask, (1ULL << (interrupt)))
 
 /* Allow the given interrupt to be enabled immediately, if !irqs_disabled. */
 #define arch_local_irq_unmask_now(interrupt) do { \
diff --git a/arch/tile/include/asm/hw_irq.h b/arch/tile/include/asm/kdebug.h
index 4fac5fbf333e..5bbbfa904c2d 100644
--- a/arch/tile/include/asm/hw_irq.h
+++ b/arch/tile/include/asm/kdebug.h
@@ -1,5 +1,5 @@
 /*
- * Copyright 2010 Tilera Corporation. All Rights Reserved.
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
  *
  *   This program is free software; you can redistribute it and/or
  *   modify it under the terms of the GNU General Public License
@@ -12,7 +12,17 @@
  *   more details.
  */
 
-#ifndef _ASM_TILE_HW_IRQ_H
-#define _ASM_TILE_HW_IRQ_H
+#ifndef _ASM_TILE_KDEBUG_H
+#define _ASM_TILE_KDEBUG_H
 
-#endif /* _ASM_TILE_HW_IRQ_H */
+#include <linux/notifier.h>
+
+enum die_val {
+	DIE_OOPS = 1,
+	DIE_BREAK,
+	DIE_SSTEPBP,
+	DIE_PAGE_FAULT,
+	DIE_COMPILED_BPT
+};
+
+#endif /* _ASM_TILE_KDEBUG_H */
diff --git a/arch/tile/include/asm/kgdb.h b/arch/tile/include/asm/kgdb.h
new file mode 100644
index 000000000000..280c181cf0db
--- /dev/null
+++ b/arch/tile/include/asm/kgdb.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ * TILE-Gx KGDB support.
+ */
+
+#ifndef __TILE_KGDB_H__
+#define __TILE_KGDB_H__
+
+#include <linux/kdebug.h>
+#include <arch/opcode.h>
+
+#define GDB_SIZEOF_REG		sizeof(unsigned long)
+
+/*
+ * TILE-Gx gdb is expecting the following register layout:
+ * 56 GPRs(R0 - R52, TP, SP, LR), 8 special GPRs(networks and ZERO),
+ * plus the PC and the faultnum.
+ *
+ * Even though kernel not use the 8 special GPRs, they need to be present
+ * in the registers sent for correct processing in the host-side gdb.
+ *
+ */
+#define DBG_MAX_REG_NUM		(56+8+2)
+#define NUMREGBYTES		(DBG_MAX_REG_NUM * GDB_SIZEOF_REG)
+
+/*
+ * BUFMAX defines the maximum number of characters in inbound/outbound
+ * buffers at least NUMREGBYTES*2 are needed for register packets,
+ * Longer buffer is needed to list all threads.
+ */
+#define BUFMAX			2048
+
+#define BREAK_INSTR_SIZE	TILEGX_BUNDLE_SIZE_IN_BYTES
+
+/*
+ * Require cache flush for set/clear a software breakpoint or write memory.
+ */
+#define CACHE_FLUSH_IS_SAFE	1
+
+/*
+ * The compiled-in breakpoint instruction can be used to "break" into
+ * the debugger via magic system request key (sysrq-G).
+ */
+static tile_bundle_bits compiled_bpt = TILEGX_BPT_BUNDLE | DIE_COMPILED_BPT;
+
+enum tilegx_regnum {
+	TILEGX_PC_REGNUM = TREG_LAST_GPR + 9,
+	TILEGX_FAULTNUM_REGNUM,
+};
+
+/*
+ * Generate a breakpoint exception to "break" into the debugger.
+ */
+static inline void arch_kgdb_breakpoint(void)
+{
+	asm volatile (".quad %0\n\t"
+		      ::""(compiled_bpt));
+}
+
+#endif /* __TILE_KGDB_H__ */
diff --git a/arch/tile/include/asm/kprobes.h b/arch/tile/include/asm/kprobes.h
new file mode 100644
index 000000000000..d8f9a83943b1
--- /dev/null
+++ b/arch/tile/include/asm/kprobes.h
@@ -0,0 +1,79 @@
+/*
+ * arch/tile/include/asm/kprobes.h
+ *
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef _ASM_TILE_KPROBES_H
+#define _ASM_TILE_KPROBES_H
+
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/percpu.h>
+
+#include <arch/opcode.h>
+
+#define __ARCH_WANT_KPROBES_INSN_SLOT
+#define MAX_INSN_SIZE			2
+
+#define kretprobe_blacklist_size 0
+
+typedef tile_bundle_bits kprobe_opcode_t;
+
+#define flush_insn_slot(p)						\
+	flush_icache_range((unsigned long)p->addr,			\
+			   (unsigned long)p->addr +			\
+			   (MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
+
+struct kprobe;
+
+/* Architecture specific copy of original instruction. */
+struct arch_specific_insn {
+	kprobe_opcode_t *insn;
+};
+
+struct prev_kprobe {
+	struct kprobe *kp;
+	unsigned long status;
+	unsigned long saved_pc;
+};
+
+#define MAX_JPROBES_STACK_SIZE 128
+#define MAX_JPROBES_STACK_ADDR \
+	(((unsigned long)current_thread_info()) + THREAD_SIZE - 32 \
+		- sizeof(struct pt_regs))
+
+#define MIN_JPROBES_STACK_SIZE(ADDR)					\
+	((((ADDR) + MAX_JPROBES_STACK_SIZE) > MAX_JPROBES_STACK_ADDR)	\
+		? MAX_JPROBES_STACK_ADDR - (ADDR)			\
+		: MAX_JPROBES_STACK_SIZE)
+
+/* per-cpu kprobe control block. */
+struct kprobe_ctlblk {
+	unsigned long kprobe_status;
+	unsigned long kprobe_saved_pc;
+	unsigned long jprobe_saved_sp;
+	struct prev_kprobe prev_kprobe;
+	struct pt_regs jprobe_saved_regs;
+	char jprobes_stack[MAX_JPROBES_STACK_SIZE];
+};
+
+extern tile_bundle_bits breakpoint2_insn;
+extern tile_bundle_bits breakpoint_insn;
+
+void arch_remove_kprobe(struct kprobe *);
+
+extern int kprobe_exceptions_notify(struct notifier_block *self,
+			     unsigned long val, void *data);
+
+#endif /* _ASM_TILE_KPROBES_H */
diff --git a/arch/tile/include/asm/mmu.h b/arch/tile/include/asm/mmu.h
index e2c789096795..0cab1182bde1 100644
--- a/arch/tile/include/asm/mmu.h
+++ b/arch/tile/include/asm/mmu.h
@@ -22,6 +22,7 @@ struct mm_context {
 	 * semaphore but atomically, but it is conservatively set.
 	 */
 	unsigned long priority_cached;
+	unsigned long vdso_base;
 };
 
 typedef struct mm_context mm_context_t;
diff --git a/arch/tile/include/asm/mmu_context.h b/arch/tile/include/asm/mmu_context.h
index 37f0b741dee7..4734215e2ad4 100644
--- a/arch/tile/include/asm/mmu_context.h
+++ b/arch/tile/include/asm/mmu_context.h
@@ -45,7 +45,7 @@ static inline void __install_page_table(pgd_t *pgdir, int asid, pgprot_t prot)
 
 static inline void install_page_table(pgd_t *pgdir, int asid)
 {
-	pte_t *ptep = virt_to_pte(NULL, (unsigned long)pgdir);
+	pte_t *ptep = virt_to_kpte((unsigned long)pgdir);
 	__install_page_table(pgdir, asid, *ptep);
 }
 
diff --git a/arch/tile/include/asm/mmzone.h b/arch/tile/include/asm/mmzone.h
index 9d3dbce8f953..804f1098b6cd 100644
--- a/arch/tile/include/asm/mmzone.h
+++ b/arch/tile/include/asm/mmzone.h
@@ -42,7 +42,7 @@ static inline int pfn_to_nid(unsigned long pfn)
 
 #define kern_addr_valid(kaddr)	virt_addr_valid((void *)kaddr)
 
-static inline int pfn_valid(int pfn)
+static inline int pfn_valid(unsigned long pfn)
 {
 	int nid = pfn_to_nid(pfn);
 
diff --git a/arch/tile/include/asm/page.h b/arch/tile/include/asm/page.h
index dd033a4fd627..6346888f7bdc 100644
--- a/arch/tile/include/asm/page.h
+++ b/arch/tile/include/asm/page.h
@@ -39,6 +39,12 @@
 #define HPAGE_MASK	(~(HPAGE_SIZE - 1))
 
 /*
+ * We do define AT_SYSINFO_EHDR to support vDSO,
+ * but don't use the gate mechanism.
+ */
+#define __HAVE_ARCH_GATE_AREA		1
+
+/*
  * If the Kconfig doesn't specify, set a maximum zone order that
  * is enough so that we can create huge pages from small pages given
  * the respective sizes of the two page types.  See <linux/mmzone.h>.
@@ -142,8 +148,12 @@ static inline __attribute_const__ int get_order(unsigned long size)
 #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
 #endif
 
+/* Allow overriding how much VA or PA the kernel will use. */
+#define MAX_PA_WIDTH CHIP_PA_WIDTH()
+#define MAX_VA_WIDTH CHIP_VA_WIDTH()
+
 /* Each memory controller has PAs distinct in their high bits. */
-#define NR_PA_HIGHBIT_SHIFT (CHIP_PA_WIDTH() - CHIP_LOG_NUM_MSHIMS())
+#define NR_PA_HIGHBIT_SHIFT (MAX_PA_WIDTH - CHIP_LOG_NUM_MSHIMS())
 #define NR_PA_HIGHBIT_VALUES (1 << CHIP_LOG_NUM_MSHIMS())
 #define __pa_to_highbits(pa) ((phys_addr_t)(pa) >> NR_PA_HIGHBIT_SHIFT)
 #define __pfn_to_highbits(pfn) ((pfn) >> (NR_PA_HIGHBIT_SHIFT - PAGE_SHIFT))
@@ -154,7 +164,7 @@ static inline __attribute_const__ int get_order(unsigned long size)
  * We reserve the lower half of memory for user-space programs, and the
  * upper half for system code.  We re-map all of physical memory in the
  * upper half, which takes a quarter of our VA space.  Then we have
- * the vmalloc regions.  The supervisor code lives at 0xfffffff700000000,
+ * the vmalloc regions.  The supervisor code lives at the highest address,
  * with the hypervisor above that.
  *
  * Loadable kernel modules are placed immediately after the static
@@ -166,26 +176,19 @@ static inline __attribute_const__ int get_order(unsigned long size)
  * Similarly, for now we don't play any struct page mapping games.
  */
 
-#if CHIP_PA_WIDTH() + 2 > CHIP_VA_WIDTH()
+#if MAX_PA_WIDTH + 2 > MAX_VA_WIDTH
 # error Too much PA to map with the VA available!
 #endif
-#define HALF_VA_SPACE           (_AC(1, UL) << (CHIP_VA_WIDTH() - 1))
 
-#define MEM_LOW_END		(HALF_VA_SPACE - 1)         /* low half */
-#define MEM_HIGH_START		(-HALF_VA_SPACE)            /* high half */
-#define PAGE_OFFSET		MEM_HIGH_START
-#define FIXADDR_BASE		_AC(0xfffffff400000000, UL) /* 4 GB */
-#define FIXADDR_TOP		_AC(0xfffffff500000000, UL) /* 4 GB */
+#define PAGE_OFFSET		(-(_AC(1, UL) << (MAX_VA_WIDTH - 1)))
+#define KERNEL_HIGH_VADDR	_AC(0xfffffff800000000, UL)  /* high 32GB */
+#define FIXADDR_BASE		(KERNEL_HIGH_VADDR - 0x400000000) /* 4 GB */
+#define FIXADDR_TOP		(KERNEL_HIGH_VADDR - 0x300000000) /* 4 GB */
 #define _VMALLOC_START		FIXADDR_TOP
-#define HUGE_VMAP_BASE		_AC(0xfffffff600000000, UL) /* 4 GB */
-#define MEM_SV_START		_AC(0xfffffff700000000, UL) /* 256 MB */
-#define MEM_SV_INTRPT		MEM_SV_START
-#define MEM_MODULE_START	_AC(0xfffffff710000000, UL) /* 256 MB */
+#define HUGE_VMAP_BASE		(KERNEL_HIGH_VADDR - 0x200000000) /* 4 GB */
+#define MEM_SV_START		(KERNEL_HIGH_VADDR - 0x100000000) /* 256 MB */
+#define MEM_MODULE_START	(MEM_SV_START + (256*1024*1024)) /* 256 MB */
 #define MEM_MODULE_END		(MEM_MODULE_START + (256*1024*1024))
-#define MEM_HV_START		_AC(0xfffffff800000000, UL) /* 32 GB */
-
-/* Highest DTLB address we will use */
-#define KERNEL_HIGH_VADDR	MEM_SV_START
 
 #else /* !__tilegx__ */
 
@@ -207,25 +210,18 @@ static inline __attribute_const__ int get_order(unsigned long size)
  * values, and after that, we show "typical" values, since the actual
  * addresses depend on kernel #defines.
  *
- * MEM_HV_INTRPT                   0xfe000000
- * MEM_SV_INTRPT (kernel code)     0xfd000000
+ * MEM_HV_START                    0xfe000000
+ * MEM_SV_START  (kernel code)     0xfd000000
  * MEM_USER_INTRPT (user vector)   0xfc000000
- * FIX_KMAP_xxx                    0xf8000000 (via NR_CPUS * KM_TYPE_NR)
- * PKMAP_BASE                      0xf7000000 (via LAST_PKMAP)
- * HUGE_VMAP                       0xf3000000 (via CONFIG_NR_HUGE_VMAPS)
- * VMALLOC_START                   0xf0000000 (via __VMALLOC_RESERVE)
+ * FIX_KMAP_xxx                    0xfa000000 (via NR_CPUS * KM_TYPE_NR)
+ * PKMAP_BASE                      0xf9000000 (via LAST_PKMAP)
+ * VMALLOC_START                   0xf7000000 (via VMALLOC_RESERVE)
  * mapped LOWMEM                   0xc0000000
  */
 
 #define MEM_USER_INTRPT		_AC(0xfc000000, UL)
-#if CONFIG_KERNEL_PL == 1
-#define MEM_SV_INTRPT		_AC(0xfd000000, UL)
-#define MEM_HV_INTRPT		_AC(0xfe000000, UL)
-#else
-#define MEM_GUEST_INTRPT	_AC(0xfd000000, UL)
-#define MEM_SV_INTRPT		_AC(0xfe000000, UL)
-#define MEM_HV_INTRPT		_AC(0xff000000, UL)
-#endif
+#define MEM_SV_START		_AC(0xfd000000, UL)
+#define MEM_HV_START		_AC(0xfe000000, UL)
 
 #define INTRPT_SIZE		0x4000
 
@@ -246,7 +242,7 @@ static inline __attribute_const__ int get_order(unsigned long size)
 
 #endif /* __tilegx__ */
 
-#ifndef __ASSEMBLY__
+#if !defined(__ASSEMBLY__) && !defined(VDSO_BUILD)
 
 #ifdef CONFIG_HIGHMEM
 
@@ -332,6 +328,7 @@ static inline int pfn_valid(unsigned long pfn)
 
 struct mm_struct;
 extern pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr);
+extern pte_t *virt_to_kpte(unsigned long kaddr);
 
 #endif /* !__ASSEMBLY__ */
 
diff --git a/arch/tile/include/asm/pci.h b/arch/tile/include/asm/pci.h
index 54a924208d3c..dfedd7ac7298 100644
--- a/arch/tile/include/asm/pci.h
+++ b/arch/tile/include/asm/pci.h
@@ -17,7 +17,6 @@
 
 #include <linux/dma-mapping.h>
 #include <linux/pci.h>
-#include <linux/numa.h>
 #include <asm-generic/pci_iomap.h>
 
 #ifndef __tilegx__
@@ -29,7 +28,6 @@ struct pci_controller {
 	int index;		/* PCI domain number */
 	struct pci_bus *root_bus;
 
-	int first_busno;
 	int last_busno;
 
 	int hv_cfg_fd[2];	/* config{0,1} fds for this PCIe controller */
@@ -124,6 +122,11 @@ static inline void pci_iounmap(struct pci_dev *dev, void __iomem *addr) {}
  * the CPA plus TILE_PCI_MEM_MAP_BASE_OFFSET. To support 32-bit
  * devices, we create a separate map region that handles the low
  * 4GB.
+ *
+ * This design lets us avoid the "PCI hole" problem where the host bridge
+ * won't pass DMA traffic with target addresses that happen to fall within the
+ * BAR space. This enables us to use all the physical memory for DMA, instead
+ * of wasting the same amount of physical memory as the BAR window size.
  */
 #define	TILE_PCI_MEM_MAP_BASE_OFFSET	(1ULL << CHIP_PA_WIDTH())
 
@@ -145,6 +148,10 @@ struct pci_controller {
 
 	int pio_mem_index;	/* PIO region index for memory access */
 
+#ifdef CONFIG_TILE_PCI_IO
+	int pio_io_index;	/* PIO region index for I/O space access */
+#endif
+
 	/*
 	 * Mem-Map regions for all the memory controllers so that Linux can
 	 * map all of its physical memory space to the PCI bus.
@@ -154,6 +161,10 @@ struct pci_controller {
 	int index;		/* PCI domain number */
 	struct pci_bus *root_bus;
 
+	/* PCI I/O space resource for this controller. */
+	struct resource io_space;
+	char io_space_name[32];
+
 	/* PCI memory space resource for this controller. */
 	struct resource mem_space;
 	char mem_space_name[32];
@@ -166,13 +177,11 @@ struct pci_controller {
 
 	/* Table that maps the INTx numbers to Linux irq numbers. */
 	int irq_intx_table[4];
-
-	/* Address ranges that are routed to this controller/bridge. */
-	struct resource mem_resources[3];
 };
 
 extern struct pci_controller pci_controllers[TILEGX_NUM_TRIO * TILEGX_TRIO_PCIES];
 extern gxio_trio_context_t trio_contexts[TILEGX_NUM_TRIO];
+extern int num_trio_shims;
 
 extern void pci_iounmap(struct pci_dev *dev, void __iomem *);
 
@@ -211,7 +220,8 @@ static inline int pcibios_assign_all_busses(void)
 }
 
 #define PCIBIOS_MIN_MEM		0
-#define PCIBIOS_MIN_IO		0
+/* Minimum PCI I/O address, starting at the page boundary. */
+#define PCIBIOS_MIN_IO		PAGE_SIZE
 
 /* Use any cpu for PCI. */
 #define cpumask_of_pcibus(bus) cpu_online_mask
diff --git a/arch/tile/include/asm/pgtable_32.h b/arch/tile/include/asm/pgtable_32.h
index 4ce4a7a99c24..63142ab3b3dd 100644
--- a/arch/tile/include/asm/pgtable_32.h
+++ b/arch/tile/include/asm/pgtable_32.h
@@ -84,10 +84,12 @@ extern unsigned long VMALLOC_RESERVE /* = CONFIG_VMALLOC_RESERVE */;
 /* We have no pmd or pud since we are strictly a two-level page table */
 #include <asm-generic/pgtable-nopmd.h>
 
+static inline int pud_huge_page(pud_t pud)	{ return 0; }
+
 /* We don't define any pgds for these addresses. */
 static inline int pgd_addr_invalid(unsigned long addr)
 {
-	return addr >= MEM_HV_INTRPT;
+	return addr >= MEM_HV_START;
 }
 
 /*
diff --git a/arch/tile/include/asm/pgtable_64.h b/arch/tile/include/asm/pgtable_64.h
index 2492fa5478e7..3421177f7370 100644
--- a/arch/tile/include/asm/pgtable_64.h
+++ b/arch/tile/include/asm/pgtable_64.h
@@ -63,6 +63,15 @@
 /* We have no pud since we are a three-level page table. */
 #include <asm-generic/pgtable-nopud.h>
 
+/*
+ * pmds are the same as pgds and ptes, so converting is a no-op.
+ */
+#define pmd_pte(pmd) (pmd)
+#define pmdp_ptep(pmdp) (pmdp)
+#define pte_pmd(pte) (pte)
+
+#define pud_pte(pud) ((pud).pgd)
+
 static inline int pud_none(pud_t pud)
 {
 	return pud_val(pud) == 0;
@@ -73,6 +82,11 @@ static inline int pud_present(pud_t pud)
 	return pud_val(pud) & _PAGE_PRESENT;
 }
 
+static inline int pud_huge_page(pud_t pud)
+{
+	return pud_val(pud) & _PAGE_HUGE_PAGE;
+}
+
 #define pmd_ERROR(e) \
 	pr_err("%s:%d: bad pmd 0x%016llx.\n", __FILE__, __LINE__, pmd_val(e))
 
@@ -89,6 +103,9 @@ static inline int pud_bad(pud_t pud)
 /* Return the page-table frame number (ptfn) that a pud_t points at. */
 #define pud_ptfn(pud) hv_pte_get_ptfn((pud).pgd)
 
+/* Return the page frame number (pfn) that a pud_t points at. */
+#define pud_pfn(pud) pte_pfn(pud_pte(pud))
+
 /*
  * A given kernel pud_t maps to a kernel pmd_t table at a specific
  * virtual address.  Since kernel pmd_t tables can be aligned at
@@ -123,8 +140,7 @@ static inline unsigned long pgd_addr_normalize(unsigned long addr)
 /* We don't define any pgds for these addresses. */
 static inline int pgd_addr_invalid(unsigned long addr)
 {
-	return addr >= MEM_HV_START ||
-		(addr > MEM_LOW_END && addr < MEM_HIGH_START);
+	return addr >= KERNEL_HIGH_VADDR || addr != pgd_addr_normalize(addr);
 }
 
 /*
@@ -152,13 +168,6 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
 	return hv_pte(__insn_exch(&ptep->val, 0UL));
 }
 
-/*
- * pmds are the same as pgds and ptes, so converting is a no-op.
- */
-#define pmd_pte(pmd) (pmd)
-#define pmdp_ptep(pmdp) (pmdp)
-#define pte_pmd(pte) (pte)
-
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_TILE_PGTABLE_64_H */
diff --git a/arch/tile/include/asm/processor.h b/arch/tile/include/asm/processor.h
index b3f104953da2..42323636c459 100644
--- a/arch/tile/include/asm/processor.h
+++ b/arch/tile/include/asm/processor.h
@@ -15,6 +15,8 @@
 #ifndef _ASM_TILE_PROCESSOR_H
 #define _ASM_TILE_PROCESSOR_H
 
+#include <arch/chip.h>
+
 #ifndef __ASSEMBLY__
 
 /*
@@ -25,7 +27,6 @@
 #include <asm/ptrace.h>
 #include <asm/percpu.h>
 
-#include <arch/chip.h>
 #include <arch/spr_def.h>
 
 struct task_struct;
@@ -110,18 +111,16 @@ struct thread_struct {
 	unsigned long long interrupt_mask;
 	/* User interrupt-control 0 state */
 	unsigned long intctrl_0;
-#if CHIP_HAS_PROC_STATUS_SPR()
+	/* Is this task currently doing a backtrace? */
+	bool in_backtrace;
 	/* Any other miscellaneous processor state bits */
 	unsigned long proc_status;
-#endif
 #if !CHIP_HAS_FIXED_INTVEC_BASE()
 	/* Interrupt base for PL0 interrupts */
 	unsigned long interrupt_vector_base;
 #endif
-#if CHIP_HAS_TILE_RTF_HWM()
 	/* Tile cache retry fifo high-water mark */
 	unsigned long tile_rtf_hwm;
-#endif
 #if CHIP_HAS_DSTREAM_PF()
 	/* Data stream prefetch control */
 	unsigned long dstream_pf;
@@ -134,21 +133,16 @@ struct thread_struct {
 	/* Async DMA TLB fault information */
 	struct async_tlb dma_async_tlb;
 #endif
-#if CHIP_HAS_SN_PROC()
-	/* Was static network processor when we were switched out? */
-	int sn_proc_running;
-	/* Async SNI TLB fault information */
-	struct async_tlb sn_async_tlb;
-#endif
 };
 
 #endif /* !__ASSEMBLY__ */
 
 /*
  * Start with "sp" this many bytes below the top of the kernel stack.
- * This preserves the invariant that a called function may write to *sp.
+ * This allows us to be cache-aware when handling the initial save
+ * of the pt_regs value to the stack.
  */
-#define STACK_TOP_DELTA 8
+#define STACK_TOP_DELTA 64
 
 /*
  * When entering the kernel via a fault, start with the top of the
@@ -164,7 +158,7 @@ struct thread_struct {
 #ifndef __ASSEMBLY__
 
 #ifdef __tilegx__
-#define TASK_SIZE_MAX		(MEM_LOW_END + 1)
+#define TASK_SIZE_MAX		(_AC(1, UL) << (MAX_VA_WIDTH - 1))
 #else
 #define TASK_SIZE_MAX		PAGE_OFFSET
 #endif
@@ -178,10 +172,10 @@ struct thread_struct {
 #define TASK_SIZE		TASK_SIZE_MAX
 #endif
 
-/* We provide a minimal "vdso" a la x86; just the sigreturn code for now. */
-#define VDSO_BASE		(TASK_SIZE - PAGE_SIZE)
+#define VDSO_BASE	((unsigned long)current->active_mm->context.vdso_base)
+#define VDSO_SYM(x)	(VDSO_BASE + (unsigned long)(x))
 
-#define STACK_TOP		VDSO_BASE
+#define STACK_TOP		TASK_SIZE
 
 /* STACK_TOP_MAX is used temporarily in execve and should not check COMPAT. */
 #define STACK_TOP_MAX		TASK_SIZE_MAX
@@ -232,21 +226,28 @@ extern int do_work_pending(struct pt_regs *regs, u32 flags);
 unsigned long get_wchan(struct task_struct *p);
 
 /* Return initial ksp value for given task. */
-#define task_ksp0(task) ((unsigned long)(task)->stack + THREAD_SIZE)
+#define task_ksp0(task) \
+	((unsigned long)(task)->stack + THREAD_SIZE - STACK_TOP_DELTA)
 
 /* Return some info about the user process TASK. */
-#define KSTK_TOP(task)	(task_ksp0(task) - STACK_TOP_DELTA)
 #define task_pt_regs(task) \
-  ((struct pt_regs *)(task_ksp0(task) - KSTK_PTREGS_GAP) - 1)
+	((struct pt_regs *)(task_ksp0(task) - KSTK_PTREGS_GAP) - 1)
 #define current_pt_regs()                                   \
-  ((struct pt_regs *)((stack_pointer | (THREAD_SIZE - 1)) - \
-                      (KSTK_PTREGS_GAP - 1)) - 1)
+	((struct pt_regs *)((stack_pointer | (THREAD_SIZE - 1)) - \
+			    STACK_TOP_DELTA - (KSTK_PTREGS_GAP - 1)) - 1)
 #define task_sp(task)	(task_pt_regs(task)->sp)
 #define task_pc(task)	(task_pt_regs(task)->pc)
 /* Aliases for pc and sp (used in fs/proc/array.c) */
 #define KSTK_EIP(task)	task_pc(task)
 #define KSTK_ESP(task)	task_sp(task)
 
+/* Fine-grained unaligned JIT support */
+#define GET_UNALIGN_CTL(tsk, adr)	get_unalign_ctl((tsk), (adr))
+#define SET_UNALIGN_CTL(tsk, val)	set_unalign_ctl((tsk), (val))
+
+extern int get_unalign_ctl(struct task_struct *tsk, unsigned long adr);
+extern int set_unalign_ctl(struct task_struct *tsk, unsigned int val);
+
 /* Standard format for printing registers and other word-size data. */
 #ifdef __tilegx__
 # define REGFMT "0x%016lx"
@@ -275,7 +276,6 @@ extern char chip_model[64];
 /* Data on which physical memory controller corresponds to which NUMA node. */
 extern int node_controller[];
 
-#if CHIP_HAS_CBOX_HOME_MAP()
 /* Does the heap allocator return hash-for-home pages by default? */
 extern int hash_default;
 
@@ -285,11 +285,6 @@ extern int kstack_hash;
 /* Does MAP_ANONYMOUS return hash-for-home pages by default? */
 #define uheap_hash hash_default
 
-#else
-#define hash_default 0
-#define kstack_hash 0
-#define uheap_hash 0
-#endif
 
 /* Are we using huge pages in the TLB for kernel data? */
 extern int kdata_huge;
@@ -337,7 +332,6 @@ extern int kdata_huge;
 
 /*
  * Provide symbolic constants for PLs.
- * Note that assembly code assumes that USER_PL is zero.
  */
 #define USER_PL 0
 #if CONFIG_KERNEL_PL == 2
@@ -346,20 +340,38 @@ extern int kdata_huge;
 #define KERNEL_PL CONFIG_KERNEL_PL
 
 /* SYSTEM_SAVE_K_0 holds the current cpu number ORed with ksp0. */
-#define CPU_LOG_MASK_VALUE 12
-#define CPU_MASK_VALUE ((1 << CPU_LOG_MASK_VALUE) - 1)
-#if CONFIG_NR_CPUS > CPU_MASK_VALUE
-# error Too many cpus!
+#ifdef __tilegx__
+#define CPU_SHIFT 48
+#if CHIP_VA_WIDTH() > CPU_SHIFT
+# error Too many VA bits!
 #endif
+#define MAX_CPU_ID ((1 << (64 - CPU_SHIFT)) - 1)
+#define raw_smp_processor_id() \
+	((int)(__insn_mfspr(SPR_SYSTEM_SAVE_K_0) >> CPU_SHIFT))
+#define get_current_ksp0() \
+	((unsigned long)(((long)__insn_mfspr(SPR_SYSTEM_SAVE_K_0) << \
+			  (64 - CPU_SHIFT)) >> (64 - CPU_SHIFT)))
+#define next_current_ksp0(task) ({ \
+	unsigned long __ksp0 = task_ksp0(task) & ((1UL << CPU_SHIFT) - 1); \
+	unsigned long __cpu = (long)raw_smp_processor_id() << CPU_SHIFT; \
+	__ksp0 | __cpu; \
+})
+#else
+#define LOG2_NR_CPU_IDS 6
+#define MAX_CPU_ID ((1 << LOG2_NR_CPU_IDS) - 1)
 #define raw_smp_processor_id() \
-	((int)__insn_mfspr(SPR_SYSTEM_SAVE_K_0) & CPU_MASK_VALUE)
+	((int)__insn_mfspr(SPR_SYSTEM_SAVE_K_0) & MAX_CPU_ID)
 #define get_current_ksp0() \
-	(__insn_mfspr(SPR_SYSTEM_SAVE_K_0) & ~CPU_MASK_VALUE)
+	(__insn_mfspr(SPR_SYSTEM_SAVE_K_0) & ~MAX_CPU_ID)
 #define next_current_ksp0(task) ({ \
 	unsigned long __ksp0 = task_ksp0(task); \
 	int __cpu = raw_smp_processor_id(); \
-	BUG_ON(__ksp0 & CPU_MASK_VALUE); \
+	BUG_ON(__ksp0 & MAX_CPU_ID); \
 	__ksp0 | __cpu; \
 })
+#endif
+#if CONFIG_NR_CPUS > (MAX_CPU_ID + 1)
+# error Too many cpus!
+#endif
 
 #endif /* _ASM_TILE_PROCESSOR_H */
diff --git a/arch/tile/include/asm/ptrace.h b/arch/tile/include/asm/ptrace.h
index fd412260aff7..b9620c077abc 100644
--- a/arch/tile/include/asm/ptrace.h
+++ b/arch/tile/include/asm/ptrace.h
@@ -33,12 +33,13 @@ typedef unsigned long pt_reg_t;
 
 #ifndef __ASSEMBLY__
 
+#define regs_return_value(regs) ((regs)->regs[0])
 #define instruction_pointer(regs) ((regs)->pc)
 #define profile_pc(regs) instruction_pointer(regs)
 #define user_stack_pointer(regs) ((regs)->sp)
 
 /* Does the process account for user or for system time? */
-#define user_mode(regs) (EX1_PL((regs)->ex1) == USER_PL)
+#define user_mode(regs) (EX1_PL((regs)->ex1) < KERNEL_PL)
 
 /* Fill in a struct pt_regs with the current kernel registers. */
 struct pt_regs *get_pt_regs(struct pt_regs *);
@@ -79,8 +80,7 @@ extern void single_step_execve(void);
 
 struct task_struct;
 
-extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
-			 int error_code);
+extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs);
 
 #ifdef __tilegx__
 /* We need this since sigval_t has a user pointer in it, for GETSIGINFO etc. */
diff --git a/arch/tile/include/asm/sections.h b/arch/tile/include/asm/sections.h
index 7d8a935a9238..5d5d3b739a6b 100644
--- a/arch/tile/include/asm/sections.h
+++ b/arch/tile/include/asm/sections.h
@@ -25,10 +25,16 @@ extern char _sinitdata[], _einitdata[];
 /* Write-once data is writable only till the end of initialization. */
 extern char __w1data_begin[], __w1data_end[];
 
+extern char vdso_start[], vdso_end[];
+#ifdef CONFIG_COMPAT
+extern char vdso32_start[], vdso32_end[];
+#endif
 
 /* Not exactly sections, but PC comparison points in the code. */
 extern char __rt_sigreturn[], __rt_sigreturn_end[];
-#ifndef __tilegx__
+#ifdef __tilegx__
+extern char __start_unalign_asm_code[], __end_unalign_asm_code[];
+#else
 extern char sys_cmpxchg[], __sys_cmpxchg_end[];
 extern char __sys_cmpxchg_grab_lock[];
 extern char __start_atomic_asm_code[], __end_atomic_asm_code[];
diff --git a/arch/tile/include/asm/setup.h b/arch/tile/include/asm/setup.h
index d048888c5d9a..e98909033e5b 100644
--- a/arch/tile/include/asm/setup.h
+++ b/arch/tile/include/asm/setup.h
@@ -24,9 +24,8 @@
  */
 #define MAXMEM_PFN	PFN_DOWN(MAXMEM)
 
+int tile_console_write(const char *buf, int count);
 void early_panic(const char *fmt, ...);
-void warn_early_printk(void);
-void __init disable_early_printk(void);
 
 /* Init-time routine to do tile-specific per-cpu setup. */
 void setup_cpu(int boot);
diff --git a/arch/tile/include/asm/smp.h b/arch/tile/include/asm/smp.h
index 1aa759aeb5b3..9a326b64f7ae 100644
--- a/arch/tile/include/asm/smp.h
+++ b/arch/tile/include/asm/smp.h
@@ -101,10 +101,8 @@ void print_disabled_cpus(void);
 extern struct cpumask cpu_lotar_map;
 #define cpu_is_valid_lotar(cpu) cpumask_test_cpu((cpu), &cpu_lotar_map)
 
-#if CHIP_HAS_CBOX_HOME_MAP()
 /* Which processors are used for hash-for-home mapping */
 extern struct cpumask hash_for_home_map;
-#endif
 
 /* Which cpus can have their cache flushed by hv_flush_remote(). */
 extern struct cpumask cpu_cacheable_map;
diff --git a/arch/tile/include/asm/spinlock_64.h b/arch/tile/include/asm/spinlock_64.h
index 5f8b6a095fd8..9a12b9c7e5d3 100644
--- a/arch/tile/include/asm/spinlock_64.h
+++ b/arch/tile/include/asm/spinlock_64.h
@@ -27,7 +27,7 @@
  * Return the "current" portion of a ticket lock value,
  * i.e. the number that currently owns the lock.
  */
-static inline int arch_spin_current(u32 val)
+static inline u32 arch_spin_current(u32 val)
 {
 	return val >> __ARCH_SPIN_CURRENT_SHIFT;
 }
@@ -36,7 +36,7 @@ static inline int arch_spin_current(u32 val)
  * Return the "next" portion of a ticket lock value,
  * i.e. the number that the next task to try to acquire the lock will get.
  */
-static inline int arch_spin_next(u32 val)
+static inline u32 arch_spin_next(u32 val)
 {
 	return val & __ARCH_SPIN_NEXT_MASK;
 }
diff --git a/arch/tile/include/asm/string.h b/arch/tile/include/asm/string.h
index 7535cf1a30e4..92b271bd9ebd 100644
--- a/arch/tile/include/asm/string.h
+++ b/arch/tile/include/asm/string.h
@@ -21,8 +21,10 @@
 #define __HAVE_ARCH_MEMMOVE
 #define __HAVE_ARCH_STRCHR
 #define __HAVE_ARCH_STRLEN
+#define __HAVE_ARCH_STRNLEN
 
 extern __kernel_size_t strlen(const char *);
+extern __kernel_size_t strnlen(const char *, __kernel_size_t);
 extern char *strchr(const char *s, int c);
 extern void *memchr(const void *s, int c, size_t n);
 extern void *memset(void *, int, __kernel_size_t);
diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h
index d1733dee98a2..b8aa6df3e102 100644
--- a/arch/tile/include/asm/thread_info.h
+++ b/arch/tile/include/asm/thread_info.h
@@ -39,6 +39,11 @@ struct thread_info {
 	struct restart_block	restart_block;
 	struct single_step_state *step_state;	/* single step state
 						   (if non-zero) */
+	int			align_ctl;	/* controls unaligned access */
+#ifdef __tilegx__
+	unsigned long		unalign_jit_tmp[4]; /* temp r0..r3 storage */
+	void __user		*unalign_jit_base; /* unalign fixup JIT base */
+#endif
 };
 
 /*
@@ -56,6 +61,7 @@ struct thread_info {
 		.fn = do_no_restart_syscall,	\
 	},					\
 	.step_state	= NULL,			\
+	.align_ctl	= 0,			\
 }
 
 #define init_thread_info	(init_thread_union.thread_info)
diff --git a/arch/tile/include/asm/traps.h b/arch/tile/include/asm/traps.h
index e28c3df4176a..4b99a1c3aab2 100644
--- a/arch/tile/include/asm/traps.h
+++ b/arch/tile/include/asm/traps.h
@@ -15,12 +15,13 @@
 #ifndef _ASM_TILE_TRAPS_H
 #define _ASM_TILE_TRAPS_H
 
+#ifndef __ASSEMBLY__
 #include <arch/chip.h>
 
 /* mm/fault.c */
 void do_page_fault(struct pt_regs *, int fault_num,
 		   unsigned long address, unsigned long write);
-#if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC()
+#if CHIP_HAS_TILE_DMA()
 void do_async_page_fault(struct pt_regs *);
 #endif
 
@@ -69,6 +70,16 @@ void gx_singlestep_handle(struct pt_regs *, int fault_num);
 
 /* kernel/intvec_64.S */
 void fill_ra_stack(void);
+
+/* Handle unalign data fixup. */
+extern void do_unaligned(struct pt_regs *regs, int vecnum);
+#endif
+
+#endif /* __ASSEMBLY__ */
+
+#ifdef __tilegx__
+/* 128 byte JIT per unalign fixup. */
+#define UNALIGN_JIT_SHIFT    7
 #endif
 
 #endif /* _ASM_TILE_TRAPS_H */
diff --git a/arch/tile/include/asm/uaccess.h b/arch/tile/include/asm/uaccess.h
index e4d44bd7df27..b6cde3209b96 100644
--- a/arch/tile/include/asm/uaccess.h
+++ b/arch/tile/include/asm/uaccess.h
@@ -127,8 +127,10 @@ extern int fixup_exception(struct pt_regs *regs);
 
 #ifdef __LP64__
 #define _ASM_PTR	".quad"
+#define _ASM_ALIGN	".align 8"
 #else
 #define _ASM_PTR	".long"
+#define _ASM_ALIGN	".align 4"
 #endif
 
 #define __get_user_asm(OP, x, ptr, ret)					\
@@ -137,6 +139,7 @@ extern int fixup_exception(struct pt_regs *regs);
 		     "0: { movei %1, 0; movei %0, %3 }\n"		\
 		     "j 9f\n"						\
 		     ".section __ex_table,\"a\"\n"			\
+		     _ASM_ALIGN "\n"					\
 		     _ASM_PTR " 1b, 0b\n"				\
 		     ".popsection\n"					\
 		     "9:"						\
@@ -168,6 +171,7 @@ extern int fixup_exception(struct pt_regs *regs);
 			     "0: { movei %1, 0; movei %2, 0 }\n"	\
 			     "{ movei %0, %4; j 9f }\n"			\
 			     ".section __ex_table,\"a\"\n"		\
+			     ".align 4\n"				\
 			     ".word 1b, 0b\n"				\
 			     ".word 2b, 0b\n"				\
 			     ".popsection\n"				\
@@ -224,6 +228,7 @@ extern int __get_user_bad(void)
 		     ".pushsection .fixup,\"ax\"\n"			\
 		     "0: { movei %0, %3; j 9f }\n"			\
 		     ".section __ex_table,\"a\"\n"			\
+		     _ASM_ALIGN "\n"					\
 		     _ASM_PTR " 1b, 0b\n"				\
 		     ".popsection\n"					\
 		     "9:"						\
@@ -248,6 +253,7 @@ extern int __get_user_bad(void)
 			     ".pushsection .fixup,\"ax\"\n"		\
 			     "0: { movei %0, %4; j 9f }\n"		\
 			     ".section __ex_table,\"a\"\n"		\
+			     ".align 4\n"				\
 			     ".word 1b, 0b\n"				\
 			     ".word 2b, 0b\n"				\
 			     ".popsection\n"				\
@@ -567,37 +573,6 @@ static inline unsigned long __must_check flush_user(
 }
 
 /**
- * inv_user: - Invalidate a block of memory in user space from cache.
- * @mem:   Destination address, in user space.
- * @len:   Number of bytes to invalidate.
- *
- * Returns number of bytes that could not be invalidated.
- * On success, this will be zero.
- *
- * Note that on Tile64, the "inv" operation is in fact a
- * "flush and invalidate", so cache write-backs will occur prior
- * to the cache being marked invalid.
- */
-extern unsigned long inv_user_asm(void __user *mem, unsigned long len);
-static inline unsigned long __must_check __inv_user(
-	void __user *mem, unsigned long len)
-{
-	int retval;
-
-	might_fault();
-	retval = inv_user_asm(mem, len);
-	mb_incoherent();
-	return retval;
-}
-static inline unsigned long __must_check inv_user(
-	void __user *mem, unsigned long len)
-{
-	if (access_ok(VERIFY_WRITE, mem, len))
-		return __inv_user(mem, len);
-	return len;
-}
-
-/**
  * finv_user: - Flush-inval a block of memory in user space from cache.
  * @mem:   Destination address, in user space.
  * @len:   Number of bytes to invalidate.
diff --git a/arch/tile/include/asm/unaligned.h b/arch/tile/include/asm/unaligned.h
index 37dfbe598872..5a58a0d11449 100644
--- a/arch/tile/include/asm/unaligned.h
+++ b/arch/tile/include/asm/unaligned.h
@@ -15,11 +15,15 @@
 #ifndef _ASM_TILE_UNALIGNED_H
 #define _ASM_TILE_UNALIGNED_H
 
-#include <linux/unaligned/le_struct.h>
-#include <linux/unaligned/be_byteshift.h>
-#include <linux/unaligned/generic.h>
-#define get_unaligned	__get_unaligned_le
-#define put_unaligned	__put_unaligned_le
+/*
+ * We could implement faster get_unaligned_[be/le]64 using the ldna
+ * instruction on tilegx; however, we need to either copy all of the
+ * other generic functions to here (which is pretty ugly) or else
+ * modify both the generic code and other arch code to allow arch
+ * specific unaligned data access functions.  Given these functions
+ * are not often called, we'll stick with the generic version.
+ */
+#include <asm-generic/unaligned.h>
 
 /*
  * Is the kernel doing fixups of unaligned accesses?  If <0, no kernel
diff --git a/arch/tile/include/asm/vdso.h b/arch/tile/include/asm/vdso.h
new file mode 100644
index 000000000000..9f6a78d665fa
--- /dev/null
+++ b/arch/tile/include/asm/vdso.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef __TILE_VDSO_H__
+#define __TILE_VDSO_H__
+
+#include <linux/types.h>
+
+/*
+ * Note about the vdso_data structure:
+ *
+ * NEVER USE THEM IN USERSPACE CODE DIRECTLY. The layout of the
+ * structure is supposed to be known only to the function in the vdso
+ * itself and may change without notice.
+ */
+
+struct vdso_data {
+	__u64 tz_update_count;  /* Timezone atomicity ctr             */
+	__u64 tb_update_count;  /* Timebase atomicity ctr             */
+	__u64 xtime_tod_stamp;  /* TOD clock for xtime                */
+	__u64 xtime_clock_sec;  /* Kernel time second                 */
+	__u64 xtime_clock_nsec; /* Kernel time nanosecond             */
+	__u64 wtom_clock_sec;   /* Wall to monotonic clock second     */
+	__u64 wtom_clock_nsec;  /* Wall to monotonic clock nanosecond */
+	__u32 mult;             /* Cycle to nanosecond multiplier     */
+	__u32 shift;            /* Cycle to nanosecond divisor (power of two) */
+	__u32 tz_minuteswest;   /* Minutes west of Greenwich          */
+	__u32 tz_dsttime;       /* Type of dst correction             */
+};
+
+extern struct vdso_data *vdso_data;
+
+/* __vdso_rt_sigreturn is defined with the addresses in the vdso page. */
+extern void __vdso_rt_sigreturn(void);
+
+extern int setup_vdso_pages(void);
+
+#endif /* __TILE_VDSO_H__ */
diff --git a/arch/tile/include/gxio/iorpc_trio.h b/arch/tile/include/gxio/iorpc_trio.h
index 58105c31228b..d95b96fd6c93 100644
--- a/arch/tile/include/gxio/iorpc_trio.h
+++ b/arch/tile/include/gxio/iorpc_trio.h
@@ -30,6 +30,7 @@
 
 #define GXIO_TRIO_OP_ALLOC_MEMORY_MAPS IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1404)
 
+#define GXIO_TRIO_OP_ALLOC_SCATTER_QUEUES IORPC_OPCODE(IORPC_FORMAT_NONE, 0x140e)
 #define GXIO_TRIO_OP_ALLOC_PIO_REGIONS IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1412)
 
 #define GXIO_TRIO_OP_INIT_PIO_REGION_AUX IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1414)
@@ -54,6 +55,10 @@ int gxio_trio_alloc_memory_maps(gxio_trio_context_t * context,
 				unsigned int flags);
 
 
+int gxio_trio_alloc_scatter_queues(gxio_trio_context_t * context,
+				   unsigned int count, unsigned int first,
+				   unsigned int flags);
+
 int gxio_trio_alloc_pio_regions(gxio_trio_context_t * context,
 				unsigned int count, unsigned int first,
 				unsigned int flags);
diff --git a/arch/tile/include/gxio/iorpc_uart.h b/arch/tile/include/gxio/iorpc_uart.h
new file mode 100644
index 000000000000..55429d48ea56
--- /dev/null
+++ b/arch/tile/include/gxio/iorpc_uart.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* This file is machine-generated; DO NOT EDIT! */
+#ifndef __GXIO_UART_LINUX_RPC_H__
+#define __GXIO_UART_LINUX_RPC_H__
+
+#include <hv/iorpc.h>
+
+#include <hv/drv_uart_intf.h>
+#include <gxio/uart.h>
+#include <gxio/kiorpc.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <asm/pgtable.h>
+
+#define GXIO_UART_OP_CFG_INTERRUPT     IORPC_OPCODE(IORPC_FORMAT_KERNEL_INTERRUPT, 0x1900)
+#define GXIO_UART_OP_GET_MMIO_BASE     IORPC_OPCODE(IORPC_FORMAT_NONE_NOUSER, 0x8000)
+#define GXIO_UART_OP_CHECK_MMIO_OFFSET IORPC_OPCODE(IORPC_FORMAT_NONE_NOUSER, 0x8001)
+
+int gxio_uart_cfg_interrupt(gxio_uart_context_t *context, int inter_x,
+			    int inter_y, int inter_ipi, int inter_event);
+
+int gxio_uart_get_mmio_base(gxio_uart_context_t *context, HV_PTE *base);
+
+int gxio_uart_check_mmio_offset(gxio_uart_context_t *context,
+				unsigned long offset, unsigned long size);
+
+#endif /* !__GXIO_UART_LINUX_RPC_H__ */
diff --git a/arch/tile/include/gxio/uart.h b/arch/tile/include/gxio/uart.h
new file mode 100644
index 000000000000..438ee7e46c7b
--- /dev/null
+++ b/arch/tile/include/gxio/uart.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef _GXIO_UART_H_
+#define _GXIO_UART_H_
+
+#include "common.h"
+
+#include <hv/drv_uart_intf.h>
+#include <hv/iorpc.h>
+
+/*
+ *
+ * An API for manipulating UART interface.
+ */
+
+/*
+ *
+ * The Rshim allows access to the processor's UART interface.
+ */
+
+/* A context object used to manage UART resources. */
+typedef struct {
+
+	/* File descriptor for calling up to the hypervisor. */
+	int fd;
+
+	/* The VA at which our MMIO registers are mapped. */
+	char *mmio_base;
+
+} gxio_uart_context_t;
+
+/* Request UART interrupts.
+ *
+ *  Request that interrupts be delivered to a tile when the UART's
+ *  Receive FIFO is written, or the Write FIFO is read.
+ *
+ * @param context Pointer to a properly initialized gxio_uart_context_t.
+ * @param bind_cpu_x X coordinate of CPU to which interrupt will be delivered.
+ * @param bind_cpu_y Y coordinate of CPU to which interrupt will be delivered.
+ * @param bind_interrupt IPI interrupt number.
+ * @param bind_event Sub-interrupt event bit number; a negative value can
+ *  disable the interrupt.
+ * @return Zero if all of the requested UART events were successfully
+ *  configured to interrupt.
+ */
+extern int gxio_uart_cfg_interrupt(gxio_uart_context_t *context,
+				   int bind_cpu_x,
+				   int bind_cpu_y,
+				   int bind_interrupt, int bind_event);
+
+/* Initialize a UART context.
+ *
+ *  A properly initialized context must be obtained before any of the other
+ *  gxio_uart routines may be used.
+ *
+ * @param context Pointer to a gxio_uart_context_t, which will be initialized
+ *  by this routine, if it succeeds.
+ * @param uart_index Index of the UART to use.
+ * @return Zero if the context was successfully initialized, else a
+ *  GXIO_ERR_xxx error code.
+ */
+extern int gxio_uart_init(gxio_uart_context_t *context, int uart_index);
+
+/* Destroy a UART context.
+ *
+ *  Once destroyed, a context may not be used with any gxio_uart routines
+ *  other than gxio_uart_init().  After this routine returns, no further
+ *  interrupts requested on this context will be delivered.  The state and
+ *  configuration of the pins which had been attached to this context are
+ *  unchanged by this operation.
+ *
+ * @param context Pointer to a gxio_uart_context_t.
+ * @return Zero if the context was successfully destroyed, else a
+ *  GXIO_ERR_xxx error code.
+ */
+extern int gxio_uart_destroy(gxio_uart_context_t *context);
+
+/* Write UART register.
+ * @param context Pointer to a gxio_uart_context_t.
+ * @param offset UART register offset.
+ * @param word Data will be wrote to UART reigister.
+ */
+extern void gxio_uart_write(gxio_uart_context_t *context, uint64_t offset,
+			    uint64_t word);
+
+/* Read UART register.
+ * @param context Pointer to a gxio_uart_context_t.
+ * @param offset UART register offset.
+ * @return Data read from UART register.
+ */
+extern uint64_t gxio_uart_read(gxio_uart_context_t *context, uint64_t offset);
+
+#endif /* _GXIO_UART_H_ */
diff --git a/arch/tile/include/hv/drv_trio_intf.h b/arch/tile/include/hv/drv_trio_intf.h
index ef9f3f52ee27..237e04dee66c 100644
--- a/arch/tile/include/hv/drv_trio_intf.h
+++ b/arch/tile/include/hv/drv_trio_intf.h
@@ -64,8 +64,9 @@ struct pcie_port_property
    *  will not consider it an error if the link comes up as a x8 link. */
   uint8_t allow_x8: 1;
 
-  /** Reserved. */
-  uint8_t reserved: 1;
+  /** If true, this link is connected to a device which may or may not
+   *  be present. */
+  uint8_t removable: 1;
 
 };
 
@@ -167,6 +168,9 @@ pcie_stream_intr_config_sel_t;
 struct pcie_trio_ports_property
 {
   struct pcie_port_property ports[TILEGX_TRIO_PCIES];
+
+  /** Set if this TRIO belongs to a Gx72 device. */
+  uint8_t is_gx72;
 };
 
 /* Flags indicating traffic class. */
diff --git a/arch/tile/include/hv/drv_uart_intf.h b/arch/tile/include/hv/drv_uart_intf.h
new file mode 100644
index 000000000000..f5379e2404fd
--- /dev/null
+++ b/arch/tile/include/hv/drv_uart_intf.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/**
+ * Interface definitions for the UART driver.
+ */
+
+#ifndef _SYS_HV_DRV_UART_INTF_H
+#define _SYS_HV_DRV_UART_INTF_H
+
+#include <arch/uart.h>
+
+/** Number of UART ports supported. */
+#define TILEGX_UART_NR        2
+
+/** The mmap file offset (PA) of the UART MMIO region. */
+#define HV_UART_MMIO_OFFSET   0
+
+/** The maximum size of the UARTs MMIO region (64K Bytes). */
+#define HV_UART_MMIO_SIZE     (1UL << 16)
+
+#endif /* _SYS_HV_DRV_UART_INTF_H */
diff --git a/arch/tile/include/hv/hypervisor.h b/arch/tile/include/hv/hypervisor.h
index 837dca5328c2..dfcdeb61ba34 100644
--- a/arch/tile/include/hv/hypervisor.h
+++ b/arch/tile/include/hv/hypervisor.h
@@ -318,8 +318,11 @@
 /** hv_set_pte_super_shift */
 #define HV_DISPATCH_SET_PTE_SUPER_SHIFT           57
 
+/** hv_console_set_ipi */
+#define HV_DISPATCH_CONSOLE_SET_IPI               63
+
 /** One more than the largest dispatch value */
-#define _HV_DISPATCH_END                          58
+#define _HV_DISPATCH_END                          64
 
 
 #ifndef __ASSEMBLER__
@@ -541,14 +544,24 @@ typedef enum {
   HV_CONFSTR_CPUMOD_REV      = 18,
 
   /** Human-readable CPU module description. */
-  HV_CONFSTR_CPUMOD_DESC     = 19
+  HV_CONFSTR_CPUMOD_DESC     = 19,
+
+  /** Per-tile hypervisor statistics.  When this identifier is specified,
+   *  the hv_confstr call takes two extra arguments.  The first is the
+   *  HV_XY_TO_LOTAR of the target tile's coordinates.  The second is
+   *  a flag word.  The only current flag is the lowest bit, which means
+   *  "zero out the stats instead of retrieving them"; in this case the
+   *  buffer and buffer length are ignored. */
+  HV_CONFSTR_HV_STATS        = 20
 
 } HV_ConfstrQuery;
 
 /** Query a configuration string from the hypervisor.
  *
  * @param query Identifier for the specific string to be retrieved
- *        (HV_CONFSTR_xxx).
+ *        (HV_CONFSTR_xxx).  Some strings may require or permit extra
+ *        arguments to be appended which select specific objects to be
+ *        described; see the string descriptions above.
  * @param buf Buffer in which to place the string.
  * @param len Length of the buffer.
  * @return If query is valid, then the length of the corresponding string,
@@ -556,21 +569,16 @@ typedef enum {
  *        was truncated.  If query is invalid, HV_EINVAL.  If the specified
  *        buffer is not writable by the client, HV_EFAULT.
  */
-int hv_confstr(HV_ConfstrQuery query, HV_VirtAddr buf, int len);
+int hv_confstr(HV_ConfstrQuery query, HV_VirtAddr buf, int len, ...);
 
 /** Tile coordinate */
 typedef struct
 {
-#ifndef __BIG_ENDIAN__
   /** X coordinate, relative to supervisor's top-left coordinate */
   int x;
 
   /** Y coordinate, relative to supervisor's top-left coordinate */
   int y;
-#else
-  int y;
-  int x;
-#endif
 } HV_Coord;
 
 
@@ -585,6 +593,30 @@ typedef struct
  */
 int hv_get_ipi_pte(HV_Coord tile, int pl, HV_PTE* pte);
 
+/** Configure the console interrupt.
+ *
+ * When the console client interrupt is enabled, the hypervisor will
+ * deliver the specified IPI to the client in the following situations:
+ *
+ * - The console has at least one character available for input.
+ *
+ * - The console can accept new characters for output, and the last call
+ *   to hv_console_write() did not write all of the characters requested
+ *   by the client.
+ *
+ * Note that in some system configurations, console interrupt will not
+ * be available; clients should be prepared for this routine to fail and
+ * to fall back to periodic console polling in that case.
+ *
+ * @param ipi Index of the IPI register which will receive the interrupt.
+ * @param event IPI event number for console interrupt. If less than 0,
+ *        disable the console IPI interrupt.
+ * @param coord Tile to be targeted for console interrupt.
+ * @return 0 on success, otherwise, HV_EINVAL if illegal parameter,
+ *         HV_ENOTSUP if console interrupt are not available.
+ */
+int hv_console_set_ipi(int ipi, int event, HV_Coord coord);
+
 #else /* !CHIP_HAS_IPI() */
 
 /** A set of interrupts. */
@@ -1092,13 +1124,8 @@ HV_VirtAddrRange hv_inquire_virtual(int idx);
 /** A range of ASID values. */
 typedef struct
 {
-#ifndef __BIG_ENDIAN__
   HV_ASID start;        /**< First ASID in the range. */
   unsigned int size;    /**< Number of ASIDs. Zero for an invalid range. */
-#else
-  unsigned int size;    /**< Number of ASIDs. Zero for an invalid range. */
-  HV_ASID start;        /**< First ASID in the range. */
-#endif
 } HV_ASIDRange;
 
 /** Returns information about a range of ASIDs.
@@ -1422,7 +1449,6 @@ typedef enum
 /** Message recipient. */
 typedef struct
 {
-#ifndef __BIG_ENDIAN__
   /** X coordinate, relative to supervisor's top-left coordinate */
   unsigned int x:11;
 
@@ -1431,11 +1457,6 @@ typedef struct
 
   /** Status of this recipient */
   HV_Recip_State state:10;
-#else //__BIG_ENDIAN__
-  HV_Recip_State state:10;
-  unsigned int y:11;
-  unsigned int x:11;
-#endif
 } HV_Recipient;
 
 /** Send a message to a set of recipients.
diff --git a/arch/tile/include/uapi/arch/Kbuild b/arch/tile/include/uapi/arch/Kbuild
index 4ebc34f4768d..97dfbecec6b6 100644
--- a/arch/tile/include/uapi/arch/Kbuild
+++ b/arch/tile/include/uapi/arch/Kbuild
@@ -1,7 +1,6 @@
 # UAPI Header export list
 header-y += abi.h
 header-y += chip.h
-header-y += chip_tile64.h
 header-y += chip_tilegx.h
 header-y += chip_tilepro.h
 header-y += icache.h
diff --git a/arch/tile/include/uapi/arch/chip.h b/arch/tile/include/uapi/arch/chip.h
index 926d3db0e91e..4c91f90b9369 100644
--- a/arch/tile/include/uapi/arch/chip.h
+++ b/arch/tile/include/uapi/arch/chip.h
@@ -12,9 +12,7 @@
  *   more details.
  */
 
-#if __tile_chip__ == 0
-#include <arch/chip_tile64.h>
-#elif __tile_chip__ == 1
+#if __tile_chip__ == 1
 #include <arch/chip_tilepro.h>
 #elif defined(__tilegx__)
 #include <arch/chip_tilegx.h>
diff --git a/arch/tile/include/uapi/arch/chip_tile64.h b/arch/tile/include/uapi/arch/chip_tile64.h
deleted file mode 100644
index 261aaba092d4..000000000000
--- a/arch/tile/include/uapi/arch/chip_tile64.h
+++ /dev/null
@@ -1,258 +0,0 @@
-/*
- * Copyright 2010 Tilera Corporation. All Rights Reserved.
- *
- *   This program is free software; you can redistribute it and/or
- *   modify it under the terms of the GNU General Public License
- *   as published by the Free Software Foundation, version 2.
- *
- *   This program is distributed in the hope that it will be useful, but
- *   WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- *   NON INFRINGEMENT.  See the GNU General Public License for
- *   more details.
- */
-
-/*
- * @file
- * Global header file.
- * This header file specifies defines for TILE64.
- */
-
-#ifndef __ARCH_CHIP_H__
-#define __ARCH_CHIP_H__
-
-/** Specify chip version.
- * When possible, prefer the CHIP_xxx symbols below for future-proofing.
- * This is intended for cross-compiling; native compilation should
- * use the predefined __tile_chip__ symbol.
- */
-#define TILE_CHIP 0
-
-/** Specify chip revision.
- * This provides for the case of a respin of a particular chip type;
- * the normal value for this symbol is "0".
- * This is intended for cross-compiling; native compilation should
- * use the predefined __tile_chip_rev__ symbol.
- */
-#define TILE_CHIP_REV 0
-
-/** The name of this architecture. */
-#define CHIP_ARCH_NAME "tile64"
-
-/** The ELF e_machine type for binaries for this chip. */
-#define CHIP_ELF_TYPE() EM_TILE64
-
-/** The alternate ELF e_machine type for binaries for this chip. */
-#define CHIP_COMPAT_ELF_TYPE() 0x2506
-
-/** What is the native word size of the machine? */
-#define CHIP_WORD_SIZE() 32
-
-/** How many bits of a virtual address are used. Extra bits must be
- * the sign extension of the low bits.
- */
-#define CHIP_VA_WIDTH() 32
-
-/** How many bits are in a physical address? */
-#define CHIP_PA_WIDTH() 36
-
-/** Size of the L2 cache, in bytes. */
-#define CHIP_L2_CACHE_SIZE() 65536
-
-/** Log size of an L2 cache line in bytes. */
-#define CHIP_L2_LOG_LINE_SIZE() 6
-
-/** Size of an L2 cache line, in bytes. */
-#define CHIP_L2_LINE_SIZE() (1 << CHIP_L2_LOG_LINE_SIZE())
-
-/** Associativity of the L2 cache. */
-#define CHIP_L2_ASSOC() 2
-
-/** Size of the L1 data cache, in bytes. */
-#define CHIP_L1D_CACHE_SIZE() 8192
-
-/** Log size of an L1 data cache line in bytes. */
-#define CHIP_L1D_LOG_LINE_SIZE() 4
-
-/** Size of an L1 data cache line, in bytes. */
-#define CHIP_L1D_LINE_SIZE() (1 << CHIP_L1D_LOG_LINE_SIZE())
-
-/** Associativity of the L1 data cache. */
-#define CHIP_L1D_ASSOC() 2
-
-/** Size of the L1 instruction cache, in bytes. */
-#define CHIP_L1I_CACHE_SIZE() 8192
-
-/** Log size of an L1 instruction cache line in bytes. */
-#define CHIP_L1I_LOG_LINE_SIZE() 6
-
-/** Size of an L1 instruction cache line, in bytes. */
-#define CHIP_L1I_LINE_SIZE() (1 << CHIP_L1I_LOG_LINE_SIZE())
-
-/** Associativity of the L1 instruction cache. */
-#define CHIP_L1I_ASSOC() 1
-
-/** Stride with which flush instructions must be issued. */
-#define CHIP_FLUSH_STRIDE() CHIP_L2_LINE_SIZE()
-
-/** Stride with which inv instructions must be issued. */
-#define CHIP_INV_STRIDE() CHIP_L1D_LINE_SIZE()
-
-/** Stride with which finv instructions must be issued. */
-#define CHIP_FINV_STRIDE() CHIP_L1D_LINE_SIZE()
-
-/** Can the local cache coherently cache data that is homed elsewhere? */
-#define CHIP_HAS_COHERENT_LOCAL_CACHE() 0
-
-/** How many simultaneous outstanding victims can the L2 cache have? */
-#define CHIP_MAX_OUTSTANDING_VICTIMS() 2
-
-/** Does the TLB support the NC and NOALLOC bits? */
-#define CHIP_HAS_NC_AND_NOALLOC_BITS() 0
-
-/** Does the chip support hash-for-home caching? */
-#define CHIP_HAS_CBOX_HOME_MAP() 0
-
-/** Number of entries in the chip's home map tables. */
-/* #define CHIP_CBOX_HOME_MAP_SIZE() -- does not apply to chip 0 */
-
-/** Do uncacheable requests miss in the cache regardless of whether
- * there is matching data? */
-#define CHIP_HAS_ENFORCED_UNCACHEABLE_REQUESTS() 0
-
-/** Does the mf instruction wait for victims? */
-#define CHIP_HAS_MF_WAITS_FOR_VICTIMS() 1
-
-/** Does the chip have an "inv" instruction that doesn't also flush? */
-#define CHIP_HAS_INV() 0
-
-/** Does the chip have a "wh64" instruction? */
-#define CHIP_HAS_WH64() 0
-
-/** Does this chip have a 'dword_align' instruction? */
-#define CHIP_HAS_DWORD_ALIGN() 0
-
-/** Number of performance counters. */
-#define CHIP_PERFORMANCE_COUNTERS() 2
-
-/** Does this chip have auxiliary performance counters? */
-#define CHIP_HAS_AUX_PERF_COUNTERS() 0
-
-/** Is the CBOX_MSR1 SPR supported? */
-#define CHIP_HAS_CBOX_MSR1() 0
-
-/** Is the TILE_RTF_HWM SPR supported? */
-#define CHIP_HAS_TILE_RTF_HWM() 0
-
-/** Is the TILE_WRITE_PENDING SPR supported? */
-#define CHIP_HAS_TILE_WRITE_PENDING() 0
-
-/** Is the PROC_STATUS SPR supported? */
-#define CHIP_HAS_PROC_STATUS_SPR() 0
-
-/** Is the DSTREAM_PF SPR supported? */
-#define CHIP_HAS_DSTREAM_PF() 0
-
-/** Log of the number of mshims we have. */
-#define CHIP_LOG_NUM_MSHIMS() 2
-
-/** Are the bases of the interrupt vector areas fixed? */
-#define CHIP_HAS_FIXED_INTVEC_BASE() 1
-
-/** Are the interrupt masks split up into 2 SPRs? */
-#define CHIP_HAS_SPLIT_INTR_MASK() 1
-
-/** Is the cycle count split up into 2 SPRs? */
-#define CHIP_HAS_SPLIT_CYCLE() 1
-
-/** Does the chip have a static network? */
-#define CHIP_HAS_SN() 1
-
-/** Does the chip have a static network processor? */
-#define CHIP_HAS_SN_PROC() 1
-
-/** Size of the L1 static network processor instruction cache, in bytes. */
-#define CHIP_L1SNI_CACHE_SIZE() 2048
-
-/** Does the chip have DMA support in each tile? */
-#define CHIP_HAS_TILE_DMA() 1
-
-/** Does the chip have the second revision of the directly accessible
- *  dynamic networks?  This encapsulates a number of characteristics,
- *  including the absence of the catch-all, the absence of inline message
- *  tags, the absence of support for network context-switching, and so on.
- */
-#define CHIP_HAS_REV1_XDN() 0
-
-/** Does the chip have cmpexch and similar (fetchadd, exch, etc.)? */
-#define CHIP_HAS_CMPEXCH() 0
-
-/** Does the chip have memory-mapped I/O support? */
-#define CHIP_HAS_MMIO() 0
-
-/** Does the chip have post-completion interrupts? */
-#define CHIP_HAS_POST_COMPLETION_INTERRUPTS() 0
-
-/** Does the chip have native single step support? */
-#define CHIP_HAS_SINGLE_STEP() 0
-
-#ifndef __OPEN_SOURCE__  /* features only relevant to hypervisor-level code */
-
-/** How many entries are present in the instruction TLB? */
-#define CHIP_ITLB_ENTRIES() 8
-
-/** How many entries are present in the data TLB? */
-#define CHIP_DTLB_ENTRIES() 16
-
-/** How many MAF entries does the XAUI shim have? */
-#define CHIP_XAUI_MAF_ENTRIES() 16
-
-/** Does the memory shim have a source-id table? */
-#define CHIP_HAS_MSHIM_SRCID_TABLE() 1
-
-/** Does the L1 instruction cache clear on reset? */
-#define CHIP_HAS_L1I_CLEAR_ON_RESET() 0
-
-/** Does the chip come out of reset with valid coordinates on all tiles?
- * Note that if defined, this also implies that the upper left is 1,1.
- */
-#define CHIP_HAS_VALID_TILE_COORD_RESET() 0
-
-/** Does the chip have unified packet formats? */
-#define CHIP_HAS_UNIFIED_PACKET_FORMATS() 0
-
-/** Does the chip support write reordering? */
-#define CHIP_HAS_WRITE_REORDERING() 0
-
-/** Does the chip support Y-X routing as well as X-Y? */
-#define CHIP_HAS_Y_X_ROUTING() 0
-
-/** Is INTCTRL_3 managed with the correct MPL? */
-#define CHIP_HAS_INTCTRL_3_STATUS_FIX() 0
-
-/** Is it possible to configure the chip to be big-endian? */
-#define CHIP_HAS_BIG_ENDIAN_CONFIG() 0
-
-/** Is the CACHE_RED_WAY_OVERRIDDEN SPR supported? */
-#define CHIP_HAS_CACHE_RED_WAY_OVERRIDDEN() 0
-
-/** Is the DIAG_TRACE_WAY SPR supported? */
-#define CHIP_HAS_DIAG_TRACE_WAY() 0
-
-/** Is the MEM_STRIPE_CONFIG SPR supported? */
-#define CHIP_HAS_MEM_STRIPE_CONFIG() 0
-
-/** Are the TLB_PERF SPRs supported? */
-#define CHIP_HAS_TLB_PERF() 0
-
-/** Is the VDN_SNOOP_SHIM_CTL SPR supported? */
-#define CHIP_HAS_VDN_SNOOP_SHIM_CTL() 0
-
-/** Does the chip support rev1 DMA packets? */
-#define CHIP_HAS_REV1_DMA_PACKETS() 0
-
-/** Does the chip have an IPI shim? */
-#define CHIP_HAS_IPI() 0
-
-#endif /* !__OPEN_SOURCE__ */
-#endif /* __ARCH_CHIP_H__ */
diff --git a/arch/tile/include/uapi/arch/opcode_tilegx.h b/arch/tile/include/uapi/arch/opcode_tilegx.h
index c14d02c81600..d76ff2db745e 100644
--- a/arch/tile/include/uapi/arch/opcode_tilegx.h
+++ b/arch/tile/include/uapi/arch/opcode_tilegx.h
@@ -61,6 +61,7 @@ typedef tilegx_bundle_bits tile_bundle_bits;
 #define TILE_BUNDLE_ALIGNMENT_IN_BYTES TILEGX_BUNDLE_ALIGNMENT_IN_BYTES
 #define TILE_LOG2_BUNDLE_ALIGNMENT_IN_BYTES \
   TILEGX_LOG2_BUNDLE_ALIGNMENT_IN_BYTES
+#define TILE_BPT_BUNDLE TILEGX_BPT_BUNDLE
 
 /* 64-bit pattern for a { bpt ; nop } bundle. */
 #define TILEGX_BPT_BUNDLE 0x286a44ae51485000ULL
diff --git a/arch/tile/include/uapi/arch/opcode_tilepro.h b/arch/tile/include/uapi/arch/opcode_tilepro.h
index 71b763b8ce83..4451cff1a861 100644
--- a/arch/tile/include/uapi/arch/opcode_tilepro.h
+++ b/arch/tile/include/uapi/arch/opcode_tilepro.h
@@ -71,6 +71,7 @@ typedef tilepro_bundle_bits tile_bundle_bits;
 #define TILE_BUNDLE_ALIGNMENT_IN_BYTES TILEPRO_BUNDLE_ALIGNMENT_IN_BYTES
 #define TILE_LOG2_BUNDLE_ALIGNMENT_IN_BYTES \
   TILEPRO_LOG2_BUNDLE_ALIGNMENT_IN_BYTES
+#define TILE_BPT_BUNDLE TILEPRO_BPT_BUNDLE
 
 /* 64-bit pattern for a { bpt ; nop } bundle. */
 #define TILEPRO_BPT_BUNDLE 0x400b3cae70166000ULL
diff --git a/arch/tile/include/uapi/arch/spr_def_32.h b/arch/tile/include/uapi/arch/spr_def_32.h
index c689446e6284..78daa3146d25 100644
--- a/arch/tile/include/uapi/arch/spr_def_32.h
+++ b/arch/tile/include/uapi/arch/spr_def_32.h
@@ -200,8 +200,6 @@
 #define SPR_SIM_CONTROL 0x4e0c
 #define SPR_SNCTL 0x0805
 #define SPR_SNCTL__FRZFABRIC_MASK  0x1
-#define SPR_SNCTL__FRZPROC_MASK  0x2
-#define SPR_SNPC 0x080b
 #define SPR_SNSTATIC 0x080c
 #define SPR_SYSTEM_SAVE_0_0 0x4b00
 #define SPR_SYSTEM_SAVE_0_1 0x4b01
diff --git a/arch/tile/include/uapi/asm/auxvec.h b/arch/tile/include/uapi/asm/auxvec.h
index 1d393edb0641..c93e92709f14 100644
--- a/arch/tile/include/uapi/asm/auxvec.h
+++ b/arch/tile/include/uapi/asm/auxvec.h
@@ -15,6 +15,7 @@
 #ifndef _ASM_TILE_AUXVEC_H
 #define _ASM_TILE_AUXVEC_H
 
-/* No extensions to auxvec */
+/* The vDSO location. */
+#define AT_SYSINFO_EHDR         33
 
 #endif /* _ASM_TILE_AUXVEC_H */
diff --git a/arch/tile/include/uapi/asm/cachectl.h b/arch/tile/include/uapi/asm/cachectl.h
index af4c9f9154d1..572ddcad2090 100644
--- a/arch/tile/include/uapi/asm/cachectl.h
+++ b/arch/tile/include/uapi/asm/cachectl.h
@@ -29,8 +29,8 @@
  * to honor the arguments at some point.)
  *
  * Flush and invalidation of memory can normally be performed with the
- * __insn_flush(), __insn_inv(), and __insn_finv() instructions from
- * userspace.  The DCACHE option to the system call allows userspace
+ * __insn_flush() and __insn_finv() instructions from userspace.
+ * The DCACHE option to the system call allows userspace
  * to flush the entire L1+L2 data cache from the core.  In this case,
  * the address and length arguments are not used.  The DCACHE flush is
  * restricted to the current core, not all cores in the address space.
diff --git a/arch/tile/kernel/Makefile b/arch/tile/kernel/Makefile
index 5334be8e2538..27a2bf39dae8 100644
--- a/arch/tile/kernel/Makefile
+++ b/arch/tile/kernel/Makefile
@@ -3,11 +3,17 @@
 #
 
 extra-y := vmlinux.lds head_$(BITS).o
-obj-y := backtrace.o entry.o irq.o messaging.o \
+obj-y := backtrace.o entry.o hvglue.o irq.o messaging.o \
 	pci-dma.o proc.o process.o ptrace.o reboot.o \
-	setup.o signal.o single_step.o stack.o sys.o sysfs.o time.o traps.o \
+	setup.o signal.o single_step.o stack.o sys.o \
+	sysfs.o time.o traps.o unaligned.o vdso.o \
 	intvec_$(BITS).o regs_$(BITS).o tile-desc_$(BITS).o
 
+ifdef CONFIG_FUNCTION_TRACER
+CFLAGS_REMOVE_ftrace.o = -pg
+CFLAGS_REMOVE_early_printk.o = -pg
+endif
+
 obj-$(CONFIG_HARDWALL)		+= hardwall.o
 obj-$(CONFIG_COMPAT)		+= compat.o compat_signal.o
 obj-$(CONFIG_SMP)		+= smpboot.o smp.o tlb.o
@@ -20,3 +26,9 @@ else
 obj-$(CONFIG_PCI)		+= pci.o
 endif
 obj-$(CONFIG_TILE_USB)		+= usb.o
+obj-$(CONFIG_TILE_HVGLUE_TRACE)	+= hvglue_trace.o
+obj-$(CONFIG_FUNCTION_TRACER)	+= ftrace.o mcount_64.o
+obj-$(CONFIG_KPROBES)		+= kprobes.o
+obj-$(CONFIG_KGDB)		+= kgdb.o
+
+obj-y				+= vdso/
diff --git a/arch/tile/kernel/asm-offsets.c b/arch/tile/kernel/asm-offsets.c
index 01ddf19cc36d..375e7c321eef 100644
--- a/arch/tile/kernel/asm-offsets.c
+++ b/arch/tile/kernel/asm-offsets.c
@@ -14,13 +14,6 @@
  * Generates definitions from c-type structures used by assembly sources.
  */
 
-#include <linux/kbuild.h>
-#include <linux/thread_info.h>
-#include <linux/sched.h>
-#include <linux/hardirq.h>
-#include <linux/ptrace.h>
-#include <hv/hypervisor.h>
-
 /* Check for compatible compiler early in the build. */
 #ifdef CONFIG_TILEGX
 # ifndef __tilegx__
@@ -31,46 +24,61 @@
 # endif
 #else
 # ifdef __tilegx__
-#  error Can not build TILEPro/TILE64 configurations with tilegx compiler
+#  error Can not build TILEPro configurations with tilegx compiler
 # endif
 #endif
 
+#include <linux/kbuild.h>
+#include <linux/thread_info.h>
+#include <linux/sched.h>
+#include <linux/hardirq.h>
+#include <linux/ptrace.h>
+#include <hv/hypervisor.h>
+
 void foo(void)
 {
-	DEFINE(SINGLESTEP_STATE_BUFFER_OFFSET, \
+	DEFINE(SINGLESTEP_STATE_BUFFER_OFFSET,
 	       offsetof(struct single_step_state, buffer));
-	DEFINE(SINGLESTEP_STATE_FLAGS_OFFSET, \
+	DEFINE(SINGLESTEP_STATE_FLAGS_OFFSET,
 	       offsetof(struct single_step_state, flags));
-	DEFINE(SINGLESTEP_STATE_ORIG_PC_OFFSET, \
+	DEFINE(SINGLESTEP_STATE_ORIG_PC_OFFSET,
 	       offsetof(struct single_step_state, orig_pc));
-	DEFINE(SINGLESTEP_STATE_NEXT_PC_OFFSET, \
+	DEFINE(SINGLESTEP_STATE_NEXT_PC_OFFSET,
 	       offsetof(struct single_step_state, next_pc));
-	DEFINE(SINGLESTEP_STATE_BRANCH_NEXT_PC_OFFSET, \
+	DEFINE(SINGLESTEP_STATE_BRANCH_NEXT_PC_OFFSET,
 	       offsetof(struct single_step_state, branch_next_pc));
-	DEFINE(SINGLESTEP_STATE_UPDATE_VALUE_OFFSET, \
+	DEFINE(SINGLESTEP_STATE_UPDATE_VALUE_OFFSET,
 	       offsetof(struct single_step_state, update_value));
 
-	DEFINE(THREAD_INFO_TASK_OFFSET, \
+	DEFINE(THREAD_INFO_TASK_OFFSET,
 	       offsetof(struct thread_info, task));
-	DEFINE(THREAD_INFO_FLAGS_OFFSET, \
+	DEFINE(THREAD_INFO_FLAGS_OFFSET,
 	       offsetof(struct thread_info, flags));
-	DEFINE(THREAD_INFO_STATUS_OFFSET, \
+	DEFINE(THREAD_INFO_STATUS_OFFSET,
 	       offsetof(struct thread_info, status));
-	DEFINE(THREAD_INFO_HOMECACHE_CPU_OFFSET, \
+	DEFINE(THREAD_INFO_HOMECACHE_CPU_OFFSET,
 	       offsetof(struct thread_info, homecache_cpu));
-	DEFINE(THREAD_INFO_STEP_STATE_OFFSET, \
+	DEFINE(THREAD_INFO_PREEMPT_COUNT_OFFSET,
+	       offsetof(struct thread_info, preempt_count));
+	DEFINE(THREAD_INFO_STEP_STATE_OFFSET,
 	       offsetof(struct thread_info, step_state));
+#ifdef __tilegx__
+	DEFINE(THREAD_INFO_UNALIGN_JIT_BASE_OFFSET,
+	       offsetof(struct thread_info, unalign_jit_base));
+	DEFINE(THREAD_INFO_UNALIGN_JIT_TMP_OFFSET,
+	       offsetof(struct thread_info, unalign_jit_tmp));
+#endif
 
 	DEFINE(TASK_STRUCT_THREAD_KSP_OFFSET,
 	       offsetof(struct task_struct, thread.ksp));
 	DEFINE(TASK_STRUCT_THREAD_PC_OFFSET,
 	       offsetof(struct task_struct, thread.pc));
 
-	DEFINE(HV_TOPOLOGY_WIDTH_OFFSET, \
+	DEFINE(HV_TOPOLOGY_WIDTH_OFFSET,
 	       offsetof(HV_Topology, width));
-	DEFINE(HV_TOPOLOGY_HEIGHT_OFFSET, \
+	DEFINE(HV_TOPOLOGY_HEIGHT_OFFSET,
 	       offsetof(HV_Topology, height));
 
-	DEFINE(IRQ_CPUSTAT_SYSCALL_COUNT_OFFSET, \
+	DEFINE(IRQ_CPUSTAT_SYSCALL_COUNT_OFFSET,
 	       offsetof(irq_cpustat_t, irq_syscall_count));
 }
diff --git a/arch/tile/kernel/compat_signal.c b/arch/tile/kernel/compat_signal.c
index d0a052e725be..85e00b2f39bf 100644
--- a/arch/tile/kernel/compat_signal.c
+++ b/arch/tile/kernel/compat_signal.c
@@ -32,6 +32,7 @@
 #include <asm/ucontext.h>
 #include <asm/sigframe.h>
 #include <asm/syscalls.h>
+#include <asm/vdso.h>
 #include <arch/interrupts.h>
 
 struct compat_ucontext {
@@ -227,7 +228,7 @@ int compat_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 	if (err)
 		goto give_sigsegv;
 
-	restorer = VDSO_BASE;
+	restorer = VDSO_SYM(&__vdso_rt_sigreturn);
 	if (ka->sa.sa_flags & SA_RESTORER)
 		restorer = ptr_to_compat_reg(ka->sa.sa_restorer);
 
diff --git a/arch/tile/kernel/early_printk.c b/arch/tile/kernel/early_printk.c
index 34d72a151bf3..b608e00e7f6d 100644
--- a/arch/tile/kernel/early_printk.c
+++ b/arch/tile/kernel/early_printk.c
@@ -23,19 +23,24 @@
 
 static void early_hv_write(struct console *con, const char *s, unsigned n)
 {
-	hv_console_write((HV_VirtAddr) s, n);
+	tile_console_write(s, n);
+
+	/*
+	 * Convert NL to NLCR (close enough to CRNL) during early boot.
+	 * We assume newlines are at the ends of strings, which turns out
+	 * to be good enough for early boot console output.
+	 */
+	if (n && s[n-1] == '\n')
+		tile_console_write("\r", 1);
 }
 
 static struct console early_hv_console = {
 	.name =		"earlyhv",
 	.write =	early_hv_write,
-	.flags =	CON_PRINTBUFFER,
+	.flags =	CON_PRINTBUFFER | CON_BOOT,
 	.index =	-1,
 };
 
-/* Direct interface for emergencies */
-static int early_console_complete;
-
 void early_panic(const char *fmt, ...)
 {
 	va_list ap;
@@ -43,51 +48,21 @@ void early_panic(const char *fmt, ...)
 	va_start(ap, fmt);
 	early_printk("Kernel panic - not syncing: ");
 	early_vprintk(fmt, ap);
-	early_console->write(early_console, "\n", 1);
+	early_printk("\n");
 	va_end(ap);
 	dump_stack();
 	hv_halt();
 }
 
-static int __initdata keep_early;
-
 static int __init setup_early_printk(char *str)
 {
 	if (early_console)
 		return 1;
 
-	if (str != NULL && strncmp(str, "keep", 4) == 0)
-		keep_early = 1;
-
 	early_console = &early_hv_console;
 	register_console(early_console);
 
 	return 0;
 }
 
-void __init disable_early_printk(void)
-{
-	early_console_complete = 1;
-	if (!early_console)
-		return;
-	if (!keep_early) {
-		early_printk("disabling early console\n");
-		unregister_console(early_console);
-		early_console = NULL;
-	} else {
-		early_printk("keeping early console\n");
-	}
-}
-
-void warn_early_printk(void)
-{
-	if (early_console_complete || early_console)
-		return;
-	early_printk("\
-Machine shutting down before console output is fully initialized.\n\
-You may wish to reboot and add the option 'earlyprintk' to your\n\
-boot command line to see any diagnostic early console output.\n\
-");
-}
-
 early_param("earlyprintk", setup_early_printk);
diff --git a/arch/tile/kernel/entry.S b/arch/tile/kernel/entry.S
index f116cb0bce20..3d9175992a20 100644
--- a/arch/tile/kernel/entry.S
+++ b/arch/tile/kernel/entry.S
@@ -27,22 +27,6 @@ STD_ENTRY(current_text_addr)
 	{ move r0, lr; jrp lr }
 	STD_ENDPROC(current_text_addr)
 
-/*
- * We don't run this function directly, but instead copy it to a page
- * we map into every user process.  See vdso_setup().
- *
- * Note that libc has a copy of this function that it uses to compare
- * against the PC when a stack backtrace ends, so if this code is
- * changed, the libc implementation(s) should also be updated.
- */
-	.pushsection .data
-ENTRY(__rt_sigreturn)
-	moveli TREG_SYSCALL_NR_NAME,__NR_rt_sigreturn
-	swint1
-	ENDPROC(__rt_sigreturn)
-	ENTRY(__rt_sigreturn_end)
-	.popsection
-
 STD_ENTRY(dump_stack)
 	{ move r2, lr; lnk r1 }
 	{ move r4, r52; addli r1, r1, dump_stack - . }
diff --git a/arch/tile/kernel/ftrace.c b/arch/tile/kernel/ftrace.c
new file mode 100644
index 000000000000..f1c452092eeb
--- /dev/null
+++ b/arch/tile/kernel/ftrace.c
@@ -0,0 +1,246 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ * TILE-Gx specific ftrace support
+ */
+
+#include <linux/ftrace.h>
+#include <linux/uaccess.h>
+
+#include <asm/cacheflush.h>
+#include <asm/ftrace.h>
+#include <asm/sections.h>
+
+#include <arch/opcode.h>
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+static inline tilegx_bundle_bits NOP(void)
+{
+	return create_UnaryOpcodeExtension_X0(FNOP_UNARY_OPCODE_X0) |
+		create_RRROpcodeExtension_X0(UNARY_RRR_0_OPCODE_X0) |
+		create_Opcode_X0(RRR_0_OPCODE_X0) |
+		create_UnaryOpcodeExtension_X1(NOP_UNARY_OPCODE_X1) |
+		create_RRROpcodeExtension_X1(UNARY_RRR_0_OPCODE_X1) |
+		create_Opcode_X1(RRR_0_OPCODE_X1);
+}
+
+static int machine_stopped __read_mostly;
+
+int ftrace_arch_code_modify_prepare(void)
+{
+	machine_stopped = 1;
+	return 0;
+}
+
+int ftrace_arch_code_modify_post_process(void)
+{
+	flush_icache_range(0, CHIP_L1I_CACHE_SIZE());
+	machine_stopped = 0;
+	return 0;
+}
+
+/*
+ * Put { move r10, lr; jal ftrace_caller } in a bundle, this lets dynamic
+ * tracer just add one cycle overhead to every kernel function when disabled.
+ */
+static unsigned long ftrace_gen_branch(unsigned long pc, unsigned long addr,
+				       bool link)
+{
+	tilegx_bundle_bits opcode_x0, opcode_x1;
+	long pcrel_by_instr = (addr - pc) >> TILEGX_LOG2_BUNDLE_SIZE_IN_BYTES;
+
+	if (link) {
+		/* opcode: jal addr */
+		opcode_x1 =
+			create_Opcode_X1(JUMP_OPCODE_X1) |
+			create_JumpOpcodeExtension_X1(JAL_JUMP_OPCODE_X1) |
+			create_JumpOff_X1(pcrel_by_instr);
+	} else {
+		/* opcode: j addr */
+		opcode_x1 =
+			create_Opcode_X1(JUMP_OPCODE_X1) |
+			create_JumpOpcodeExtension_X1(J_JUMP_OPCODE_X1) |
+			create_JumpOff_X1(pcrel_by_instr);
+	}
+
+	if (addr == FTRACE_ADDR) {
+		/* opcode: or r10, lr, zero */
+		opcode_x0 =
+			create_Dest_X0(10) |
+			create_SrcA_X0(TREG_LR) |
+			create_SrcB_X0(TREG_ZERO) |
+			create_RRROpcodeExtension_X0(OR_RRR_0_OPCODE_X0) |
+			create_Opcode_X0(RRR_0_OPCODE_X0);
+	} else {
+		/* opcode: fnop */
+		opcode_x0 =
+			create_UnaryOpcodeExtension_X0(FNOP_UNARY_OPCODE_X0) |
+			create_RRROpcodeExtension_X0(UNARY_RRR_0_OPCODE_X0) |
+			create_Opcode_X0(RRR_0_OPCODE_X0);
+	}
+
+	return opcode_x1 | opcode_x0;
+}
+
+static unsigned long ftrace_nop_replace(struct dyn_ftrace *rec)
+{
+	return NOP();
+}
+
+static unsigned long ftrace_call_replace(unsigned long pc, unsigned long addr)
+{
+	return ftrace_gen_branch(pc, addr, true);
+}
+
+static int ftrace_modify_code(unsigned long pc, unsigned long old,
+			      unsigned long new)
+{
+	unsigned long pc_wr;
+
+	/* Check if the address is in kernel text space and module space. */
+	if (!kernel_text_address(pc))
+		return -EINVAL;
+
+	/* Operate on writable kernel text mapping. */
+	pc_wr = pc - MEM_SV_START + PAGE_OFFSET;
+
+	if (probe_kernel_write((void *)pc_wr, &new, MCOUNT_INSN_SIZE))
+		return -EPERM;
+
+	smp_wmb();
+
+	if (!machine_stopped && num_online_cpus() > 1)
+		flush_icache_range(pc, pc + MCOUNT_INSN_SIZE);
+
+	return 0;
+}
+
+int ftrace_update_ftrace_func(ftrace_func_t func)
+{
+	unsigned long pc, old;
+	unsigned long new;
+	int ret;
+
+	pc = (unsigned long)&ftrace_call;
+	memcpy(&old, &ftrace_call, MCOUNT_INSN_SIZE);
+	new = ftrace_call_replace(pc, (unsigned long)func);
+
+	ret = ftrace_modify_code(pc, old, new);
+
+	return ret;
+}
+
+int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned long new, old;
+	unsigned long ip = rec->ip;
+
+	old = ftrace_nop_replace(rec);
+	new = ftrace_call_replace(ip, addr);
+
+	return ftrace_modify_code(rec->ip, old, new);
+}
+
+int ftrace_make_nop(struct module *mod,
+		    struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned long ip = rec->ip;
+	unsigned long old;
+	unsigned long new;
+	int ret;
+
+	old = ftrace_call_replace(ip, addr);
+	new = ftrace_nop_replace(rec);
+	ret = ftrace_modify_code(ip, old, new);
+
+	return ret;
+}
+
+int __init ftrace_dyn_arch_init(void *data)
+{
+	*(unsigned long *)data = 0;
+
+	return 0;
+}
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
+			   unsigned long frame_pointer)
+{
+	unsigned long return_hooker = (unsigned long) &return_to_handler;
+	struct ftrace_graph_ent trace;
+	unsigned long old;
+	int err;
+
+	if (unlikely(atomic_read(&current->tracing_graph_pause)))
+		return;
+
+	old = *parent;
+	*parent = return_hooker;
+
+	err = ftrace_push_return_trace(old, self_addr, &trace.depth,
+				       frame_pointer);
+	if (err == -EBUSY) {
+		*parent = old;
+		return;
+	}
+
+	trace.func = self_addr;
+
+	/* Only trace if the calling function expects to */
+	if (!ftrace_graph_entry(&trace)) {
+		current->curr_ret_stack--;
+		*parent = old;
+	}
+}
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+extern unsigned long ftrace_graph_call;
+
+static int __ftrace_modify_caller(unsigned long *callsite,
+				  void (*func) (void), bool enable)
+{
+	unsigned long caller_fn = (unsigned long) func;
+	unsigned long pc = (unsigned long) callsite;
+	unsigned long branch = ftrace_gen_branch(pc, caller_fn, false);
+	unsigned long nop = NOP();
+	unsigned long old = enable ? nop : branch;
+	unsigned long new = enable ? branch : nop;
+
+	return ftrace_modify_code(pc, old, new);
+}
+
+static int ftrace_modify_graph_caller(bool enable)
+{
+	int ret;
+
+	ret = __ftrace_modify_caller(&ftrace_graph_call,
+				     ftrace_graph_caller,
+				     enable);
+
+	return ret;
+}
+
+int ftrace_enable_ftrace_graph_caller(void)
+{
+	return ftrace_modify_graph_caller(true);
+}
+
+int ftrace_disable_ftrace_graph_caller(void)
+{
+	return ftrace_modify_graph_caller(false);
+}
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/tile/kernel/hardwall.c b/arch/tile/kernel/hardwall.c
index 38ac189d9575..df27a1fd94a3 100644
--- a/arch/tile/kernel/hardwall.c
+++ b/arch/tile/kernel/hardwall.c
@@ -272,9 +272,9 @@ static void hardwall_setup_func(void *info)
 	struct hardwall_info *r = info;
 	struct hardwall_type *hwt = r->type;
 
-	int cpu = smp_processor_id();
-	int x = cpu % smp_width;
-	int y = cpu / smp_width;
+	int cpu = smp_processor_id();  /* on_each_cpu disables preemption */
+	int x = cpu_x(cpu);
+	int y = cpu_y(cpu);
 	int bits = 0;
 	if (x == r->ulhc_x)
 		bits |= W_PROTECT;
@@ -317,6 +317,7 @@ static void hardwall_protect_rectangle(struct hardwall_info *r)
 	on_each_cpu_mask(&rect_cpus, hardwall_setup_func, r, 1);
 }
 
+/* Entered from INT_xDN_FIREWALL interrupt vector with irqs disabled. */
 void __kprobes do_hardwall_trap(struct pt_regs* regs, int fault_num)
 {
 	struct hardwall_info *rect;
@@ -325,7 +326,6 @@ void __kprobes do_hardwall_trap(struct pt_regs* regs, int fault_num)
 	struct siginfo info;
 	int cpu = smp_processor_id();
 	int found_processes;
-	unsigned long flags;
 	struct pt_regs *old_regs = set_irq_regs(regs);
 
 	irq_enter();
@@ -346,7 +346,7 @@ void __kprobes do_hardwall_trap(struct pt_regs* regs, int fault_num)
 	BUG_ON(hwt->disabled);
 
 	/* This tile trapped a network access; find the rectangle. */
-	spin_lock_irqsave(&hwt->lock, flags);
+	spin_lock(&hwt->lock);
 	list_for_each_entry(rect, &hwt->list, list) {
 		if (cpumask_test_cpu(cpu, &rect->cpumask))
 			break;
@@ -401,7 +401,7 @@ void __kprobes do_hardwall_trap(struct pt_regs* regs, int fault_num)
 		pr_notice("hardwall: no associated processes!\n");
 
  done:
-	spin_unlock_irqrestore(&hwt->lock, flags);
+	spin_unlock(&hwt->lock);
 
 	/*
 	 * We have to disable firewall interrupts now, or else when we
@@ -540,6 +540,14 @@ static struct hardwall_info *hardwall_create(struct hardwall_type *hwt,
 		}
 	}
 
+	/*
+	 * Eliminate cpus that are not part of this Linux client.
+	 * Note that this allows for configurations that we might not want to
+	 * support, such as one client on every even cpu, another client on
+	 * every odd cpu.
+	 */
+	cpumask_and(&info->cpumask, &info->cpumask, cpu_online_mask);
+
 	/* Confirm it doesn't overlap and add it to the list. */
 	spin_lock_irqsave(&hwt->lock, flags);
 	list_for_each_entry(iter, &hwt->list, list) {
@@ -612,7 +620,7 @@ static int hardwall_activate(struct hardwall_info *info)
 
 /*
  * Deactivate a task's hardwall.  Must hold lock for hardwall_type.
- * This method may be called from free_task(), so we don't want to
+ * This method may be called from exit_thread(), so we don't want to
  * rely on too many fields of struct task_struct still being valid.
  * We assume the cpus_allowed, pid, and comm fields are still valid.
  */
@@ -653,7 +661,7 @@ static int hardwall_deactivate(struct hardwall_type *hwt,
 		return -EINVAL;
 
 	printk(KERN_DEBUG "Pid %d (%s) deactivated for %s hardwall: cpu %d\n",
-	       task->pid, task->comm, hwt->name, smp_processor_id());
+	       task->pid, task->comm, hwt->name, raw_smp_processor_id());
 	return 0;
 }
 
@@ -795,8 +803,8 @@ static void reset_xdn_network_state(struct hardwall_type *hwt)
 	/* Reset UDN coordinates to their standard value */
 	{
 		unsigned int cpu = smp_processor_id();
-		unsigned int x = cpu % smp_width;
-		unsigned int y = cpu / smp_width;
+		unsigned int x = cpu_x(cpu);
+		unsigned int y = cpu_y(cpu);
 		__insn_mtspr(SPR_UDN_TILE_COORD, (x << 18) | (y << 7));
 	}
 
diff --git a/arch/tile/kernel/head_32.S b/arch/tile/kernel/head_32.S
index ac115307e5e4..8d5b40ff2922 100644
--- a/arch/tile/kernel/head_32.S
+++ b/arch/tile/kernel/head_32.S
@@ -39,12 +39,12 @@ ENTRY(_start)
 	}
 	{
 	  moveli r0, _HV_VERSION_OLD_HV_INIT
-	  jal hv_init
+	  jal _hv_init
 	}
 	/* Get a reasonable default ASID in r0 */
 	{
 	  move r0, zero
-	  jal hv_inquire_asid
+	  jal _hv_inquire_asid
 	}
 	/* Install the default page table */
 	{
@@ -64,7 +64,7 @@ ENTRY(_start)
 	  auli r0, r0, ha16(swapper_pg_dir - PAGE_OFFSET)
 	}
 	{
-	  inv r6
+	  finv r6
 	  move r1, zero   /* high 32 bits of CPA is zero */
 	}
 	{
@@ -73,12 +73,12 @@ ENTRY(_start)
 	}
 	{
 	  auli lr, lr, ha16(1f)
-	  j hv_install_context
+	  j _hv_install_context
 	}
 1:
 
 	/* Get our processor number and save it away in SAVE_K_0. */
-	jal hv_inquire_topology
+	jal _hv_inquire_topology
 	mulll_uu r4, r1, r2        /* r1 == y, r2 == width */
 	add r4, r4, r0             /* r0 == x, so r4 == cpu == y*width + x */
 
@@ -86,7 +86,7 @@ ENTRY(_start)
 	/*
 	 * Load up our per-cpu offset.  When the first (master) tile
 	 * boots, this value is still zero, so we will load boot_pc
-	 * with start_kernel, and boot_sp with init_stack + THREAD_SIZE.
+	 * with start_kernel, and boot_sp at the top of init_stack.
 	 * The master tile initializes the per-cpu offset array, so that
 	 * when subsequent (secondary) tiles boot, they will instead load
 	 * from their per-cpu versions of boot_sp and boot_pc.
@@ -126,7 +126,6 @@ ENTRY(_start)
 	lw sp, r1
 	or r4, sp, r4
 	mtspr SPR_SYSTEM_SAVE_K_0, r4  /* save ksp0 + cpu */
-	addi sp, sp, -STACK_TOP_DELTA
 	{
 	  move lr, zero   /* stop backtraces in the called function */
 	  jr r0
@@ -163,8 +162,8 @@ ENTRY(swapper_pg_dir)
 	.set addr, addr + PGDIR_SIZE
 	.endr
 
-	/* The true text VAs are mapped as VA = PA + MEM_SV_INTRPT */
-	PTE MEM_SV_INTRPT, 0, (1 << (HV_PTE_INDEX_READABLE - 32)) | \
+	/* The true text VAs are mapped as VA = PA + MEM_SV_START */
+	PTE MEM_SV_START, 0, (1 << (HV_PTE_INDEX_READABLE - 32)) | \
 			      (1 << (HV_PTE_INDEX_EXECUTABLE - 32))
 	.org swapper_pg_dir + PGDIR_SIZE
 	END(swapper_pg_dir)
diff --git a/arch/tile/kernel/head_64.S b/arch/tile/kernel/head_64.S
index 6093964fa5c7..bd0e12f283f3 100644
--- a/arch/tile/kernel/head_64.S
+++ b/arch/tile/kernel/head_64.S
@@ -25,6 +25,15 @@
 #include <arch/chip.h>
 #include <arch/spr_def.h>
 
+/* Extract two 32-bit bit values that were read into one register. */
+#ifdef __BIG_ENDIAN__
+#define GET_FIRST_INT(rd, rs) shrsi rd, rs, 32
+#define GET_SECOND_INT(rd, rs) addxi rd, rs, 0
+#else
+#define GET_FIRST_INT(rd, rs) addxi rd, rs, 0
+#define GET_SECOND_INT(rd, rs) shrsi rd, rs, 32
+#endif
+
 /*
  * This module contains the entry code for kernel images. It performs the
  * minimal setup needed to call the generic C routines.
@@ -46,11 +55,11 @@ ENTRY(_start)
 	  movei r2, TILE_CHIP_REV
 	  movei r3, KERNEL_PL
 	}
-	jal hv_init
+	jal _hv_init
 	/* Get a reasonable default ASID in r0 */
 	{
 	  move r0, zero
-	  jal hv_inquire_asid
+	  jal _hv_inquire_asid
 	}
 
 	/*
@@ -61,7 +70,7 @@ ENTRY(_start)
 	 * other CPUs should see a properly-constructed page table.
 	 */
 	{
-	  v4int_l r2, zero, r0    /* ASID for hv_install_context */
+	  GET_FIRST_INT(r2, r0)    /* ASID for hv_install_context */
 	  moveli r4, hw1_last(swapper_pgprot - PAGE_OFFSET)
 	}
 	{
@@ -77,7 +86,7 @@ ENTRY(_start)
 	{
 	  /* After initializing swapper_pgprot, HV_PTE_GLOBAL is set. */
 	  bfextu r7, r1, HV_PTE_INDEX_GLOBAL, HV_PTE_INDEX_GLOBAL
-	  inv r4
+	  finv r4
 	}
 	bnez r7, .Lno_write
 	{
@@ -121,29 +130,24 @@ ENTRY(_start)
 	}
 	{
 	  moveli r3, CTX_PAGE_FLAG
-	  j hv_install_context
+	  j _hv_install_context
 	}
 1:
 
 	/* Install the interrupt base. */
-	moveli r0, hw2_last(MEM_SV_START)
-	shl16insli r0, r0, hw1(MEM_SV_START)
-	shl16insli r0, r0, hw0(MEM_SV_START)
+	moveli r0, hw2_last(intrpt_start)
+	shl16insli r0, r0, hw1(intrpt_start)
+	shl16insli r0, r0, hw0(intrpt_start)
 	mtspr SPR_INTERRUPT_VECTOR_BASE_K, r0
 
-	/*
-	 * Get our processor number and save it away in SAVE_K_0.
-	 * Extract stuff from the topology structure: r4 = y, r6 = x,
-	 * r5 = width.  FIXME: consider whether we want to just make these
-	 * 64-bit values (and if so fix smp_topology write below, too).
-	 */
-	jal hv_inquire_topology
+	/* Get our processor number and save it away in SAVE_K_0. */
+	jal _hv_inquire_topology
 	{
-	  v4int_l r5, zero, r1    /* r5 = width */
-	  shrui r4, r0, 32        /* r4 = y */
+	  GET_FIRST_INT(r5, r1)   /* r5 = width */
+	  GET_SECOND_INT(r4, r0)  /* r4 = y */
 	}
 	{
-	  v4int_l r6, zero, r0    /* r6 = x */
+	  GET_FIRST_INT(r6, r0)   /* r6 = x */
 	  mul_lu_lu r4, r4, r5
 	}
 	{
@@ -154,7 +158,7 @@ ENTRY(_start)
 	/*
 	 * Load up our per-cpu offset.  When the first (master) tile
 	 * boots, this value is still zero, so we will load boot_pc
-	 * with start_kernel, and boot_sp with init_stack + THREAD_SIZE.
+	 * with start_kernel, and boot_sp with at the top of init_stack.
 	 * The master tile initializes the per-cpu offset array, so that
 	 * when subsequent (secondary) tiles boot, they will instead load
 	 * from their per-cpu versions of boot_sp and boot_pc.
@@ -198,9 +202,9 @@ ENTRY(_start)
 	}
 	ld r0, r0
 	ld sp, r1
-	or r4, sp, r4
+	shli r4, r4, CPU_SHIFT
+	bfins r4, sp, 0, CPU_SHIFT-1
 	mtspr SPR_SYSTEM_SAVE_K_0, r4  /* save ksp0 + cpu */
-	addi sp, sp, -STACK_TOP_DELTA
 	{
 	  move lr, zero   /* stop backtraces in the called function */
 	  jr r0
diff --git a/arch/tile/kernel/hvglue.S b/arch/tile/kernel/hvglue.S
new file mode 100644
index 000000000000..2ab456622391
--- /dev/null
+++ b/arch/tile/kernel/hvglue.S
@@ -0,0 +1,74 @@
+/* Hypervisor call vector addresses; see <hv/hypervisor.h> */
+.macro gensym sym, val, size
+.org \val
+.global _\sym
+.type _\sym,function
+_\sym:
+.size _\sym,\size
+#ifndef CONFIG_TILE_HVGLUE_TRACE
+.globl \sym
+.set \sym,_\sym
+#endif
+.endm
+
+.section .hvglue,"x",@nobits
+.align 8
+gensym hv_init, 0x20, 32
+gensym hv_install_context, 0x40, 32
+gensym hv_sysconf, 0x60, 32
+gensym hv_get_rtc, 0x80, 32
+gensym hv_set_rtc, 0xa0, 32
+gensym hv_flush_asid, 0xc0, 32
+gensym hv_flush_page, 0xe0, 32
+gensym hv_flush_pages, 0x100, 32
+gensym hv_restart, 0x120, 32
+gensym hv_halt, 0x140, 32
+gensym hv_power_off, 0x160, 32
+gensym hv_inquire_physical, 0x180, 32
+gensym hv_inquire_memory_controller, 0x1a0, 32
+gensym hv_inquire_virtual, 0x1c0, 32
+gensym hv_inquire_asid, 0x1e0, 32
+gensym hv_nanosleep, 0x200, 32
+gensym hv_console_read_if_ready, 0x220, 32
+gensym hv_console_write, 0x240, 32
+gensym hv_downcall_dispatch, 0x260, 32
+gensym hv_inquire_topology, 0x280, 32
+gensym hv_fs_findfile, 0x2a0, 32
+gensym hv_fs_fstat, 0x2c0, 32
+gensym hv_fs_pread, 0x2e0, 32
+gensym hv_physaddr_read64, 0x300, 32
+gensym hv_physaddr_write64, 0x320, 32
+gensym hv_get_command_line, 0x340, 32
+gensym hv_set_caching, 0x360, 32
+gensym hv_bzero_page, 0x380, 32
+gensym hv_register_message_state, 0x3a0, 32
+gensym hv_send_message, 0x3c0, 32
+gensym hv_receive_message, 0x3e0, 32
+gensym hv_inquire_context, 0x400, 32
+gensym hv_start_all_tiles, 0x420, 32
+gensym hv_dev_open, 0x440, 32
+gensym hv_dev_close, 0x460, 32
+gensym hv_dev_pread, 0x480, 32
+gensym hv_dev_pwrite, 0x4a0, 32
+gensym hv_dev_poll, 0x4c0, 32
+gensym hv_dev_poll_cancel, 0x4e0, 32
+gensym hv_dev_preada, 0x500, 32
+gensym hv_dev_pwritea, 0x520, 32
+gensym hv_flush_remote, 0x540, 32
+gensym hv_console_putc, 0x560, 32
+gensym hv_inquire_tiles, 0x580, 32
+gensym hv_confstr, 0x5a0, 32
+gensym hv_reexec, 0x5c0, 32
+gensym hv_set_command_line, 0x5e0, 32
+gensym hv_clear_intr, 0x600, 32
+gensym hv_enable_intr, 0x620, 32
+gensym hv_disable_intr, 0x640, 32
+gensym hv_raise_intr, 0x660, 32
+gensym hv_trigger_ipi, 0x680, 32
+gensym hv_store_mapping, 0x6a0, 32
+gensym hv_inquire_realpa, 0x6c0, 32
+gensym hv_flush_all, 0x6e0, 32
+gensym hv_get_ipi_pte, 0x700, 32
+gensym hv_set_pte_super_shift, 0x720, 32
+gensym hv_console_set_ipi, 0x7e0, 32
+gensym hv_glue_internals, 0x800, 30720
diff --git a/arch/tile/kernel/hvglue.lds b/arch/tile/kernel/hvglue.lds
deleted file mode 100644
index d44c5a67a1ed..000000000000
--- a/arch/tile/kernel/hvglue.lds
+++ /dev/null
@@ -1,59 +0,0 @@
-/* Hypervisor call vector addresses; see <hv/hypervisor.h> */
-hv_init = TEXT_OFFSET + 0x10020;
-hv_install_context = TEXT_OFFSET + 0x10040;
-hv_sysconf = TEXT_OFFSET + 0x10060;
-hv_get_rtc = TEXT_OFFSET + 0x10080;
-hv_set_rtc = TEXT_OFFSET + 0x100a0;
-hv_flush_asid = TEXT_OFFSET + 0x100c0;
-hv_flush_page = TEXT_OFFSET + 0x100e0;
-hv_flush_pages = TEXT_OFFSET + 0x10100;
-hv_restart = TEXT_OFFSET + 0x10120;
-hv_halt = TEXT_OFFSET + 0x10140;
-hv_power_off = TEXT_OFFSET + 0x10160;
-hv_inquire_physical = TEXT_OFFSET + 0x10180;
-hv_inquire_memory_controller = TEXT_OFFSET + 0x101a0;
-hv_inquire_virtual = TEXT_OFFSET + 0x101c0;
-hv_inquire_asid = TEXT_OFFSET + 0x101e0;
-hv_nanosleep = TEXT_OFFSET + 0x10200;
-hv_console_read_if_ready = TEXT_OFFSET + 0x10220;
-hv_console_write = TEXT_OFFSET + 0x10240;
-hv_downcall_dispatch = TEXT_OFFSET + 0x10260;
-hv_inquire_topology = TEXT_OFFSET + 0x10280;
-hv_fs_findfile = TEXT_OFFSET + 0x102a0;
-hv_fs_fstat = TEXT_OFFSET + 0x102c0;
-hv_fs_pread = TEXT_OFFSET + 0x102e0;
-hv_physaddr_read64 = TEXT_OFFSET + 0x10300;
-hv_physaddr_write64 = TEXT_OFFSET + 0x10320;
-hv_get_command_line = TEXT_OFFSET + 0x10340;
-hv_set_caching = TEXT_OFFSET + 0x10360;
-hv_bzero_page = TEXT_OFFSET + 0x10380;
-hv_register_message_state = TEXT_OFFSET + 0x103a0;
-hv_send_message = TEXT_OFFSET + 0x103c0;
-hv_receive_message = TEXT_OFFSET + 0x103e0;
-hv_inquire_context = TEXT_OFFSET + 0x10400;
-hv_start_all_tiles = TEXT_OFFSET + 0x10420;
-hv_dev_open = TEXT_OFFSET + 0x10440;
-hv_dev_close = TEXT_OFFSET + 0x10460;
-hv_dev_pread = TEXT_OFFSET + 0x10480;
-hv_dev_pwrite = TEXT_OFFSET + 0x104a0;
-hv_dev_poll = TEXT_OFFSET + 0x104c0;
-hv_dev_poll_cancel = TEXT_OFFSET + 0x104e0;
-hv_dev_preada = TEXT_OFFSET + 0x10500;
-hv_dev_pwritea = TEXT_OFFSET + 0x10520;
-hv_flush_remote = TEXT_OFFSET + 0x10540;
-hv_console_putc = TEXT_OFFSET + 0x10560;
-hv_inquire_tiles = TEXT_OFFSET + 0x10580;
-hv_confstr = TEXT_OFFSET + 0x105a0;
-hv_reexec = TEXT_OFFSET + 0x105c0;
-hv_set_command_line = TEXT_OFFSET + 0x105e0;
-hv_clear_intr = TEXT_OFFSET + 0x10600;
-hv_enable_intr = TEXT_OFFSET + 0x10620;
-hv_disable_intr = TEXT_OFFSET + 0x10640;
-hv_raise_intr = TEXT_OFFSET + 0x10660;
-hv_trigger_ipi = TEXT_OFFSET + 0x10680;
-hv_store_mapping = TEXT_OFFSET + 0x106a0;
-hv_inquire_realpa = TEXT_OFFSET + 0x106c0;
-hv_flush_all = TEXT_OFFSET + 0x106e0;
-hv_get_ipi_pte = TEXT_OFFSET + 0x10700;
-hv_set_pte_super_shift = TEXT_OFFSET + 0x10720;
-hv_glue_internals = TEXT_OFFSET + 0x10740;
diff --git a/arch/tile/kernel/hvglue_trace.c b/arch/tile/kernel/hvglue_trace.c
new file mode 100644
index 000000000000..85c74ad29312
--- /dev/null
+++ b/arch/tile/kernel/hvglue_trace.c
@@ -0,0 +1,266 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/*
+ * Pull in the hypervisor header so we declare all the ABI functions
+ * with the underscore versions, then undef the names so that we can
+ * provide our own wrapper versions.
+ */
+#define hv_init _hv_init
+#define hv_install_context _hv_install_context
+#define hv_sysconf _hv_sysconf
+#define hv_get_rtc _hv_get_rtc
+#define hv_set_rtc _hv_set_rtc
+#define hv_flush_asid _hv_flush_asid
+#define hv_flush_page _hv_flush_page
+#define hv_flush_pages _hv_flush_pages
+#define hv_restart _hv_restart
+#define hv_halt _hv_halt
+#define hv_power_off _hv_power_off
+#define hv_inquire_physical _hv_inquire_physical
+#define hv_inquire_memory_controller _hv_inquire_memory_controller
+#define hv_inquire_virtual _hv_inquire_virtual
+#define hv_inquire_asid _hv_inquire_asid
+#define hv_nanosleep _hv_nanosleep
+#define hv_console_read_if_ready _hv_console_read_if_ready
+#define hv_console_write _hv_console_write
+#define hv_downcall_dispatch _hv_downcall_dispatch
+#define hv_inquire_topology _hv_inquire_topology
+#define hv_fs_findfile _hv_fs_findfile
+#define hv_fs_fstat _hv_fs_fstat
+#define hv_fs_pread _hv_fs_pread
+#define hv_physaddr_read64 _hv_physaddr_read64
+#define hv_physaddr_write64 _hv_physaddr_write64
+#define hv_get_command_line _hv_get_command_line
+#define hv_set_caching _hv_set_caching
+#define hv_bzero_page _hv_bzero_page
+#define hv_register_message_state _hv_register_message_state
+#define hv_send_message _hv_send_message
+#define hv_receive_message _hv_receive_message
+#define hv_inquire_context _hv_inquire_context
+#define hv_start_all_tiles _hv_start_all_tiles
+#define hv_dev_open _hv_dev_open
+#define hv_dev_close _hv_dev_close
+#define hv_dev_pread _hv_dev_pread
+#define hv_dev_pwrite _hv_dev_pwrite
+#define hv_dev_poll _hv_dev_poll
+#define hv_dev_poll_cancel _hv_dev_poll_cancel
+#define hv_dev_preada _hv_dev_preada
+#define hv_dev_pwritea _hv_dev_pwritea
+#define hv_flush_remote _hv_flush_remote
+#define hv_console_putc _hv_console_putc
+#define hv_inquire_tiles _hv_inquire_tiles
+#define hv_confstr _hv_confstr
+#define hv_reexec _hv_reexec
+#define hv_set_command_line _hv_set_command_line
+#define hv_clear_intr _hv_clear_intr
+#define hv_enable_intr _hv_enable_intr
+#define hv_disable_intr _hv_disable_intr
+#define hv_raise_intr _hv_raise_intr
+#define hv_trigger_ipi _hv_trigger_ipi
+#define hv_store_mapping _hv_store_mapping
+#define hv_inquire_realpa _hv_inquire_realpa
+#define hv_flush_all _hv_flush_all
+#define hv_get_ipi_pte _hv_get_ipi_pte
+#define hv_set_pte_super_shift _hv_set_pte_super_shift
+#define hv_console_set_ipi _hv_console_set_ipi
+#include <hv/hypervisor.h>
+#undef hv_init
+#undef hv_install_context
+#undef hv_sysconf
+#undef hv_get_rtc
+#undef hv_set_rtc
+#undef hv_flush_asid
+#undef hv_flush_page
+#undef hv_flush_pages
+#undef hv_restart
+#undef hv_halt
+#undef hv_power_off
+#undef hv_inquire_physical
+#undef hv_inquire_memory_controller
+#undef hv_inquire_virtual
+#undef hv_inquire_asid
+#undef hv_nanosleep
+#undef hv_console_read_if_ready
+#undef hv_console_write
+#undef hv_downcall_dispatch
+#undef hv_inquire_topology
+#undef hv_fs_findfile
+#undef hv_fs_fstat
+#undef hv_fs_pread
+#undef hv_physaddr_read64
+#undef hv_physaddr_write64
+#undef hv_get_command_line
+#undef hv_set_caching
+#undef hv_bzero_page
+#undef hv_register_message_state
+#undef hv_send_message
+#undef hv_receive_message
+#undef hv_inquire_context
+#undef hv_start_all_tiles
+#undef hv_dev_open
+#undef hv_dev_close
+#undef hv_dev_pread
+#undef hv_dev_pwrite
+#undef hv_dev_poll
+#undef hv_dev_poll_cancel
+#undef hv_dev_preada
+#undef hv_dev_pwritea
+#undef hv_flush_remote
+#undef hv_console_putc
+#undef hv_inquire_tiles
+#undef hv_confstr
+#undef hv_reexec
+#undef hv_set_command_line
+#undef hv_clear_intr
+#undef hv_enable_intr
+#undef hv_disable_intr
+#undef hv_raise_intr
+#undef hv_trigger_ipi
+#undef hv_store_mapping
+#undef hv_inquire_realpa
+#undef hv_flush_all
+#undef hv_get_ipi_pte
+#undef hv_set_pte_super_shift
+#undef hv_console_set_ipi
+
+/*
+ * Provide macros based on <linux/syscalls.h> to provide a wrapper
+ * function that invokes the same function with an underscore prefix.
+ * We can't use the existing __SC_xxx macros because we need to
+ * support up to nine arguments rather than up to six, and also this
+ * way the file stands alone from possible changes in the
+ * implementation of <linux/syscalls.h>.
+ */
+#define HV_WRAP0(type, name)					\
+	type name(void);					\
+	type name(void)						\
+	{							\
+		return _##name();				\
+	}
+#define __HV_DECL1(t1, a1)	t1 a1
+#define __HV_DECL2(t2, a2, ...) t2 a2, __HV_DECL1(__VA_ARGS__)
+#define __HV_DECL3(t3, a3, ...) t3 a3, __HV_DECL2(__VA_ARGS__)
+#define __HV_DECL4(t4, a4, ...) t4 a4, __HV_DECL3(__VA_ARGS__)
+#define __HV_DECL5(t5, a5, ...) t5 a5, __HV_DECL4(__VA_ARGS__)
+#define __HV_DECL6(t6, a6, ...) t6 a6, __HV_DECL5(__VA_ARGS__)
+#define __HV_DECL7(t7, a7, ...) t7 a7, __HV_DECL6(__VA_ARGS__)
+#define __HV_DECL8(t8, a8, ...) t8 a8, __HV_DECL7(__VA_ARGS__)
+#define __HV_DECL9(t9, a9, ...) t9 a9, __HV_DECL8(__VA_ARGS__)
+#define __HV_PASS1(t1, a1)	a1
+#define __HV_PASS2(t2, a2, ...) a2, __HV_PASS1(__VA_ARGS__)
+#define __HV_PASS3(t3, a3, ...) a3, __HV_PASS2(__VA_ARGS__)
+#define __HV_PASS4(t4, a4, ...) a4, __HV_PASS3(__VA_ARGS__)
+#define __HV_PASS5(t5, a5, ...) a5, __HV_PASS4(__VA_ARGS__)
+#define __HV_PASS6(t6, a6, ...) a6, __HV_PASS5(__VA_ARGS__)
+#define __HV_PASS7(t7, a7, ...) a7, __HV_PASS6(__VA_ARGS__)
+#define __HV_PASS8(t8, a8, ...) a8, __HV_PASS7(__VA_ARGS__)
+#define __HV_PASS9(t9, a9, ...) a9, __HV_PASS8(__VA_ARGS__)
+#define HV_WRAPx(x, type, name, ...)				\
+	type name(__HV_DECL##x(__VA_ARGS__));			\
+	type name(__HV_DECL##x(__VA_ARGS__))			\
+	{							\
+		return _##name(__HV_PASS##x(__VA_ARGS__));	\
+	}
+#define HV_WRAP1(type, name, ...) HV_WRAPx(1, type, name, __VA_ARGS__)
+#define HV_WRAP2(type, name, ...) HV_WRAPx(2, type, name, __VA_ARGS__)
+#define HV_WRAP3(type, name, ...) HV_WRAPx(3, type, name, __VA_ARGS__)
+#define HV_WRAP4(type, name, ...) HV_WRAPx(4, type, name, __VA_ARGS__)
+#define HV_WRAP5(type, name, ...) HV_WRAPx(5, type, name, __VA_ARGS__)
+#define HV_WRAP6(type, name, ...) HV_WRAPx(6, type, name, __VA_ARGS__)
+#define HV_WRAP7(type, name, ...) HV_WRAPx(7, type, name, __VA_ARGS__)
+#define HV_WRAP8(type, name, ...) HV_WRAPx(8, type, name, __VA_ARGS__)
+#define HV_WRAP9(type, name, ...) HV_WRAPx(9, type, name, __VA_ARGS__)
+
+/* List all the hypervisor API functions. */
+HV_WRAP4(void, hv_init, HV_VersionNumber, interface_version_number,
+	 int, chip_num, int, chip_rev_num, int, client_pl)
+HV_WRAP1(long, hv_sysconf, HV_SysconfQuery, query)
+HV_WRAP3(int, hv_confstr, HV_ConfstrQuery, query, HV_VirtAddr, buf, int, len)
+#if CHIP_HAS_IPI()
+HV_WRAP3(int, hv_get_ipi_pte, HV_Coord, tile, int, pl, HV_PTE*, pte)
+HV_WRAP3(int, hv_console_set_ipi, int, ipi, int, event, HV_Coord, coord);
+#else
+HV_WRAP1(void, hv_enable_intr, HV_IntrMask, enab_mask)
+HV_WRAP1(void, hv_disable_intr, HV_IntrMask, disab_mask)
+HV_WRAP1(void, hv_clear_intr, HV_IntrMask, clear_mask)
+HV_WRAP1(void, hv_raise_intr, HV_IntrMask, raise_mask)
+HV_WRAP2(HV_Errno, hv_trigger_ipi, HV_Coord, tile, int, interrupt)
+#endif /* !CHIP_HAS_IPI() */
+HV_WRAP3(int, hv_store_mapping, HV_VirtAddr, va, unsigned int, len,
+	 HV_PhysAddr, pa)
+HV_WRAP2(HV_PhysAddr, hv_inquire_realpa, HV_PhysAddr, cpa, unsigned int, len)
+HV_WRAP0(HV_RTCTime, hv_get_rtc)
+HV_WRAP1(void, hv_set_rtc, HV_RTCTime, time)
+HV_WRAP4(int, hv_install_context, HV_PhysAddr, page_table, HV_PTE, access,
+	 HV_ASID, asid, __hv32, flags)
+HV_WRAP2(int, hv_set_pte_super_shift, int, level, int, log2_count)
+HV_WRAP0(HV_Context, hv_inquire_context)
+HV_WRAP1(int, hv_flush_asid, HV_ASID, asid)
+HV_WRAP2(int, hv_flush_page, HV_VirtAddr, address, HV_PageSize, page_size)
+HV_WRAP3(int, hv_flush_pages, HV_VirtAddr, start, HV_PageSize, page_size,
+	 unsigned long, size)
+HV_WRAP1(int, hv_flush_all, int, preserve_global)
+HV_WRAP2(void, hv_restart, HV_VirtAddr, cmd, HV_VirtAddr, args)
+HV_WRAP0(void, hv_halt)
+HV_WRAP0(void, hv_power_off)
+HV_WRAP1(int, hv_reexec, HV_PhysAddr, entry)
+HV_WRAP0(HV_Topology, hv_inquire_topology)
+HV_WRAP3(HV_Errno, hv_inquire_tiles, HV_InqTileSet, set, HV_VirtAddr, cpumask,
+	 int, length)
+HV_WRAP1(HV_PhysAddrRange, hv_inquire_physical, int, idx)
+HV_WRAP2(HV_MemoryControllerInfo, hv_inquire_memory_controller, HV_Coord, coord,
+	 int, controller)
+HV_WRAP1(HV_VirtAddrRange, hv_inquire_virtual, int, idx)
+HV_WRAP1(HV_ASIDRange, hv_inquire_asid, int, idx)
+HV_WRAP1(void, hv_nanosleep, int, nanosecs)
+HV_WRAP0(int, hv_console_read_if_ready)
+HV_WRAP1(void, hv_console_putc, int, byte)
+HV_WRAP2(int, hv_console_write, HV_VirtAddr, bytes, int, len)
+HV_WRAP0(void, hv_downcall_dispatch)
+HV_WRAP1(int, hv_fs_findfile, HV_VirtAddr, filename)
+HV_WRAP1(HV_FS_StatInfo, hv_fs_fstat, int, inode)
+HV_WRAP4(int, hv_fs_pread, int, inode, HV_VirtAddr, buf,
+	 int, length, int, offset)
+HV_WRAP2(unsigned long long, hv_physaddr_read64, HV_PhysAddr, addr,
+	 HV_PTE, access)
+HV_WRAP3(void, hv_physaddr_write64, HV_PhysAddr, addr, HV_PTE, access,
+	 unsigned long long, val)
+HV_WRAP2(int, hv_get_command_line, HV_VirtAddr, buf, int, length)
+HV_WRAP2(HV_Errno, hv_set_command_line, HV_VirtAddr, buf, int, length)
+HV_WRAP1(void, hv_set_caching, unsigned long, bitmask)
+HV_WRAP2(void, hv_bzero_page, HV_VirtAddr, va, unsigned int, size)
+HV_WRAP1(HV_Errno, hv_register_message_state, HV_MsgState*, msgstate)
+HV_WRAP4(int, hv_send_message, HV_Recipient *, recips, int, nrecip,
+	 HV_VirtAddr, buf, int, buflen)
+HV_WRAP3(HV_RcvMsgInfo, hv_receive_message, HV_MsgState, msgstate,
+	 HV_VirtAddr, buf, int, buflen)
+HV_WRAP0(void, hv_start_all_tiles)
+HV_WRAP2(int, hv_dev_open, HV_VirtAddr, name, __hv32, flags)
+HV_WRAP1(int, hv_dev_close, int, devhdl)
+HV_WRAP5(int, hv_dev_pread, int, devhdl, __hv32, flags, HV_VirtAddr, va,
+	 __hv32, len, __hv64, offset)
+HV_WRAP5(int, hv_dev_pwrite, int, devhdl, __hv32, flags, HV_VirtAddr, va,
+	 __hv32, len, __hv64, offset)
+HV_WRAP3(int, hv_dev_poll, int, devhdl, __hv32, events, HV_IntArg, intarg)
+HV_WRAP1(int, hv_dev_poll_cancel, int, devhdl)
+HV_WRAP6(int, hv_dev_preada, int, devhdl, __hv32, flags, __hv32, sgl_len,
+	 HV_SGL *, sglp, __hv64, offset, HV_IntArg, intarg)
+HV_WRAP6(int, hv_dev_pwritea, int, devhdl, __hv32, flags, __hv32, sgl_len,
+	 HV_SGL *, sglp, __hv64, offset, HV_IntArg, intarg)
+HV_WRAP9(int, hv_flush_remote, HV_PhysAddr, cache_pa,
+	 unsigned long, cache_control, unsigned long*, cache_cpumask,
+	 HV_VirtAddr, tlb_va, unsigned long, tlb_length,
+	 unsigned long, tlb_pgsize, unsigned long*, tlb_cpumask,
+	 HV_Remote_ASID*, asids, int, asidcount)
diff --git a/arch/tile/kernel/intvec_32.S b/arch/tile/kernel/intvec_32.S
index cb52d66343ed..088d5c141e68 100644
--- a/arch/tile/kernel/intvec_32.S
+++ b/arch/tile/kernel/intvec_32.S
@@ -28,20 +28,10 @@
 #include <arch/interrupts.h>
 #include <arch/spr_def.h>
 
-#ifdef CONFIG_PREEMPT
-# error "No support for kernel preemption currently"
-#endif
-
 #define PTREGS_PTR(reg, ptreg) addli reg, sp, C_ABI_SAVE_AREA_SIZE + (ptreg)
 
 #define PTREGS_OFFSET_SYSCALL PTREGS_OFFSET_REG(TREG_SYSCALL_NR)
 
-#if !CHIP_HAS_WH64()
-	/* By making this an empty macro, we can use wh64 in the code. */
-	.macro  wh64 reg
-	.endm
-#endif
-
 	.macro  push_reg reg, ptr=sp, delta=-4
 	{
 	 sw     \ptr, \reg
@@ -189,7 +179,7 @@ intvec_\vecname:
 	 * point sp at the top aligned address on the actual stack page.
 	 */
 	mfspr   r0, SPR_SYSTEM_SAVE_K_0
-	mm      r0, r0, zero, LOG2_THREAD_SIZE, 31
+	mm      r0, r0, zero, LOG2_NR_CPU_IDS, 31
 
 0:
 	/*
@@ -207,6 +197,9 @@ intvec_\vecname:
 	 *    cache line 1: r14...r29
 	 *    cache line 0: 2 x frame, r0..r13
 	 */
+#if STACK_TOP_DELTA != 64
+#error STACK_TOP_DELTA must be 64 for assumptions here and in task_pt_regs()
+#endif
 	andi    r0, r0, -64
 
 	/*
@@ -326,18 +319,14 @@ intvec_\vecname:
 	 movei  r3, -1   /* not used, but set for consistency */
 	}
 	.else
-#if CHIP_HAS_AUX_PERF_COUNTERS()
 	.ifc \c_routine, op_handle_aux_perf_interrupt
 	{
 	 mfspr  r2, AUX_PERF_COUNT_STS
 	 movei  r3, -1   /* not used, but set for consistency */
 	}
 	.else
-#endif
 	movei   r3, 0
-#if CHIP_HAS_AUX_PERF_COUNTERS()
 	.endif
-#endif
 	.endif
 	.endif
 	.endif
@@ -354,7 +343,7 @@ intvec_\vecname:
 #ifdef __COLLECT_LINKER_FEEDBACK__
 	.pushsection .text.intvec_feedback,"ax"
 	.org    (\vecnum << 5)
-	FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt1, 1 << 8)
+	FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt, 1 << 8)
 	jrp     lr
 	.popsection
 #endif
@@ -468,7 +457,7 @@ intvec_\vecname:
 	}
 	{
 	 auli   r21, r21, ha16(__per_cpu_offset)
-	 mm     r20, r20, zero, 0, LOG2_THREAD_SIZE-1
+	 mm     r20, r20, zero, 0, LOG2_NR_CPU_IDS-1
 	}
 	s2a     r20, r20, r21
 	lw      tp, r20
@@ -562,7 +551,6 @@ intvec_\vecname:
 	.endif
 	mtspr   INTERRUPT_CRITICAL_SECTION, zero
 
-#if CHIP_HAS_WH64()
 	/*
 	 * Prepare the first 256 stack bytes to be rapidly accessible
 	 * without having to fetch the background data.  We don't really
@@ -583,7 +571,6 @@ intvec_\vecname:
 	 addi   r52, r52, -64
 	}
 	wh64    r52
-#endif
 
 #ifdef CONFIG_TRACE_IRQFLAGS
 	.ifnc \function,handle_nmi
@@ -762,7 +749,7 @@ intvec_\vecname:
 	.macro  dc_dispatch vecnum, vecname
 	.org    (\vecnum << 8)
 intvec_\vecname:
-	j       hv_downcall_dispatch
+	j       _hv_downcall_dispatch
 	ENDPROC(intvec_\vecname)
 	.endm
 
@@ -812,17 +799,34 @@ STD_ENTRY(interrupt_return)
 	}
 	lw      r29, r29
 	andi    r29, r29, SPR_EX_CONTEXT_1_1__PL_MASK  /* mask off ICS */
+	bzt     r29, .Lresume_userspace
+
+#ifdef CONFIG_PREEMPT
+	/* Returning to kernel space. Check if we need preemption. */
+	GET_THREAD_INFO(r29)
+	addli   r28, r29, THREAD_INFO_FLAGS_OFFSET
 	{
-	 bzt    r29, .Lresume_userspace
-	 PTREGS_PTR(r29, PTREGS_OFFSET_PC)
+	 lw     r28, r28
+	 addli  r29, r29, THREAD_INFO_PREEMPT_COUNT_OFFSET
 	}
+	{
+	 andi   r28, r28, _TIF_NEED_RESCHED
+	 lw     r29, r29
+	}
+	bzt     r28, 1f
+	bnz     r29, 1f
+	jal     preempt_schedule_irq
+	FEEDBACK_REENTER(interrupt_return)
+1:
+#endif
 
 	/* If we're resuming to _cpu_idle_nap, bump PC forward by 8. */
 	{
-	 lw     r28, r29
+	 PTREGS_PTR(r29, PTREGS_OFFSET_PC)
 	 moveli r27, lo16(_cpu_idle_nap)
 	}
 	{
+	 lw     r28, r29
 	 auli   r27, r27, ha16(_cpu_idle_nap)
 	}
 	{
@@ -1420,7 +1424,6 @@ handle_ill:
 	{
 	 lw     r0, r0          /* indirect thru thread_info to get task_info*/
 	 addi   r1, sp, C_ABI_SAVE_AREA_SIZE  /* put ptregs pointer into r1 */
-	 move   r2, zero        /* load error code into r2 */
 	}
 
 	jal     send_sigtrap    /* issue a SIGTRAP */
@@ -1518,12 +1521,10 @@ STD_ENTRY(_sys_clone)
 	__HEAD
 	.align 64
 	/* Align much later jump on the start of a cache line. */
-#if !ATOMIC_LOCKS_FOUND_VIA_TABLE()
 	nop
 #if PAGE_SIZE >= 0x10000
 	nop
 #endif
-#endif
 ENTRY(sys_cmpxchg)
 
 	/*
@@ -1557,45 +1558,6 @@ ENTRY(sys_cmpxchg)
 # error Code here assumes PAGE_OFFSET can be loaded with just hi16()
 #endif
 
-#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
-	{
-	 /* Check for unaligned input. */
-	 bnz    sp, .Lcmpxchg_badaddr
-	 mm     r25, r0, zero, 3, PAGE_SHIFT-1
-	}
-	{
-	 crc32_32 r25, zero, r25
-	 moveli r21, lo16(atomic_lock_ptr)
-	}
-	{
-	 auli   r21, r21, ha16(atomic_lock_ptr)
-	 auli   r23, zero, hi16(PAGE_OFFSET)  /* hugepage-aligned */
-	}
-	{
-	 shri	r20, r25, 32 - ATOMIC_HASH_L1_SHIFT
-	 slt_u  r23, r0, r23
-	 lw	r26, r0  /* see comment in the "#else" for the "lw r26". */
-	}
-	{
-	 s2a    r21, r20, r21
-	 bbns   r23, .Lcmpxchg_badaddr
-	}
-	{
-	 lw     r21, r21
-	 seqi	r23, TREG_SYSCALL_NR_NAME, __NR_FAST_cmpxchg64
-	 andi	r25, r25, ATOMIC_HASH_L2_SIZE - 1
-	}
-	{
-	 /* Branch away at this point if we're doing a 64-bit cmpxchg. */
-	 bbs    r23, .Lcmpxchg64
-	 andi   r23, r0, 7       /* Precompute alignment for cmpxchg64. */
-	}
-	{
-	 s2a	ATOMIC_LOCK_REG_NAME, r25, r21
-	 j      .Lcmpxchg32_tns   /* see comment in the #else for the jump. */
-	}
-
-#else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
 	{
 	 /* Check for unaligned input. */
 	 bnz    sp, .Lcmpxchg_badaddr
@@ -1609,7 +1571,7 @@ ENTRY(sys_cmpxchg)
 	  * Because of C pointer arithmetic, we want to compute this:
 	  *
 	  * ((char*)atomic_locks +
-	  *  (((r0 >> 3) & (1 << (ATOMIC_HASH_SIZE - 1))) << 2))
+	  *  (((r0 >> 3) & ((1 << ATOMIC_HASH_SHIFT) - 1)) << 2))
 	  *
 	  * Instead of two shifts we just ">> 1", and use 'mm'
 	  * to ignore the low and high bits we don't want.
@@ -1620,12 +1582,9 @@ ENTRY(sys_cmpxchg)
 
 	 /*
 	  * Ensure that the TLB is loaded before we take out the lock.
-	  * On tilepro, this will start fetching the value all the way
-	  * into our L1 as well (and if it gets modified before we
-	  * grab the lock, it will be invalidated from our cache
-	  * before we reload it).  On tile64, we'll start fetching it
-	  * into our L1 if we're the home, and if we're not, we'll
-	  * still at least start fetching it into the home's L2.
+	  * This will start fetching the value all the way into our L1
+	  * as well (and if it gets modified before we grab the lock,
+	  * it will be invalidated from our cache before we reload it).
 	  */
 	 lw	r26, r0
 	}
@@ -1668,8 +1627,6 @@ ENTRY(sys_cmpxchg)
 	 j      .Lcmpxchg32_tns
 	}
 
-#endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
-
 /* Symbol for do_page_fault_ics() to use to compare against the PC. */
 .global __sys_cmpxchg_grab_lock
 __sys_cmpxchg_grab_lock:
@@ -1807,9 +1764,6 @@ __sys_cmpxchg_grab_lock:
 	.align 64
 .Lcmpxchg64:
 	{
-#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
-	 s2a	ATOMIC_LOCK_REG_NAME, r25, r21
-#endif
 	 bzt     r23, .Lcmpxchg64_tns
 	}
 	j       .Lcmpxchg_badaddr
@@ -1875,8 +1829,8 @@ int_unalign:
 	push_extra_callee_saves r0
 	j       do_trap
 
-/* Include .intrpt1 array of interrupt vectors */
-	.section ".intrpt1", "ax"
+/* Include .intrpt array of interrupt vectors */
+	.section ".intrpt", "ax"
 
 #define op_handle_perf_interrupt bad_intr
 #define op_handle_aux_perf_interrupt bad_intr
@@ -1944,10 +1898,8 @@ int_unalign:
 		     do_page_fault
 	int_hand     INT_SN_CPL, SN_CPL, bad_intr
 	int_hand     INT_DOUBLE_FAULT, DOUBLE_FAULT, do_trap
-#if CHIP_HAS_AUX_PERF_COUNTERS()
 	int_hand     INT_AUX_PERF_COUNT, AUX_PERF_COUNT, \
 		     op_handle_aux_perf_interrupt, handle_nmi
-#endif
 
 	/* Synthetic interrupt delivered only by the simulator */
 	int_hand     INT_BREAKPOINT, BREAKPOINT, do_breakpoint
diff --git a/arch/tile/kernel/intvec_64.S b/arch/tile/kernel/intvec_64.S
index 85d483957027..ec755d3f3734 100644
--- a/arch/tile/kernel/intvec_64.S
+++ b/arch/tile/kernel/intvec_64.S
@@ -17,25 +17,33 @@
 #include <linux/linkage.h>
 #include <linux/errno.h>
 #include <linux/unistd.h>
+#include <linux/init.h>
 #include <asm/ptrace.h>
 #include <asm/thread_info.h>
 #include <asm/irqflags.h>
 #include <asm/asm-offsets.h>
 #include <asm/types.h>
+#include <asm/traps.h>
 #include <asm/signal.h>
 #include <hv/hypervisor.h>
 #include <arch/abi.h>
 #include <arch/interrupts.h>
 #include <arch/spr_def.h>
 
-#ifdef CONFIG_PREEMPT
-# error "No support for kernel preemption currently"
-#endif
-
 #define PTREGS_PTR(reg, ptreg) addli reg, sp, C_ABI_SAVE_AREA_SIZE + (ptreg)
 
 #define PTREGS_OFFSET_SYSCALL PTREGS_OFFSET_REG(TREG_SYSCALL_NR)
 
+#if CONFIG_KERNEL_PL == 1 || CONFIG_KERNEL_PL == 2
+/*
+ * Set "result" non-zero if ex1 holds the PL of the kernel
+ * (with or without ICS being set).  Note this works only
+ * because we never find the PL at level 3.
+ */
+# define IS_KERNEL_EX1(result, ex1) andi result, ex1, CONFIG_KERNEL_PL
+#else
+# error Recode IS_KERNEL_EX1 for CONFIG_KERNEL_PL
+#endif
 
 	.macro  push_reg reg, ptr=sp, delta=-8
 	{
@@ -98,6 +106,185 @@
 	}
 	.endm
 
+	/*
+	 * Unalign data exception fast handling: In order to handle
+	 * unaligned data access, a fast JIT version is generated and stored
+	 * in a specific area in user space. We first need to do a quick poke
+	 * to see if the JIT is available. We use certain bits in the fault
+	 * PC (3 to 9 is used for 16KB page size) as index to address the JIT
+	 * code area. The first 64bit word is the fault PC, and the 2nd one is
+	 * the fault bundle itself. If these 2 words both match, then we
+	 * directly "iret" to JIT code. If not, a slow path is invoked to
+	 * generate new JIT code. Note: the current JIT code WILL be
+	 * overwritten if it existed. So, ideally we can handle 128 unalign
+	 * fixups via JIT. For lookup efficiency and to effectively support
+	 * tight loops with multiple unaligned reference, a simple
+	 * direct-mapped cache is used.
+	 *
+	 * SPR_EX_CONTEXT_K_0 is modified to return to JIT code.
+	 * SPR_EX_CONTEXT_K_1 has ICS set.
+	 * SPR_EX_CONTEXT_0_0 is setup to user program's next PC.
+	 * SPR_EX_CONTEXT_0_1 = 0.
+	 */
+	.macro int_hand_unalign_fast  vecnum, vecname
+	.org  (\vecnum << 8)
+intvec_\vecname:
+	/* Put r3 in SPR_SYSTEM_SAVE_K_1.  */
+	mtspr   SPR_SYSTEM_SAVE_K_1, r3
+
+	mfspr   r3, SPR_EX_CONTEXT_K_1
+	/*
+	 * Examine if exception comes from user without ICS set.
+	 * If not, just go directly to the slow path.
+	 */
+	bnez    r3, hand_unalign_slow_nonuser
+
+	mfspr   r3, SPR_SYSTEM_SAVE_K_0
+
+	/* Get &thread_info->unalign_jit_tmp[0] in r3. */
+	bfexts  r3, r3, 0, CPU_SHIFT-1
+	mm      r3, zero, LOG2_THREAD_SIZE, 63
+	addli   r3, r3, THREAD_INFO_UNALIGN_JIT_TMP_OFFSET
+
+	/*
+	 * Save r0, r1, r2 into thread_info array r3 points to
+	 * from low to high memory in order.
+	 */
+	st_add  r3, r0, 8
+	st_add  r3, r1, 8
+	{
+	 st_add r3, r2, 8
+	 andi   r2, sp, 7
+	}
+
+	/* Save stored r3 value so we can revert it on a page fault. */
+	mfspr   r1, SPR_SYSTEM_SAVE_K_1
+	st      r3, r1
+
+	{
+	 /* Generate a SIGBUS if sp is not 8-byte aligned. */
+	 bnez   r2, hand_unalign_slow_badsp
+	}
+
+	/*
+	 * Get the thread_info in r0; load r1 with pc. Set the low bit of sp
+	 * as an indicator to the page fault code in case we fault.
+	 */
+	{
+	 ori    sp, sp, 1
+	 mfspr  r1, SPR_EX_CONTEXT_K_0
+	}
+
+	/* Add the jit_info offset in thread_info; extract r1 [3:9] into r2. */
+	{
+	 addli  r0, r3, THREAD_INFO_UNALIGN_JIT_BASE_OFFSET - \
+	  (THREAD_INFO_UNALIGN_JIT_TMP_OFFSET + (3 * 8))
+	 bfextu r2, r1, 3, (2 + PAGE_SHIFT - UNALIGN_JIT_SHIFT)
+	}
+
+	/* Load the jit_info; multiply r2 by 128. */
+	{
+	 ld     r0, r0
+	 shli   r2, r2, UNALIGN_JIT_SHIFT
+	}
+
+	/*
+	 * If r0 is NULL, the JIT page is not mapped, so go to slow path;
+	 * add offset r2 to r0 at the same time.
+	 */
+	{
+	 beqz   r0, hand_unalign_slow
+	 add    r2, r0, r2
+	}
+
+        /*
+	 * We are loading from userspace (both the JIT info PC and
+	 * instruction word, and the instruction word we executed)
+	 * and since either could fault while holding the interrupt
+	 * critical section, we must tag this region and check it in
+	 * do_page_fault() to handle it properly.
+	 */
+ENTRY(__start_unalign_asm_code)
+
+	/* Load first word of JIT in r0 and increment r2 by 8. */
+	ld_add  r0, r2, 8
+
+	/*
+	 * Compare the PC with the 1st word in JIT; load the fault bundle
+	 * into r1.
+	 */
+	{
+	 cmpeq  r0, r0, r1
+	 ld     r1, r1
+	}
+
+	/* Go to slow path if PC doesn't match. */
+	beqz    r0, hand_unalign_slow
+
+	/*
+	 * Load the 2nd word of JIT, which is supposed to be the fault
+	 * bundle for a cache hit. Increment r2; after this bundle r2 will
+	 * point to the potential start of the JIT code we want to run.
+	 */
+	ld_add  r0, r2, 8
+
+	/* No further accesses to userspace are done after this point. */
+ENTRY(__end_unalign_asm_code)
+
+	/* Compare the real bundle with what is saved in the JIT area. */
+	{
+	 cmpeq  r0, r1, r0
+	 mtspr  SPR_EX_CONTEXT_0_1, zero
+	}
+
+	/* Go to slow path if the fault bundle does not match. */
+	beqz    r0, hand_unalign_slow
+
+	/*
+	 * A cache hit is found.
+	 * r2 points to start of JIT code (3rd word).
+	 * r0 is the fault pc.
+	 * r1 is the fault bundle.
+	 * Reset the low bit of sp.
+	 */
+	{
+	 mfspr  r0, SPR_EX_CONTEXT_K_0
+	 andi   sp, sp, ~1
+	}
+
+	/* Write r2 into EX_CONTEXT_K_0 and increment PC. */
+	{
+	 mtspr  SPR_EX_CONTEXT_K_0, r2
+	 addi   r0, r0, 8
+	}
+
+	/*
+	 * Set ICS on kernel EX_CONTEXT_K_1 in order to "iret" to
+	 * user with ICS set. This way, if the JIT fixup causes another
+	 * unalign exception (which shouldn't be possible) the user
+	 * process will be terminated with SIGBUS. Also, our fixup will
+	 * run without interleaving with external interrupts.
+	 * Each fixup is at most 14 bundles, so it won't hold ICS for long.
+	 */
+	{
+	 movei  r1, PL_ICS_EX1(USER_PL, 1)
+	 mtspr  SPR_EX_CONTEXT_0_0, r0
+	}
+
+	{
+	 mtspr  SPR_EX_CONTEXT_K_1, r1
+	 addi   r3, r3, -(3 * 8)
+	}
+
+	/* Restore r0..r3. */
+	ld_add  r0, r3, 8
+	ld_add  r1, r3, 8
+	ld_add  r2, r3, 8
+	ld      r3, r3
+
+	iret
+	ENDPROC(intvec_\vecname)
+	.endm
 
 #ifdef __COLLECT_LINKER_FEEDBACK__
 	.pushsection .text.intvec_feedback,"ax"
@@ -118,15 +305,21 @@ intvec_feedback:
 	 * The "processing" argument specifies the code for processing
 	 * the interrupt. Defaults to "handle_interrupt".
 	 */
-	.macro  int_hand vecnum, vecname, c_routine, processing=handle_interrupt
-	.org    (\vecnum << 8)
+	.macro __int_hand vecnum, vecname, c_routine,processing=handle_interrupt
 intvec_\vecname:
 	/* Temporarily save a register so we have somewhere to work. */
 
 	mtspr   SPR_SYSTEM_SAVE_K_1, r0
 	mfspr   r0, SPR_EX_CONTEXT_K_1
 
-	andi    r0, r0, SPR_EX_CONTEXT_1_1__PL_MASK  /* mask off ICS */
+	/*
+	 * The unalign data fastpath code sets the low bit in sp to
+	 * force us to reset it here on fault.
+	 */
+	{
+	 blbs   sp, 2f
+	 IS_KERNEL_EX1(r0, r0)
+	}
 
 	.ifc    \vecnum, INT_DOUBLE_FAULT
 	/*
@@ -176,15 +369,15 @@ intvec_\vecname:
 	}
 	.endif
 
-
+2:
 	/*
-	 * SYSTEM_SAVE_K_0 holds the cpu number in the low bits, and
-	 * the current stack top in the higher bits.  So we recover
-	 * our stack top by just masking off the low bits, then
+	 * SYSTEM_SAVE_K_0 holds the cpu number in the high bits, and
+	 * the current stack top in the lower bits.  So we recover
+	 * our starting stack value by sign-extending the low bits, then
 	 * point sp at the top aligned address on the actual stack page.
 	 */
 	mfspr   r0, SPR_SYSTEM_SAVE_K_0
-	mm      r0, zero, LOG2_THREAD_SIZE, 63
+	bfexts  r0, r0, 0, CPU_SHIFT-1
 
 0:
 	/*
@@ -206,6 +399,9 @@ intvec_\vecname:
 	 *    cache line 1: r6...r13
 	 *    cache line 0: 2 x frame, r0..r5
 	 */
+#if STACK_TOP_DELTA != 64
+#error STACK_TOP_DELTA must be 64 for assumptions here and in task_pt_regs()
+#endif
 	andi    r0, r0, -64
 
 	/*
@@ -305,7 +501,7 @@ intvec_\vecname:
 	mfspr   r3, SPR_SYSTEM_SAVE_K_2   /* info about page fault */
 	.else
 	.ifc \vecnum, INT_ILL_TRANS
-	mfspr   r2, ILL_TRANS_REASON
+	mfspr   r2, ILL_VA_PC
 	.else
 	.ifc \vecnum, INT_DOUBLE_FAULT
 	mfspr   r2, SPR_SYSTEM_SAVE_K_2   /* double fault info from HV */
@@ -315,12 +511,10 @@ intvec_\vecname:
 	.else
 	.ifc \c_routine, op_handle_perf_interrupt
 	mfspr   r2, PERF_COUNT_STS
-#if CHIP_HAS_AUX_PERF_COUNTERS()
 	.else
 	.ifc \c_routine, op_handle_aux_perf_interrupt
 	mfspr   r2, AUX_PERF_COUNT_STS
 	.endif
-#endif
 	.endif
 	.endif
 	.endif
@@ -339,7 +533,7 @@ intvec_\vecname:
 #ifdef __COLLECT_LINKER_FEEDBACK__
 	.pushsection .text.intvec_feedback,"ax"
 	.org    (\vecnum << 5)
-	FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt1, 1 << 8)
+	FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt, 1 << 8)
 	jrp     lr
 	.popsection
 #endif
@@ -455,11 +649,12 @@ intvec_\vecname:
 	/*
 	 * If we will be returning to the kernel, we will need to
 	 * reset the interrupt masks to the state they had before.
-	 * Set DISABLE_IRQ in flags iff we came from PL1 with irqs disabled.
+	 * Set DISABLE_IRQ in flags iff we came from kernel pl with
+	 * irqs disabled.
 	 */
 	mfspr   r32, SPR_EX_CONTEXT_K_1
 	{
-	 andi   r32, r32, SPR_EX_CONTEXT_1_1__PL_MASK  /* mask off ICS */
+	 IS_KERNEL_EX1(r22, r22)
 	 PTREGS_PTR(r21, PTREGS_OFFSET_FLAGS)
 	}
 	beqzt   r32, 1f       /* zero if from user space */
@@ -503,7 +698,7 @@ intvec_\vecname:
 	}
 	{
 	 shl16insli r21, r21, hw1(__per_cpu_offset)
-	 bfextu r20, r20, 0, LOG2_THREAD_SIZE-1
+	 bfextu r20, r20, CPU_SHIFT, 63
 	}
 	shl16insli r21, r21, hw0(__per_cpu_offset)
 	shl3add r20, r20, r21
@@ -585,7 +780,7 @@ intvec_\vecname:
 	.macro  dc_dispatch vecnum, vecname
 	.org    (\vecnum << 8)
 intvec_\vecname:
-	j       hv_downcall_dispatch
+	j       _hv_downcall_dispatch
 	ENDPROC(intvec_\vecname)
 	.endm
 
@@ -626,14 +821,36 @@ STD_ENTRY(interrupt_return)
 	 PTREGS_PTR(r29, PTREGS_OFFSET_EX1)
 	}
 	ld      r29, r29
-	andi    r29, r29, SPR_EX_CONTEXT_1_1__PL_MASK  /* mask off ICS */
+	IS_KERNEL_EX1(r29, r29)
 	{
 	 beqzt  r29, .Lresume_userspace
-	 PTREGS_PTR(r29, PTREGS_OFFSET_PC)
+	 move   r29, sp
+	}
+
+#ifdef CONFIG_PREEMPT
+	/* Returning to kernel space. Check if we need preemption. */
+	EXTRACT_THREAD_INFO(r29)
+	addli   r28, r29, THREAD_INFO_FLAGS_OFFSET
+	{
+	 ld     r28, r28
+	 addli  r29, r29, THREAD_INFO_PREEMPT_COUNT_OFFSET
+	}
+	{
+	 andi   r28, r28, _TIF_NEED_RESCHED
+	 ld4s   r29, r29
 	}
+	beqzt   r28, 1f
+	bnez    r29, 1f
+	jal     preempt_schedule_irq
+	FEEDBACK_REENTER(interrupt_return)
+1:
+#endif
 
 	/* If we're resuming to _cpu_idle_nap, bump PC forward by 8. */
-	moveli  r27, hw2_last(_cpu_idle_nap)
+	{
+	 moveli r27, hw2_last(_cpu_idle_nap)
+	 PTREGS_PTR(r29, PTREGS_OFFSET_PC)
+	}
 	{
 	 ld     r28, r29
 	 shl16insli r27, r27, hw1(_cpu_idle_nap)
@@ -728,7 +945,7 @@ STD_ENTRY(interrupt_return)
 	 PTREGS_PTR(r32, PTREGS_OFFSET_FLAGS)
 	}
 	{
-	 andi   r0, r0, SPR_EX_CONTEXT_1_1__PL_MASK
+	 IS_KERNEL_EX1(r0, r0)
 	 ld     r32, r32
 	}
 	bnez    r0, 1f
@@ -799,7 +1016,7 @@ STD_ENTRY(interrupt_return)
 	pop_reg r21, sp, PTREGS_OFFSET_REG(31) - PTREGS_OFFSET_PC
 	{
 	 mtspr  SPR_EX_CONTEXT_K_1, lr
-	 andi   lr, lr, SPR_EX_CONTEXT_1_1__PL_MASK  /* mask off ICS */
+	 IS_KERNEL_EX1(lr, lr)
 	}
 	{
 	 mtspr  SPR_EX_CONTEXT_K_0, r21
@@ -1223,10 +1440,31 @@ STD_ENTRY(_sys_clone)
 	j       sys_clone
 	STD_ENDPROC(_sys_clone)
 
-/* The single-step support may need to read all the registers. */
+	/*
+	 * Recover r3, r2, r1 and r0 here saved by unalign fast vector.
+	 * The vector area limit is 32 bundles, so we handle the reload here.
+	 * r0, r1, r2 are in thread_info from low to high memory in order.
+	 * r3 points to location the original r3 was saved.
+	 * We put this code in the __HEAD section so it can be reached
+	 * via a conditional branch from the fast path.
+	 */
+	__HEAD
+hand_unalign_slow:
+	andi    sp, sp, ~1
+hand_unalign_slow_badsp:
+	addi    r3, r3, -(3 * 8)
+	ld_add  r0, r3, 8
+	ld_add  r1, r3, 8
+	ld      r2, r3
+hand_unalign_slow_nonuser:
+	mfspr   r3, SPR_SYSTEM_SAVE_K_1
+	__int_hand     INT_UNALIGN_DATA, UNALIGN_DATA_SLOW, int_unalign
+
+/* The unaligned data support needs to read all the registers. */
 int_unalign:
 	push_extra_callee_saves r0
-	j       do_trap
+	j       do_unaligned
+ENDPROC(hand_unalign_slow)
 
 /* Fill the return address stack with nonzero entries. */
 STD_ENTRY(fill_ra_stack)
@@ -1240,8 +1478,15 @@ STD_ENTRY(fill_ra_stack)
 4:	jrp	r0
 	STD_ENDPROC(fill_ra_stack)
 
-/* Include .intrpt1 array of interrupt vectors */
-	.section ".intrpt1", "ax"
+	.macro int_hand  vecnum, vecname, c_routine, processing=handle_interrupt
+	.org   (\vecnum << 8)
+		__int_hand   \vecnum, \vecname, \c_routine, \processing
+	.endm
+
+/* Include .intrpt array of interrupt vectors */
+	.section ".intrpt", "ax"
+	.global intrpt_start
+intrpt_start:
 
 #define op_handle_perf_interrupt bad_intr
 #define op_handle_aux_perf_interrupt bad_intr
@@ -1272,7 +1517,7 @@ STD_ENTRY(fill_ra_stack)
 	int_hand     INT_SWINT_1, SWINT_1, SYSCALL, handle_syscall
 	int_hand     INT_SWINT_0, SWINT_0, do_trap
 	int_hand     INT_ILL_TRANS, ILL_TRANS, do_trap
-	int_hand     INT_UNALIGN_DATA, UNALIGN_DATA, int_unalign
+	int_hand_unalign_fast INT_UNALIGN_DATA, UNALIGN_DATA
 	int_hand     INT_DTLB_MISS, DTLB_MISS, do_page_fault
 	int_hand     INT_DTLB_ACCESS, DTLB_ACCESS, do_page_fault
 	int_hand     INT_IDN_FIREWALL, IDN_FIREWALL, do_hardwall_trap
diff --git a/arch/tile/kernel/irq.c b/arch/tile/kernel/irq.c
index 3ccf2cd7182e..0586fdb9352d 100644
--- a/arch/tile/kernel/irq.c
+++ b/arch/tile/kernel/irq.c
@@ -55,7 +55,8 @@ static DEFINE_PER_CPU(int, irq_depth);
 
 /* State for allocating IRQs on Gx. */
 #if CHIP_HAS_IPI()
-static unsigned long available_irqs = ~(1UL << IRQ_RESCHEDULE);
+static unsigned long available_irqs = ((1UL << NR_IRQS) - 1) &
+				      (~(1UL << IRQ_RESCHEDULE));
 static DEFINE_SPINLOCK(available_irqs_lock);
 #endif
 
@@ -73,7 +74,8 @@ static DEFINE_SPINLOCK(available_irqs_lock);
 
 /*
  * The interrupt handling path, implemented in terms of HV interrupt
- * emulation on TILE64 and TILEPro, and IPI hardware on TILE-Gx.
+ * emulation on TILEPro, and IPI hardware on TILE-Gx.
+ * Entered with interrupts disabled.
  */
 void tile_dev_intr(struct pt_regs *regs, int intnum)
 {
@@ -233,7 +235,7 @@ void tile_irq_activate(unsigned int irq, int tile_irq_type)
 {
 	/*
 	 * We use handle_level_irq() by default because the pending
-	 * interrupt vector (whether modeled by the HV on TILE64 and
+	 * interrupt vector (whether modeled by the HV on
 	 * TILEPro or implemented in hardware on TILE-Gx) has
 	 * level-style semantics for each bit.  An interrupt fires
 	 * whenever a bit is high, not just at edges.
diff --git a/arch/tile/kernel/kgdb.c b/arch/tile/kernel/kgdb.c
new file mode 100644
index 000000000000..4cd88381a83e
--- /dev/null
+++ b/arch/tile/kernel/kgdb.c
@@ -0,0 +1,499 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ * TILE-Gx KGDB support.
+ */
+
+#include <linux/ptrace.h>
+#include <linux/kgdb.h>
+#include <linux/kdebug.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <asm/cacheflush.h>
+
+static tile_bundle_bits singlestep_insn = TILEGX_BPT_BUNDLE | DIE_SSTEPBP;
+static unsigned long stepped_addr;
+static tile_bundle_bits stepped_instr;
+
+struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = {
+	{ "r0", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[0])},
+	{ "r1", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[1])},
+	{ "r2", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[2])},
+	{ "r3", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[3])},
+	{ "r4", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[4])},
+	{ "r5", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[5])},
+	{ "r6", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[6])},
+	{ "r7", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[7])},
+	{ "r8", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[8])},
+	{ "r9", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[9])},
+	{ "r10", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[10])},
+	{ "r11", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[11])},
+	{ "r12", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[12])},
+	{ "r13", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[13])},
+	{ "r14", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[14])},
+	{ "r15", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[15])},
+	{ "r16", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[16])},
+	{ "r17", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[17])},
+	{ "r18", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[18])},
+	{ "r19", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[19])},
+	{ "r20", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[20])},
+	{ "r21", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[21])},
+	{ "r22", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[22])},
+	{ "r23", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[23])},
+	{ "r24", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[24])},
+	{ "r25", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[25])},
+	{ "r26", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[26])},
+	{ "r27", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[27])},
+	{ "r28", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[28])},
+	{ "r29", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[29])},
+	{ "r30", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[30])},
+	{ "r31", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[31])},
+	{ "r32", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[32])},
+	{ "r33", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[33])},
+	{ "r34", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[34])},
+	{ "r35", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[35])},
+	{ "r36", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[36])},
+	{ "r37", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[37])},
+	{ "r38", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[38])},
+	{ "r39", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[39])},
+	{ "r40", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[40])},
+	{ "r41", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[41])},
+	{ "r42", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[42])},
+	{ "r43", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[43])},
+	{ "r44", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[44])},
+	{ "r45", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[45])},
+	{ "r46", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[46])},
+	{ "r47", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[47])},
+	{ "r48", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[48])},
+	{ "r49", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[49])},
+	{ "r50", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[50])},
+	{ "r51", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[51])},
+	{ "r52", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[52])},
+	{ "tp", GDB_SIZEOF_REG, offsetof(struct pt_regs, tp)},
+	{ "sp", GDB_SIZEOF_REG, offsetof(struct pt_regs, sp)},
+	{ "lr", GDB_SIZEOF_REG, offsetof(struct pt_regs, lr)},
+	{ "sn", GDB_SIZEOF_REG, -1},
+	{ "idn0", GDB_SIZEOF_REG, -1},
+	{ "idn1", GDB_SIZEOF_REG, -1},
+	{ "udn0", GDB_SIZEOF_REG, -1},
+	{ "udn1", GDB_SIZEOF_REG, -1},
+	{ "udn2", GDB_SIZEOF_REG, -1},
+	{ "udn3", GDB_SIZEOF_REG, -1},
+	{ "zero", GDB_SIZEOF_REG, -1},
+	{ "pc", GDB_SIZEOF_REG, offsetof(struct pt_regs, pc)},
+	{ "faultnum", GDB_SIZEOF_REG, offsetof(struct pt_regs, faultnum)},
+};
+
+char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
+{
+	if (regno >= DBG_MAX_REG_NUM || regno < 0)
+		return NULL;
+
+	if (dbg_reg_def[regno].offset != -1)
+		memcpy(mem, (void *)regs + dbg_reg_def[regno].offset,
+		       dbg_reg_def[regno].size);
+	else
+		memset(mem, 0, dbg_reg_def[regno].size);
+	return dbg_reg_def[regno].name;
+}
+
+int dbg_set_reg(int regno, void *mem, struct pt_regs *regs)
+{
+	if (regno >= DBG_MAX_REG_NUM || regno < 0)
+		return -EINVAL;
+
+	if (dbg_reg_def[regno].offset != -1)
+		memcpy((void *)regs + dbg_reg_def[regno].offset, mem,
+		       dbg_reg_def[regno].size);
+	return 0;
+}
+
+/*
+ * Similar to pt_regs_to_gdb_regs() except that process is sleeping and so
+ * we may not be able to get all the info.
+ */
+void
+sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *task)
+{
+	int reg;
+	struct pt_regs *thread_regs;
+	unsigned long *ptr = gdb_regs;
+
+	if (task == NULL)
+		return;
+
+	/* Initialize to zero. */
+	memset(gdb_regs, 0, NUMREGBYTES);
+
+	thread_regs = task_pt_regs(task);
+	for (reg = 0; reg <= TREG_LAST_GPR; reg++)
+		*(ptr++) = thread_regs->regs[reg];
+
+	gdb_regs[TILEGX_PC_REGNUM] = thread_regs->pc;
+	gdb_regs[TILEGX_FAULTNUM_REGNUM] = thread_regs->faultnum;
+}
+
+void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long pc)
+{
+	regs->pc = pc;
+}
+
+static void kgdb_call_nmi_hook(void *ignored)
+{
+	kgdb_nmicallback(raw_smp_processor_id(), NULL);
+}
+
+void kgdb_roundup_cpus(unsigned long flags)
+{
+	local_irq_enable();
+	smp_call_function(kgdb_call_nmi_hook, NULL, 0);
+	local_irq_disable();
+}
+
+/*
+ * Convert a kernel address to the writable kernel text mapping.
+ */
+static unsigned long writable_address(unsigned long addr)
+{
+	unsigned long ret = 0;
+
+	if (core_kernel_text(addr))
+		ret = addr - MEM_SV_START + PAGE_OFFSET;
+	else if (is_module_text_address(addr))
+		ret = addr;
+	else
+		pr_err("Unknown virtual address 0x%lx\n", addr);
+
+	return ret;
+}
+
+/*
+ * Calculate the new address for after a step.
+ */
+static unsigned long get_step_address(struct pt_regs *regs)
+{
+	int src_reg;
+	int jump_off;
+	int br_off;
+	unsigned long addr;
+	unsigned int opcode;
+	tile_bundle_bits bundle;
+
+	/* Move to the next instruction by default. */
+	addr = regs->pc + TILEGX_BUNDLE_SIZE_IN_BYTES;
+	bundle = *(unsigned long *)instruction_pointer(regs);
+
+	/* 0: X mode, Otherwise: Y mode. */
+	if (bundle & TILEGX_BUNDLE_MODE_MASK) {
+		if (get_Opcode_Y1(bundle) == RRR_1_OPCODE_Y1 &&
+		    get_RRROpcodeExtension_Y1(bundle) ==
+		    UNARY_RRR_1_OPCODE_Y1) {
+			opcode = get_UnaryOpcodeExtension_Y1(bundle);
+
+			switch (opcode) {
+			case JALR_UNARY_OPCODE_Y1:
+			case JALRP_UNARY_OPCODE_Y1:
+			case JR_UNARY_OPCODE_Y1:
+			case JRP_UNARY_OPCODE_Y1:
+				src_reg = get_SrcA_Y1(bundle);
+				dbg_get_reg(src_reg, &addr, regs);
+				break;
+			}
+		}
+	} else if (get_Opcode_X1(bundle) == RRR_0_OPCODE_X1) {
+		if (get_RRROpcodeExtension_X1(bundle) ==
+		    UNARY_RRR_0_OPCODE_X1) {
+			opcode = get_UnaryOpcodeExtension_X1(bundle);
+
+			switch (opcode) {
+			case JALR_UNARY_OPCODE_X1:
+			case JALRP_UNARY_OPCODE_X1:
+			case JR_UNARY_OPCODE_X1:
+			case JRP_UNARY_OPCODE_X1:
+				src_reg = get_SrcA_X1(bundle);
+				dbg_get_reg(src_reg, &addr, regs);
+				break;
+			}
+		}
+	} else if (get_Opcode_X1(bundle) == JUMP_OPCODE_X1) {
+		opcode = get_JumpOpcodeExtension_X1(bundle);
+
+		switch (opcode) {
+		case JAL_JUMP_OPCODE_X1:
+		case J_JUMP_OPCODE_X1:
+			jump_off = sign_extend(get_JumpOff_X1(bundle), 27);
+			addr = regs->pc +
+				(jump_off << TILEGX_LOG2_BUNDLE_SIZE_IN_BYTES);
+			break;
+		}
+	} else if (get_Opcode_X1(bundle) == BRANCH_OPCODE_X1) {
+		br_off = 0;
+		opcode = get_BrType_X1(bundle);
+
+		switch (opcode) {
+		case BEQZT_BRANCH_OPCODE_X1:
+		case BEQZ_BRANCH_OPCODE_X1:
+			if (get_SrcA_X1(bundle) == 0)
+				br_off = get_BrOff_X1(bundle);
+			break;
+		case BGEZT_BRANCH_OPCODE_X1:
+		case BGEZ_BRANCH_OPCODE_X1:
+			if (get_SrcA_X1(bundle) >= 0)
+				br_off = get_BrOff_X1(bundle);
+			break;
+		case BGTZT_BRANCH_OPCODE_X1:
+		case BGTZ_BRANCH_OPCODE_X1:
+			if (get_SrcA_X1(bundle) > 0)
+				br_off = get_BrOff_X1(bundle);
+			break;
+		case BLBCT_BRANCH_OPCODE_X1:
+		case BLBC_BRANCH_OPCODE_X1:
+			if (!(get_SrcA_X1(bundle) & 1))
+				br_off = get_BrOff_X1(bundle);
+			break;
+		case BLBST_BRANCH_OPCODE_X1:
+		case BLBS_BRANCH_OPCODE_X1:
+			if (get_SrcA_X1(bundle) & 1)
+				br_off = get_BrOff_X1(bundle);
+			break;
+		case BLEZT_BRANCH_OPCODE_X1:
+		case BLEZ_BRANCH_OPCODE_X1:
+			if (get_SrcA_X1(bundle) <= 0)
+				br_off = get_BrOff_X1(bundle);
+			break;
+		case BLTZT_BRANCH_OPCODE_X1:
+		case BLTZ_BRANCH_OPCODE_X1:
+			if (get_SrcA_X1(bundle) < 0)
+				br_off = get_BrOff_X1(bundle);
+			break;
+		case BNEZT_BRANCH_OPCODE_X1:
+		case BNEZ_BRANCH_OPCODE_X1:
+			if (get_SrcA_X1(bundle) != 0)
+				br_off = get_BrOff_X1(bundle);
+			break;
+		}
+
+		if (br_off != 0) {
+			br_off = sign_extend(br_off, 17);
+			addr = regs->pc +
+				(br_off << TILEGX_LOG2_BUNDLE_SIZE_IN_BYTES);
+		}
+	}
+
+	return addr;
+}
+
+/*
+ * Replace the next instruction after the current instruction with a
+ * breakpoint instruction.
+ */
+static void do_single_step(struct pt_regs *regs)
+{
+	unsigned long addr_wr;
+
+	/* Determine where the target instruction will send us to. */
+	stepped_addr = get_step_address(regs);
+	probe_kernel_read((char *)&stepped_instr, (char *)stepped_addr,
+			  BREAK_INSTR_SIZE);
+
+	addr_wr = writable_address(stepped_addr);
+	probe_kernel_write((char *)addr_wr, (char *)&singlestep_insn,
+			   BREAK_INSTR_SIZE);
+	smp_wmb();
+	flush_icache_range(stepped_addr, stepped_addr + BREAK_INSTR_SIZE);
+}
+
+static void undo_single_step(struct pt_regs *regs)
+{
+	unsigned long addr_wr;
+
+	if (stepped_instr == 0)
+		return;
+
+	addr_wr = writable_address(stepped_addr);
+	probe_kernel_write((char *)addr_wr, (char *)&stepped_instr,
+			   BREAK_INSTR_SIZE);
+	stepped_instr = 0;
+	smp_wmb();
+	flush_icache_range(stepped_addr, stepped_addr + BREAK_INSTR_SIZE);
+}
+
+/*
+ * Calls linux_debug_hook before the kernel dies. If KGDB is enabled,
+ * then try to fall into the debugger.
+ */
+static int
+kgdb_notify(struct notifier_block *self, unsigned long cmd, void *ptr)
+{
+	int ret;
+	unsigned long flags;
+	struct die_args *args = (struct die_args *)ptr;
+	struct pt_regs *regs = args->regs;
+
+#ifdef CONFIG_KPROBES
+	/*
+	 * Return immediately if the kprobes fault notifier has set
+	 * DIE_PAGE_FAULT.
+	 */
+	if (cmd == DIE_PAGE_FAULT)
+		return NOTIFY_DONE;
+#endif /* CONFIG_KPROBES */
+
+	switch (cmd) {
+	case DIE_BREAK:
+	case DIE_COMPILED_BPT:
+		break;
+	case DIE_SSTEPBP:
+		local_irq_save(flags);
+		kgdb_handle_exception(0, SIGTRAP, 0, regs);
+		local_irq_restore(flags);
+		return NOTIFY_STOP;
+	default:
+		/* Userspace events, ignore. */
+		if (user_mode(regs))
+			return NOTIFY_DONE;
+	}
+
+	local_irq_save(flags);
+	ret = kgdb_handle_exception(args->trapnr, args->signr, args->err, regs);
+	local_irq_restore(flags);
+	if (ret)
+		return NOTIFY_DONE;
+
+	return NOTIFY_STOP;
+}
+
+static struct notifier_block kgdb_notifier = {
+	.notifier_call = kgdb_notify,
+};
+
+/*
+ * kgdb_arch_handle_exception - Handle architecture specific GDB packets.
+ * @vector: The error vector of the exception that happened.
+ * @signo: The signal number of the exception that happened.
+ * @err_code: The error code of the exception that happened.
+ * @remcom_in_buffer: The buffer of the packet we have read.
+ * @remcom_out_buffer: The buffer of %BUFMAX bytes to write a packet into.
+ * @regs: The &struct pt_regs of the current process.
+ *
+ * This function MUST handle the 'c' and 's' command packets,
+ * as well packets to set / remove a hardware breakpoint, if used.
+ * If there are additional packets which the hardware needs to handle,
+ * they are handled here. The code should return -1 if it wants to
+ * process more packets, and a %0 or %1 if it wants to exit from the
+ * kgdb callback.
+ */
+int kgdb_arch_handle_exception(int vector, int signo, int err_code,
+			       char *remcom_in_buffer, char *remcom_out_buffer,
+			       struct pt_regs *regs)
+{
+	char *ptr;
+	unsigned long address;
+
+	/* Undo any stepping we may have done. */
+	undo_single_step(regs);
+
+	switch (remcom_in_buffer[0]) {
+	case 'c':
+	case 's':
+	case 'D':
+	case 'k':
+		/*
+		 * Try to read optional parameter, pc unchanged if no parm.
+		 * If this was a compiled-in breakpoint, we need to move
+		 * to the next instruction or we will just breakpoint
+		 * over and over again.
+		 */
+		ptr = &remcom_in_buffer[1];
+		if (kgdb_hex2long(&ptr, &address))
+			regs->pc = address;
+		else if (*(unsigned long *)regs->pc == compiled_bpt)
+			regs->pc += BREAK_INSTR_SIZE;
+
+		if (remcom_in_buffer[0] == 's') {
+			do_single_step(regs);
+			kgdb_single_step = 1;
+			atomic_set(&kgdb_cpu_doing_single_step,
+				   raw_smp_processor_id());
+		} else
+			atomic_set(&kgdb_cpu_doing_single_step, -1);
+
+		return 0;
+	}
+
+	return -1; /* this means that we do not want to exit from the handler */
+}
+
+struct kgdb_arch arch_kgdb_ops;
+
+/*
+ * kgdb_arch_init - Perform any architecture specific initalization.
+ *
+ * This function will handle the initalization of any architecture
+ * specific callbacks.
+ */
+int kgdb_arch_init(void)
+{
+	tile_bundle_bits bundle = TILEGX_BPT_BUNDLE;
+
+	memcpy(arch_kgdb_ops.gdb_bpt_instr, &bundle, BREAK_INSTR_SIZE);
+	return register_die_notifier(&kgdb_notifier);
+}
+
+/*
+ * kgdb_arch_exit - Perform any architecture specific uninitalization.
+ *
+ * This function will handle the uninitalization of any architecture
+ * specific callbacks, for dynamic registration and unregistration.
+ */
+void kgdb_arch_exit(void)
+{
+	unregister_die_notifier(&kgdb_notifier);
+}
+
+int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
+{
+	int err;
+	unsigned long addr_wr = writable_address(bpt->bpt_addr);
+
+	if (addr_wr == 0)
+		return -1;
+
+	err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
+				BREAK_INSTR_SIZE);
+	if (err)
+		return err;
+
+	err = probe_kernel_write((char *)addr_wr, arch_kgdb_ops.gdb_bpt_instr,
+				 BREAK_INSTR_SIZE);
+	smp_wmb();
+	flush_icache_range((unsigned long)bpt->bpt_addr,
+			   (unsigned long)bpt->bpt_addr + BREAK_INSTR_SIZE);
+	return err;
+}
+
+int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
+{
+	int err;
+	unsigned long addr_wr = writable_address(bpt->bpt_addr);
+
+	if (addr_wr == 0)
+		return -1;
+
+	err = probe_kernel_write((char *)addr_wr, (char *)bpt->saved_instr,
+				 BREAK_INSTR_SIZE);
+	smp_wmb();
+	flush_icache_range((unsigned long)bpt->bpt_addr,
+			   (unsigned long)bpt->bpt_addr + BREAK_INSTR_SIZE);
+	return err;
+}
diff --git a/arch/tile/kernel/kprobes.c b/arch/tile/kernel/kprobes.c
new file mode 100644
index 000000000000..27cdcacbe81d
--- /dev/null
+++ b/arch/tile/kernel/kprobes.c
@@ -0,0 +1,528 @@
+/*
+ * arch/tile/kernel/kprobes.c
+ * Kprobes on TILE-Gx
+ *
+ * Some portions copied from the MIPS version.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ * Copyright 2006 Sony Corp.
+ * Copyright 2010 Cavium Networks
+ *
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <asm/cacheflush.h>
+
+#include <arch/opcode.h>
+
+DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
+DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
+
+tile_bundle_bits breakpoint_insn = TILEGX_BPT_BUNDLE;
+tile_bundle_bits breakpoint2_insn = TILEGX_BPT_BUNDLE | DIE_SSTEPBP;
+
+/*
+ * Check whether instruction is branch or jump, or if executing it
+ * has different results depending on where it is executed (e.g. lnk).
+ */
+static int __kprobes insn_has_control(kprobe_opcode_t insn)
+{
+	if (get_Mode(insn) != 0) {   /* Y-format bundle */
+		if (get_Opcode_Y1(insn) != RRR_1_OPCODE_Y1 ||
+		    get_RRROpcodeExtension_Y1(insn) != UNARY_RRR_1_OPCODE_Y1)
+			return 0;
+
+		switch (get_UnaryOpcodeExtension_Y1(insn)) {
+		case JALRP_UNARY_OPCODE_Y1:
+		case JALR_UNARY_OPCODE_Y1:
+		case JRP_UNARY_OPCODE_Y1:
+		case JR_UNARY_OPCODE_Y1:
+		case LNK_UNARY_OPCODE_Y1:
+			return 1;
+		default:
+			return 0;
+		}
+	}
+
+	switch (get_Opcode_X1(insn)) {
+	case BRANCH_OPCODE_X1:	/* branch instructions */
+	case JUMP_OPCODE_X1:	/* jump instructions: j and jal */
+		return 1;
+
+	case RRR_0_OPCODE_X1:   /* other jump instructions */
+		if (get_RRROpcodeExtension_X1(insn) != UNARY_RRR_0_OPCODE_X1)
+			return 0;
+		switch (get_UnaryOpcodeExtension_X1(insn)) {
+		case JALRP_UNARY_OPCODE_X1:
+		case JALR_UNARY_OPCODE_X1:
+		case JRP_UNARY_OPCODE_X1:
+		case JR_UNARY_OPCODE_X1:
+		case LNK_UNARY_OPCODE_X1:
+			return 1;
+		default:
+			return 0;
+		}
+	default:
+		return 0;
+	}
+}
+
+int __kprobes arch_prepare_kprobe(struct kprobe *p)
+{
+	unsigned long addr = (unsigned long)p->addr;
+
+	if (addr & (sizeof(kprobe_opcode_t) - 1))
+		return -EINVAL;
+
+	if (insn_has_control(*p->addr)) {
+		pr_notice("Kprobes for control instructions are not "
+			  "supported\n");
+		return -EINVAL;
+	}
+
+	/* insn: must be on special executable page on tile. */
+	p->ainsn.insn = get_insn_slot();
+	if (!p->ainsn.insn)
+		return -ENOMEM;
+
+	/*
+	 * In the kprobe->ainsn.insn[] array we store the original
+	 * instruction at index zero and a break trap instruction at
+	 * index one.
+	 */
+	memcpy(&p->ainsn.insn[0], p->addr, sizeof(kprobe_opcode_t));
+	p->ainsn.insn[1] = breakpoint2_insn;
+	p->opcode = *p->addr;
+
+	return 0;
+}
+
+void __kprobes arch_arm_kprobe(struct kprobe *p)
+{
+	unsigned long addr_wr;
+
+	/* Operate on writable kernel text mapping. */
+	addr_wr = (unsigned long)p->addr - MEM_SV_START + PAGE_OFFSET;
+
+	if (probe_kernel_write((void *)addr_wr, &breakpoint_insn,
+		sizeof(breakpoint_insn)))
+		pr_err("%s: failed to enable kprobe\n", __func__);
+
+	smp_wmb();
+	flush_insn_slot(p);
+}
+
+void __kprobes arch_disarm_kprobe(struct kprobe *kp)
+{
+	unsigned long addr_wr;
+
+	/* Operate on writable kernel text mapping. */
+	addr_wr = (unsigned long)kp->addr - MEM_SV_START + PAGE_OFFSET;
+
+	if (probe_kernel_write((void *)addr_wr, &kp->opcode,
+		sizeof(kp->opcode)))
+		pr_err("%s: failed to enable kprobe\n", __func__);
+
+	smp_wmb();
+	flush_insn_slot(kp);
+}
+
+void __kprobes arch_remove_kprobe(struct kprobe *p)
+{
+	if (p->ainsn.insn) {
+		free_insn_slot(p->ainsn.insn, 0);
+		p->ainsn.insn = NULL;
+	}
+}
+
+static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
+{
+	kcb->prev_kprobe.kp = kprobe_running();
+	kcb->prev_kprobe.status = kcb->kprobe_status;
+	kcb->prev_kprobe.saved_pc = kcb->kprobe_saved_pc;
+}
+
+static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
+{
+	__this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
+	kcb->kprobe_status = kcb->prev_kprobe.status;
+	kcb->kprobe_saved_pc = kcb->prev_kprobe.saved_pc;
+}
+
+static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
+			struct kprobe_ctlblk *kcb)
+{
+	__this_cpu_write(current_kprobe, p);
+	kcb->kprobe_saved_pc = regs->pc;
+}
+
+static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
+{
+	/* Single step inline if the instruction is a break. */
+	if (p->opcode == breakpoint_insn ||
+	    p->opcode == breakpoint2_insn)
+		regs->pc = (unsigned long)p->addr;
+	else
+		regs->pc = (unsigned long)&p->ainsn.insn[0];
+}
+
+static int __kprobes kprobe_handler(struct pt_regs *regs)
+{
+	struct kprobe *p;
+	int ret = 0;
+	kprobe_opcode_t *addr;
+	struct kprobe_ctlblk *kcb;
+
+	addr = (kprobe_opcode_t *)regs->pc;
+
+	/*
+	 * We don't want to be preempted for the entire
+	 * duration of kprobe processing.
+	 */
+	preempt_disable();
+	kcb = get_kprobe_ctlblk();
+
+	/* Check we're not actually recursing. */
+	if (kprobe_running()) {
+		p = get_kprobe(addr);
+		if (p) {
+			if (kcb->kprobe_status == KPROBE_HIT_SS &&
+			    p->ainsn.insn[0] == breakpoint_insn) {
+				goto no_kprobe;
+			}
+			/*
+			 * We have reentered the kprobe_handler(), since
+			 * another probe was hit while within the handler.
+			 * We here save the original kprobes variables and
+			 * just single step on the instruction of the new probe
+			 * without calling any user handlers.
+			 */
+			save_previous_kprobe(kcb);
+			set_current_kprobe(p, regs, kcb);
+			kprobes_inc_nmissed_count(p);
+			prepare_singlestep(p, regs);
+			kcb->kprobe_status = KPROBE_REENTER;
+			return 1;
+		} else {
+			if (*addr != breakpoint_insn) {
+				/*
+				 * The breakpoint instruction was removed by
+				 * another cpu right after we hit, no further
+				 * handling of this interrupt is appropriate.
+				 */
+				ret = 1;
+				goto no_kprobe;
+			}
+			p = __this_cpu_read(current_kprobe);
+			if (p->break_handler && p->break_handler(p, regs))
+				goto ss_probe;
+		}
+		goto no_kprobe;
+	}
+
+	p = get_kprobe(addr);
+	if (!p) {
+		if (*addr != breakpoint_insn) {
+			/*
+			 * The breakpoint instruction was removed right
+			 * after we hit it.  Another cpu has removed
+			 * either a probepoint or a debugger breakpoint
+			 * at this address.  In either case, no further
+			 * handling of this interrupt is appropriate.
+			 */
+			ret = 1;
+		}
+		/* Not one of ours: let kernel handle it. */
+		goto no_kprobe;
+	}
+
+	set_current_kprobe(p, regs, kcb);
+	kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+
+	if (p->pre_handler && p->pre_handler(p, regs)) {
+		/* Handler has already set things up, so skip ss setup. */
+		return 1;
+	}
+
+ss_probe:
+	prepare_singlestep(p, regs);
+	kcb->kprobe_status = KPROBE_HIT_SS;
+	return 1;
+
+no_kprobe:
+	preempt_enable_no_resched();
+	return ret;
+}
+
+/*
+ * Called after single-stepping.  p->addr is the address of the
+ * instruction that has been replaced by the breakpoint. To avoid the
+ * SMP problems that can occur when we temporarily put back the
+ * original opcode to single-step, we single-stepped a copy of the
+ * instruction.  The address of this copy is p->ainsn.insn.
+ *
+ * This function prepares to return from the post-single-step
+ * breakpoint trap.
+ */
+static void __kprobes resume_execution(struct kprobe *p,
+				       struct pt_regs *regs,
+				       struct kprobe_ctlblk *kcb)
+{
+	unsigned long orig_pc = kcb->kprobe_saved_pc;
+	regs->pc = orig_pc + 8;
+}
+
+static inline int post_kprobe_handler(struct pt_regs *regs)
+{
+	struct kprobe *cur = kprobe_running();
+	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+	if (!cur)
+		return 0;
+
+	if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
+		kcb->kprobe_status = KPROBE_HIT_SSDONE;
+		cur->post_handler(cur, regs, 0);
+	}
+
+	resume_execution(cur, regs, kcb);
+
+	/* Restore back the original saved kprobes variables and continue. */
+	if (kcb->kprobe_status == KPROBE_REENTER) {
+		restore_previous_kprobe(kcb);
+		goto out;
+	}
+	reset_current_kprobe();
+out:
+	preempt_enable_no_resched();
+
+	return 1;
+}
+
+static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
+{
+	struct kprobe *cur = kprobe_running();
+	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+	if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
+		return 1;
+
+	if (kcb->kprobe_status & KPROBE_HIT_SS) {
+		/*
+		 * We are here because the instruction being single
+		 * stepped caused a page fault. We reset the current
+		 * kprobe and the ip points back to the probe address
+		 * and allow the page fault handler to continue as a
+		 * normal page fault.
+		 */
+		resume_execution(cur, regs, kcb);
+		reset_current_kprobe();
+		preempt_enable_no_resched();
+	}
+	return 0;
+}
+
+/*
+ * Wrapper routine for handling exceptions.
+ */
+int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
+				       unsigned long val, void *data)
+{
+	struct die_args *args = (struct die_args *)data;
+	int ret = NOTIFY_DONE;
+
+	switch (val) {
+	case DIE_BREAK:
+		if (kprobe_handler(args->regs))
+			ret = NOTIFY_STOP;
+		break;
+	case DIE_SSTEPBP:
+		if (post_kprobe_handler(args->regs))
+			ret = NOTIFY_STOP;
+		break;
+	case DIE_PAGE_FAULT:
+		/* kprobe_running() needs smp_processor_id(). */
+		preempt_disable();
+
+		if (kprobe_running()
+		    && kprobe_fault_handler(args->regs, args->trapnr))
+			ret = NOTIFY_STOP;
+		preempt_enable();
+		break;
+	default:
+		break;
+	}
+	return ret;
+}
+
+int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
+{
+	struct jprobe *jp = container_of(p, struct jprobe, kp);
+	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+	kcb->jprobe_saved_regs = *regs;
+	kcb->jprobe_saved_sp = regs->sp;
+
+	memcpy(kcb->jprobes_stack, (void *)kcb->jprobe_saved_sp,
+	       MIN_JPROBES_STACK_SIZE(kcb->jprobe_saved_sp));
+
+	regs->pc = (unsigned long)(jp->entry);
+
+	return 1;
+}
+
+/* Defined in the inline asm below. */
+void jprobe_return_end(void);
+
+void __kprobes jprobe_return(void)
+{
+	asm volatile(
+		"bpt\n\t"
+		".globl jprobe_return_end\n"
+		"jprobe_return_end:\n");
+}
+
+int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
+{
+	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+	if (regs->pc >= (unsigned long)jprobe_return &&
+	    regs->pc <= (unsigned long)jprobe_return_end) {
+		*regs = kcb->jprobe_saved_regs;
+		memcpy((void *)kcb->jprobe_saved_sp, kcb->jprobes_stack,
+		       MIN_JPROBES_STACK_SIZE(kcb->jprobe_saved_sp));
+		preempt_enable_no_resched();
+
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * Function return probe trampoline:
+ * - init_kprobes() establishes a probepoint here
+ * - When the probed function returns, this probe causes the
+ *   handlers to fire
+ */
+static void __used kretprobe_trampoline_holder(void)
+{
+	asm volatile(
+		"nop\n\t"
+		".global kretprobe_trampoline\n"
+		"kretprobe_trampoline:\n\t"
+		"nop\n\t"
+		: : : "memory");
+}
+
+void kretprobe_trampoline(void);
+
+void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
+				      struct pt_regs *regs)
+{
+	ri->ret_addr = (kprobe_opcode_t *) regs->lr;
+
+	/* Replace the return addr with trampoline addr */
+	regs->lr = (unsigned long)kretprobe_trampoline;
+}
+
+/*
+ * Called when the probe at kretprobe trampoline is hit.
+ */
+static int __kprobes trampoline_probe_handler(struct kprobe *p,
+						struct pt_regs *regs)
+{
+	struct kretprobe_instance *ri = NULL;
+	struct hlist_head *head, empty_rp;
+	struct hlist_node *tmp;
+	unsigned long flags, orig_ret_address = 0;
+	unsigned long trampoline_address = (unsigned long)kretprobe_trampoline;
+
+	INIT_HLIST_HEAD(&empty_rp);
+	kretprobe_hash_lock(current, &head, &flags);
+
+	/*
+	 * It is possible to have multiple instances associated with a given
+	 * task either because multiple functions in the call path have
+	 * a return probe installed on them, and/or more than one return
+	 * return probe was registered for a target function.
+	 *
+	 * We can handle this because:
+	 *     - instances are always inserted at the head of the list
+	 *     - when multiple return probes are registered for the same
+	 *       function, the first instance's ret_addr will point to the
+	 *       real return address, and all the rest will point to
+	 *       kretprobe_trampoline
+	 */
+	hlist_for_each_entry_safe(ri, tmp, head, hlist) {
+		if (ri->task != current)
+			/* another task is sharing our hash bucket */
+			continue;
+
+		if (ri->rp && ri->rp->handler)
+			ri->rp->handler(ri, regs);
+
+		orig_ret_address = (unsigned long)ri->ret_addr;
+		recycle_rp_inst(ri, &empty_rp);
+
+		if (orig_ret_address != trampoline_address) {
+			/*
+			 * This is the real return address. Any other
+			 * instances associated with this task are for
+			 * other calls deeper on the call stack
+			 */
+			break;
+		}
+	}
+
+	kretprobe_assert(ri, orig_ret_address, trampoline_address);
+	instruction_pointer(regs) = orig_ret_address;
+
+	reset_current_kprobe();
+	kretprobe_hash_unlock(current, &flags);
+	preempt_enable_no_resched();
+
+	hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
+		hlist_del(&ri->hlist);
+		kfree(ri);
+	}
+	/*
+	 * By returning a non-zero value, we are telling
+	 * kprobe_handler() that we don't want the post_handler
+	 * to run (and have re-enabled preemption)
+	 */
+	return 1;
+}
+
+int __kprobes arch_trampoline_kprobe(struct kprobe *p)
+{
+	if (p->addr == (kprobe_opcode_t *)kretprobe_trampoline)
+		return 1;
+
+	return 0;
+}
+
+static struct kprobe trampoline_p = {
+	.addr = (kprobe_opcode_t *)kretprobe_trampoline,
+	.pre_handler = trampoline_probe_handler
+};
+
+int __init arch_init_kprobes(void)
+{
+	register_kprobe(&trampoline_p);
+	return 0;
+}
diff --git a/arch/tile/kernel/mcount_64.S b/arch/tile/kernel/mcount_64.S
new file mode 100644
index 000000000000..70d7bb0c4d8f
--- /dev/null
+++ b/arch/tile/kernel/mcount_64.S
@@ -0,0 +1,224 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ * TILE-Gx specific __mcount support
+ */
+
+#include <linux/linkage.h>
+#include <asm/ftrace.h>
+
+#define REGSIZE 8
+
+	.text
+	.global __mcount
+
+	.macro	MCOUNT_SAVE_REGS
+	addli	sp, sp, -REGSIZE
+	{
+	 st     sp, lr
+	 addli	r29, sp, - (12 * REGSIZE)
+	}
+	{
+	 addli	sp, sp, - (13 * REGSIZE)
+	 st     r29, sp
+	}
+	addli	r29, r29, REGSIZE
+	{ st	r29, r0; addli	r29, r29, REGSIZE }
+	{ st	r29, r1; addli	r29, r29, REGSIZE }
+	{ st	r29, r2; addli	r29, r29, REGSIZE }
+	{ st	r29, r3; addli	r29, r29, REGSIZE }
+	{ st	r29, r4; addli	r29, r29, REGSIZE }
+	{ st	r29, r5; addli	r29, r29, REGSIZE }
+	{ st	r29, r6; addli	r29, r29, REGSIZE }
+	{ st	r29, r7; addli	r29, r29, REGSIZE }
+	{ st	r29, r8; addli	r29, r29, REGSIZE }
+	{ st	r29, r9; addli	r29, r29, REGSIZE }
+	{ st	r29, r10; addli	r29, r29, REGSIZE }
+	.endm
+
+	.macro	MCOUNT_RESTORE_REGS
+	addli	r29, sp, (2 * REGSIZE)
+	{ ld	r0, r29; addli	r29, r29, REGSIZE }
+	{ ld	r1, r29; addli	r29, r29, REGSIZE }
+	{ ld	r2, r29; addli	r29, r29, REGSIZE }
+	{ ld	r3, r29; addli	r29, r29, REGSIZE }
+	{ ld	r4, r29; addli	r29, r29, REGSIZE }
+	{ ld	r5, r29; addli	r29, r29, REGSIZE }
+	{ ld	r6, r29; addli	r29, r29, REGSIZE }
+	{ ld	r7, r29; addli	r29, r29, REGSIZE }
+	{ ld	r8, r29; addli	r29, r29, REGSIZE }
+	{ ld	r9, r29; addli	r29, r29, REGSIZE }
+	{ ld	r10, r29; addli	lr, sp, (13 * REGSIZE) }
+	{ ld	lr, lr;  addli	sp, sp, (14 * REGSIZE) }
+	.endm
+
+	.macro  RETURN_BACK
+	{ move	r12, lr; move	lr, r10 }
+	jrp	r12
+	.endm
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+	.align	64
+STD_ENTRY(__mcount)
+__mcount:
+	j	ftrace_stub
+STD_ENDPROC(__mcount)
+
+	.align	64
+STD_ENTRY(ftrace_caller)
+	moveli	r11, hw2_last(function_trace_stop)
+	{ shl16insli	r11, r11, hw1(function_trace_stop); move r12, lr }
+	{ shl16insli	r11, r11, hw0(function_trace_stop); move lr, r10 }
+	ld	r11, r11
+	beqz	r11, 1f
+	jrp	r12
+
+1:
+	{ move	r10, lr; move	lr, r12 }
+	MCOUNT_SAVE_REGS
+
+	/* arg1: self return address */
+	/* arg2: parent's return address */
+	{ move	r0, lr; move	r1, r10 }
+
+	.global	ftrace_call
+ftrace_call:
+	/*
+	 * a placeholder for the call to a real tracing function, i.e.
+	 * ftrace_trace_function()
+	 */
+	nop
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	.global	ftrace_graph_call
+ftrace_graph_call:
+	/*
+	 * a placeholder for the call to a real tracing function, i.e.
+	 * ftrace_graph_caller()
+	 */
+	nop
+#endif
+	MCOUNT_RESTORE_REGS
+	.global	ftrace_stub
+ftrace_stub:
+	RETURN_BACK
+STD_ENDPROC(ftrace_caller)
+
+#else /* ! CONFIG_DYNAMIC_FTRACE */
+
+	.align	64
+STD_ENTRY(__mcount)
+	moveli	r11, hw2_last(function_trace_stop)
+	{ shl16insli	r11, r11, hw1(function_trace_stop); move r12, lr }
+	{ shl16insli	r11, r11, hw0(function_trace_stop); move lr, r10 }
+	ld	r11, r11
+	beqz	r11, 1f
+	jrp	r12
+
+1:
+	{ move	r10, lr; move	lr, r12 }
+	{
+	 moveli	r11, hw2_last(ftrace_trace_function)
+	 moveli	r13, hw2_last(ftrace_stub)
+	}
+	{
+	 shl16insli	r11, r11, hw1(ftrace_trace_function)
+	 shl16insli	r13, r13, hw1(ftrace_stub)
+	}
+	{
+	 shl16insli	r11, r11, hw0(ftrace_trace_function)
+	 shl16insli	r13, r13, hw0(ftrace_stub)
+	}
+
+	ld	r11, r11
+	sub	r14, r13, r11
+	bnez	r14, static_trace
+
+#ifdef	CONFIG_FUNCTION_GRAPH_TRACER
+	moveli	r15, hw2_last(ftrace_graph_return)
+	shl16insli	r15, r15, hw1(ftrace_graph_return)
+	shl16insli	r15, r15, hw0(ftrace_graph_return)
+	ld	r15, r15
+	sub	r15, r15, r13
+	bnez	r15, ftrace_graph_caller
+
+	{
+	 moveli	r16, hw2_last(ftrace_graph_entry)
+	 moveli	r17, hw2_last(ftrace_graph_entry_stub)
+	}
+	{
+	 shl16insli	r16, r16, hw1(ftrace_graph_entry)
+	 shl16insli	r17, r17, hw1(ftrace_graph_entry_stub)
+	}
+	{
+	 shl16insli	r16, r16, hw0(ftrace_graph_entry)
+	 shl16insli	r17, r17, hw0(ftrace_graph_entry_stub)
+	}
+	ld	r16, r16
+	sub	r17, r16, r17
+	bnez	r17, ftrace_graph_caller
+
+#endif
+	RETURN_BACK
+
+static_trace:
+	MCOUNT_SAVE_REGS
+
+	/* arg1: self return address */
+	/* arg2: parent's return address */
+	{ move	r0, lr; move	r1, r10 }
+
+	/* call ftrace_trace_function() */
+	jalr	r11
+
+	MCOUNT_RESTORE_REGS
+
+	.global ftrace_stub
+ftrace_stub:
+	RETURN_BACK
+STD_ENDPROC(__mcount)
+
+#endif	/* ! CONFIG_DYNAMIC_FTRACE */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+STD_ENTRY(ftrace_graph_caller)
+ftrace_graph_caller:
+#ifndef CONFIG_DYNAMIC_FTRACE
+	MCOUNT_SAVE_REGS
+#endif
+
+	/* arg1: Get the location of the parent's return address */
+	addi	r0, sp, 12 * REGSIZE
+	/* arg2: Get self return address */
+	move	r1, lr
+
+	jal prepare_ftrace_return
+
+	MCOUNT_RESTORE_REGS
+	RETURN_BACK
+STD_ENDPROC(ftrace_graph_caller)
+
+	.global return_to_handler
+return_to_handler:
+	MCOUNT_SAVE_REGS
+
+	jal	ftrace_return_to_handler
+	/* restore the real parent address */
+	move	r11, r0
+
+	MCOUNT_RESTORE_REGS
+	jr	r11
+
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/tile/kernel/pci-dma.c b/arch/tile/kernel/pci-dma.c
index b9fe80ec1089..09b58703ac26 100644
--- a/arch/tile/kernel/pci-dma.c
+++ b/arch/tile/kernel/pci-dma.c
@@ -36,8 +36,9 @@ static void *tile_dma_alloc_coherent(struct device *dev, size_t size,
 				     dma_addr_t *dma_handle, gfp_t gfp,
 				     struct dma_attrs *attrs)
 {
-	u64 dma_mask = dev->coherent_dma_mask ?: DMA_BIT_MASK(32);
-	int node = dev_to_node(dev);
+	u64 dma_mask = (dev && dev->coherent_dma_mask) ?
+		dev->coherent_dma_mask : DMA_BIT_MASK(32);
+	int node = dev ? dev_to_node(dev) : 0;
 	int order = get_order(size);
 	struct page *pg;
 	dma_addr_t addr;
@@ -256,7 +257,7 @@ static void tile_dma_unmap_page(struct device *dev, dma_addr_t dma_address,
 	BUG_ON(!valid_dma_direction(direction));
 
 	__dma_complete_page(pfn_to_page(PFN_DOWN(dma_address)),
-			    dma_address & PAGE_OFFSET, size, direction);
+			    dma_address & (PAGE_SIZE - 1), size, direction);
 }
 
 static void tile_dma_sync_single_for_cpu(struct device *dev,
@@ -357,7 +358,7 @@ static void *tile_pci_dma_alloc_coherent(struct device *dev, size_t size,
 
 	addr = page_to_phys(pg);
 
-	*dma_handle = phys_to_dma(dev, addr);
+	*dma_handle = addr + get_dma_offset(dev);
 
 	return page_address(pg);
 }
@@ -387,7 +388,7 @@ static int tile_pci_dma_map_sg(struct device *dev, struct scatterlist *sglist,
 		sg->dma_address = sg_phys(sg);
 		__dma_prep_pa_range(sg->dma_address, sg->length, direction);
 
-		sg->dma_address = phys_to_dma(dev, sg->dma_address);
+		sg->dma_address = sg->dma_address + get_dma_offset(dev);
 #ifdef CONFIG_NEED_SG_DMA_LENGTH
 		sg->dma_length = sg->length;
 #endif
@@ -422,7 +423,7 @@ static dma_addr_t tile_pci_dma_map_page(struct device *dev, struct page *page,
 	BUG_ON(offset + size > PAGE_SIZE);
 	__dma_prep_page(page, offset, size, direction);
 
-	return phys_to_dma(dev, page_to_pa(page) + offset);
+	return page_to_pa(page) + offset + get_dma_offset(dev);
 }
 
 static void tile_pci_dma_unmap_page(struct device *dev, dma_addr_t dma_address,
@@ -432,10 +433,10 @@ static void tile_pci_dma_unmap_page(struct device *dev, dma_addr_t dma_address,
 {
 	BUG_ON(!valid_dma_direction(direction));
 
-	dma_address = dma_to_phys(dev, dma_address);
+	dma_address -= get_dma_offset(dev);
 
 	__dma_complete_page(pfn_to_page(PFN_DOWN(dma_address)),
-			    dma_address & PAGE_OFFSET, size, direction);
+			    dma_address & (PAGE_SIZE - 1), size, direction);
 }
 
 static void tile_pci_dma_sync_single_for_cpu(struct device *dev,
@@ -445,7 +446,7 @@ static void tile_pci_dma_sync_single_for_cpu(struct device *dev,
 {
 	BUG_ON(!valid_dma_direction(direction));
 
-	dma_handle = dma_to_phys(dev, dma_handle);
+	dma_handle -= get_dma_offset(dev);
 
 	__dma_complete_pa_range(dma_handle, size, direction);
 }
@@ -456,7 +457,7 @@ static void tile_pci_dma_sync_single_for_device(struct device *dev,
 						enum dma_data_direction
 						direction)
 {
-	dma_handle = dma_to_phys(dev, dma_handle);
+	dma_handle -= get_dma_offset(dev);
 
 	__dma_prep_pa_range(dma_handle, size, direction);
 }
@@ -558,22 +559,47 @@ static struct dma_map_ops pci_swiotlb_dma_ops = {
 	.mapping_error = swiotlb_dma_mapping_error,
 };
 
+static struct dma_map_ops pci_hybrid_dma_ops = {
+	.alloc = tile_swiotlb_alloc_coherent,
+	.free = tile_swiotlb_free_coherent,
+	.map_page = tile_pci_dma_map_page,
+	.unmap_page = tile_pci_dma_unmap_page,
+	.map_sg = tile_pci_dma_map_sg,
+	.unmap_sg = tile_pci_dma_unmap_sg,
+	.sync_single_for_cpu = tile_pci_dma_sync_single_for_cpu,
+	.sync_single_for_device = tile_pci_dma_sync_single_for_device,
+	.sync_sg_for_cpu = tile_pci_dma_sync_sg_for_cpu,
+	.sync_sg_for_device = tile_pci_dma_sync_sg_for_device,
+	.mapping_error = tile_pci_dma_mapping_error,
+	.dma_supported = tile_pci_dma_supported
+};
+
 struct dma_map_ops *gx_legacy_pci_dma_map_ops = &pci_swiotlb_dma_ops;
+struct dma_map_ops *gx_hybrid_pci_dma_map_ops = &pci_hybrid_dma_ops;
 #else
 struct dma_map_ops *gx_legacy_pci_dma_map_ops;
+struct dma_map_ops *gx_hybrid_pci_dma_map_ops;
 #endif
 EXPORT_SYMBOL(gx_legacy_pci_dma_map_ops);
+EXPORT_SYMBOL(gx_hybrid_pci_dma_map_ops);
 
 #ifdef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK
 int dma_set_coherent_mask(struct device *dev, u64 mask)
 {
 	struct dma_map_ops *dma_ops = get_dma_ops(dev);
 
-	/* Handle legacy PCI devices with limited memory addressability. */
-	if (((dma_ops == gx_pci_dma_map_ops) ||
-	    (dma_ops == gx_legacy_pci_dma_map_ops)) &&
-	    (mask <= DMA_BIT_MASK(32))) {
-		if (mask > dev->archdata.max_direct_dma_addr)
+	/*
+	 * For PCI devices with 64-bit DMA addressing capability, promote
+	 * the dma_ops to full capability for both streams and consistent
+	 * memory access. For 32-bit capable devices, limit the consistent 
+	 * memory DMA range to max_direct_dma_addr.
+	 */
+	if (dma_ops == gx_pci_dma_map_ops ||
+	    dma_ops == gx_hybrid_pci_dma_map_ops ||
+	    dma_ops == gx_legacy_pci_dma_map_ops) {
+		if (mask == DMA_BIT_MASK(64))
+			set_dma_ops(dev, gx_pci_dma_map_ops);
+		else if (mask > dev->archdata.max_direct_dma_addr)
 			mask = dev->archdata.max_direct_dma_addr;
 	}
 
@@ -584,3 +610,21 @@ int dma_set_coherent_mask(struct device *dev, u64 mask)
 }
 EXPORT_SYMBOL(dma_set_coherent_mask);
 #endif
+
+#ifdef ARCH_HAS_DMA_GET_REQUIRED_MASK
+/*
+ * The generic dma_get_required_mask() uses the highest physical address
+ * (max_pfn) to provide the hint to the PCI drivers regarding 32-bit or
+ * 64-bit DMA configuration. Since TILEGx has I/O TLB/MMU, allowing the
+ * DMAs to use the full 64-bit PCI address space and not limited by
+ * the physical memory space, we always let the PCI devices use
+ * 64-bit DMA if they have that capability, by returning the 64-bit
+ * DMA mask here. The device driver has the option to use 32-bit DMA if
+ * the device is not capable of 64-bit DMA.
+ */
+u64 dma_get_required_mask(struct device *dev)
+{
+	return DMA_BIT_MASK(64);
+}
+EXPORT_SYMBOL_GPL(dma_get_required_mask);
+#endif
diff --git a/arch/tile/kernel/pci.c b/arch/tile/kernel/pci.c
index 67237d34c2e2..b7180e6e900d 100644
--- a/arch/tile/kernel/pci.c
+++ b/arch/tile/kernel/pci.c
@@ -20,7 +20,6 @@
 #include <linux/capability.h>
 #include <linux/sched.h>
 #include <linux/errno.h>
-#include <linux/bootmem.h>
 #include <linux/irq.h>
 #include <linux/io.h>
 #include <linux/uaccess.h>
@@ -52,6 +51,8 @@
  *
  */
 
+static int pci_probe = 1;
+
 /*
  * This flag tells if the platform is TILEmpower that needs
  * special configuration for the PLX switch chip.
@@ -144,6 +145,11 @@ int __init tile_pci_init(void)
 {
 	int i;
 
+	if (!pci_probe) {
+		pr_info("PCI: disabled by boot argument\n");
+		return 0;
+	}
+
 	pr_info("PCI: Searching for controllers...\n");
 
 	/* Re-init number of PCIe controllers to support hot-plug feature. */
@@ -192,7 +198,6 @@ int __init tile_pci_init(void)
 			controller->hv_cfg_fd[0] = hv_cfg_fd0;
 			controller->hv_cfg_fd[1] = hv_cfg_fd1;
 			controller->hv_mem_fd = hv_mem_fd;
-			controller->first_busno = 0;
 			controller->last_busno = 0xff;
 			controller->ops = &tile_cfg_ops;
 
@@ -283,7 +288,7 @@ int __init pcibios_init(void)
 	 * known to require at least 20ms here, but we use a more
 	 * conservative value.
 	 */
-	mdelay(250);
+	msleep(250);
 
 	/* Scan all of the recorded PCI controllers.  */
 	for (i = 0; i < TILE_NUM_PCIE; i++) {
@@ -304,18 +309,10 @@ int __init pcibios_init(void)
 
 			pr_info("PCI: initializing controller #%d\n", i);
 
-			/*
-			 * This comes from the generic Linux PCI driver.
-			 *
-			 * It reads the PCI tree for this bus into the Linux
-			 * data structures.
-			 *
-			 * This is inlined in linux/pci.h and calls into
-			 * pci_scan_bus_parented() in probe.c.
-			 */
 			pci_add_resource(&resources, &ioport_resource);
 			pci_add_resource(&resources, &iomem_resource);
-			bus = pci_scan_root_bus(NULL, 0, controller->ops, controller, &resources);
+			bus = pci_scan_root_bus(NULL, 0, controller->ops,
+						controller, &resources);
 			controller->root_bus = bus;
 			controller->last_busno = bus->busn_res.end;
 		}
@@ -388,6 +385,16 @@ void pcibios_set_master(struct pci_dev *dev)
 	/* No special bus mastering setup handling. */
 }
 
+/* Process any "pci=" kernel boot arguments. */
+char *__init pcibios_setup(char *str)
+{
+	if (!strcmp(str, "off")) {
+		pci_probe = 0;
+		return NULL;
+	}
+	return str;
+}
+
 /*
  * Enable memory and/or address decoding, as appropriate, for the
  * device described by the 'dev' struct.
diff --git a/arch/tile/kernel/pci_gx.c b/arch/tile/kernel/pci_gx.c
index 6640e7bbeaa2..a97a6452b812 100644
--- a/arch/tile/kernel/pci_gx.c
+++ b/arch/tile/kernel/pci_gx.c
@@ -69,19 +69,32 @@ static int pcie_rc[TILEGX_NUM_TRIO][TILEGX_TRIO_PCIES];
  * a HW PCIe link-training bug. The exact delay is specified with
  * a kernel boot argument in the form of "pcie_rc_delay=T,P,S",
  * where T is the TRIO instance number, P is the port number and S is
- * the delay in seconds. If the delay is not provided, the value
- * will be DEFAULT_RC_DELAY.
+ * the delay in seconds. If the argument is specified, but the delay is
+ * not provided, the value will be DEFAULT_RC_DELAY.
  */
 static int rc_delay[TILEGX_NUM_TRIO][TILEGX_TRIO_PCIES];
 
 /* Default number of seconds that the PCIe RC port probe can be delayed. */
 #define DEFAULT_RC_DELAY	10
 
-/* Max number of seconds that the PCIe RC port probe can be delayed. */
-#define MAX_RC_DELAY		20
+/* The PCI I/O space size in each PCI domain. */
+#define IO_SPACE_SIZE		0x10000
+
+/* Provide shorter versions of some very long constant names. */
+#define AUTO_CONFIG_RC	\
+	TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_CONFIG_RC
+#define AUTO_CONFIG_RC_G1	\
+	TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_CONFIG_RC_G1
+#define AUTO_CONFIG_EP	\
+	TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_CONFIG_ENDPOINT
+#define AUTO_CONFIG_EP_G1	\
+	TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_CONFIG_ENDPOINT_G1
 
 /* Array of the PCIe ports configuration info obtained from the BIB. */
-struct pcie_port_property pcie_ports[TILEGX_NUM_TRIO][TILEGX_TRIO_PCIES];
+struct pcie_trio_ports_property pcie_ports[TILEGX_NUM_TRIO];
+
+/* Number of configured TRIO instances. */
+int num_trio_shims;
 
 /* All drivers share the TRIO contexts defined here. */
 gxio_trio_context_t trio_contexts[TILEGX_NUM_TRIO];
@@ -89,24 +102,21 @@ gxio_trio_context_t trio_contexts[TILEGX_NUM_TRIO];
 /* Pointer to an array of PCIe RC controllers. */
 struct pci_controller pci_controllers[TILEGX_NUM_TRIO * TILEGX_TRIO_PCIES];
 int num_rc_controllers;
-static int num_ep_controllers;
 
 static struct pci_ops tile_cfg_ops;
 
 /* Mask of CPUs that should receive PCIe interrupts. */
 static struct cpumask intr_cpus_map;
 
-/*
- * We don't need to worry about the alignment of resources.
- */
+/* We don't need to worry about the alignment of resources. */
 resource_size_t pcibios_align_resource(void *data, const struct resource *res,
-				resource_size_t size, resource_size_t align)
+				       resource_size_t size,
+				       resource_size_t align)
 {
 	return res->start;
 }
 EXPORT_SYMBOL(pcibios_align_resource);
 
-
 /*
  * Pick a CPU to receive and handle the PCIe interrupts, based on the IRQ #.
  * For now, we simply send interrupts to non-dataplane CPUs.
@@ -134,24 +144,19 @@ static int tile_irq_cpu(int irq)
 	return cpu;
 }
 
-/*
- * Open a file descriptor to the TRIO shim.
- */
+/* Open a file descriptor to the TRIO shim. */
 static int tile_pcie_open(int trio_index)
 {
 	gxio_trio_context_t *context = &trio_contexts[trio_index];
 	int ret;
+	int mac;
 
-	/*
-	 * This opens a file descriptor to the TRIO shim.
-	 */
+	/* This opens a file descriptor to the TRIO shim. */
 	ret = gxio_trio_init(context, trio_index);
 	if (ret < 0)
-		return ret;
+		goto gxio_trio_init_failure;
 
-	/*
-	 * Allocate an ASID for the kernel.
-	 */
+	/* Allocate an ASID for the kernel. */
 	ret = gxio_trio_alloc_asids(context, 1, 0, 0);
 	if (ret < 0) {
 		pr_err("PCI: ASID alloc failure on TRIO %d, give up\n",
@@ -189,31 +194,97 @@ static int tile_pcie_open(int trio_index)
 	}
 #endif
 
+	/* Get the properties of the PCIe ports on this TRIO instance. */
+	ret = gxio_trio_get_port_property(context, &pcie_ports[trio_index]);
+	if (ret < 0) {
+		pr_err("PCI: PCIE_GET_PORT_PROPERTY failure, error %d,"
+		       " on TRIO %d\n", ret, trio_index);
+		goto get_port_property_failure;
+	}
+
+	context->mmio_base_mac =
+		iorpc_ioremap(context->fd, 0, HV_TRIO_CONFIG_IOREMAP_SIZE);
+	if (context->mmio_base_mac == NULL) {
+		pr_err("PCI: TRIO config space mapping failure, error %d,"
+		       " on TRIO %d\n", ret, trio_index);
+		ret = -ENOMEM;
+
+		goto trio_mmio_mapping_failure;
+	}
+
+	/* Check the port strap state which will override the BIB setting. */
+	for (mac = 0; mac < TILEGX_TRIO_PCIES; mac++) {
+		TRIO_PCIE_INTFC_PORT_CONFIG_t port_config;
+		unsigned int reg_offset;
+
+		/* Ignore ports that are not specified in the BIB. */
+		if (!pcie_ports[trio_index].ports[mac].allow_rc &&
+		    !pcie_ports[trio_index].ports[mac].allow_ep)
+			continue;
+
+		reg_offset =
+			(TRIO_PCIE_INTFC_PORT_CONFIG <<
+				TRIO_CFG_REGION_ADDR__REG_SHIFT) |
+			(TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_INTERFACE <<
+				TRIO_CFG_REGION_ADDR__INTFC_SHIFT) |
+			(mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT);
+
+		port_config.word =
+			__gxio_mmio_read(context->mmio_base_mac + reg_offset);
+
+		if (port_config.strap_state != AUTO_CONFIG_RC &&
+		    port_config.strap_state != AUTO_CONFIG_RC_G1) {
+			/*
+			 * If this is really intended to be an EP port, record
+			 * it so that the endpoint driver will know about it.
+			 */
+			if (port_config.strap_state == AUTO_CONFIG_EP ||
+			    port_config.strap_state == AUTO_CONFIG_EP_G1)
+				pcie_ports[trio_index].ports[mac].allow_ep = 1;
+		}
+	}
+
 	return ret;
 
+trio_mmio_mapping_failure:
+get_port_property_failure:
 asid_alloc_failure:
 #ifdef USE_SHARED_PCIE_CONFIG_REGION
 pio_alloc_failure:
 #endif
 	hv_dev_close(context->fd);
+gxio_trio_init_failure:
+	context->fd = -1;
 
 	return ret;
 }
 
-static void
-tilegx_legacy_irq_ack(struct irq_data *d)
+static int __init tile_trio_init(void)
+{
+	int i;
+
+	/* We loop over all the TRIO shims. */
+	for (i = 0; i < TILEGX_NUM_TRIO; i++) {
+		if (tile_pcie_open(i) < 0)
+			continue;
+		num_trio_shims++;
+	}
+
+	return 0;
+}
+postcore_initcall(tile_trio_init);
+
+static void tilegx_legacy_irq_ack(struct irq_data *d)
 {
 	__insn_mtspr(SPR_IPI_EVENT_RESET_K, 1UL << d->irq);
 }
 
-static void
-tilegx_legacy_irq_mask(struct irq_data *d)
+static void tilegx_legacy_irq_mask(struct irq_data *d)
 {
 	__insn_mtspr(SPR_IPI_MASK_SET_K, 1UL << d->irq);
 }
 
-static void
-tilegx_legacy_irq_unmask(struct irq_data *d)
+static void tilegx_legacy_irq_unmask(struct irq_data *d)
 {
 	__insn_mtspr(SPR_IPI_MASK_RESET_K, 1UL << d->irq);
 }
@@ -234,8 +305,7 @@ static struct irq_chip tilegx_legacy_irq_chip = {
  * to Linux which just calls handle_level_irq() after clearing the
  * MAC INTx Assert status bit associated with this interrupt.
  */
-static void
-trio_handle_level_irq(unsigned int irq, struct irq_desc *desc)
+static void trio_handle_level_irq(unsigned int irq, struct irq_desc *desc)
 {
 	struct pci_controller *controller = irq_desc_get_handler_data(desc);
 	gxio_trio_context_t *trio_context = controller->trio;
@@ -301,9 +371,7 @@ static int tile_init_irqs(struct pci_controller *controller)
 			goto free_irqs;
 		}
 
-		/*
-		 * Register the IRQ handler with the kernel.
-		 */
+		/* Register the IRQ handler with the kernel. */
 		irq_set_chip_and_handler(irq, &tilegx_legacy_irq_chip,
 					trio_handle_level_irq);
 		irq_set_chip_data(irq, (void *)(uint64_t)i);
@@ -320,14 +388,39 @@ free_irqs:
 }
 
 /*
+ * Return 1 if the port is strapped to operate in RC mode.
+ */
+static int
+strapped_for_rc(gxio_trio_context_t *trio_context, int mac)
+{
+	TRIO_PCIE_INTFC_PORT_CONFIG_t port_config;
+	unsigned int reg_offset;
+
+	/* Check the port configuration. */
+	reg_offset =
+		(TRIO_PCIE_INTFC_PORT_CONFIG <<
+			TRIO_CFG_REGION_ADDR__REG_SHIFT) |
+		(TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_INTERFACE <<
+			TRIO_CFG_REGION_ADDR__INTFC_SHIFT) |
+		(mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT);
+	port_config.word =
+		__gxio_mmio_read(trio_context->mmio_base_mac + reg_offset);
+
+	if (port_config.strap_state == AUTO_CONFIG_RC ||
+	    port_config.strap_state == AUTO_CONFIG_RC_G1)
+		return 1;
+	else
+		return 0;
+}
+
+/*
  * Find valid controllers and fill in pci_controller structs for each
  * of them.
  *
- * Returns the number of controllers discovered.
+ * Return the number of controllers discovered.
  */
 int __init tile_pci_init(void)
 {
-	int num_trio_shims = 0;
 	int ctl_index = 0;
 	int i, j;
 
@@ -338,64 +431,62 @@ int __init tile_pci_init(void)
 
 	pr_info("PCI: Searching for controllers...\n");
 
-	/*
-	 * We loop over all the TRIO shims.
-	 */
-	for (i = 0; i < TILEGX_NUM_TRIO; i++) {
-		int ret;
-
-		ret = tile_pcie_open(i);
-		if (ret < 0)
-			continue;
-
-		num_trio_shims++;
-	}
-
 	if (num_trio_shims == 0 || sim_is_simulator())
 		return 0;
 
 	/*
-	 * Now determine which PCIe ports are configured to operate in RC mode.
-	 * We look at the Board Information Block first and then see if there
-	 * are any overriding configuration by the HW strapping pin.
+	 * Now determine which PCIe ports are configured to operate in RC
+	 * mode. There is a differece in the port configuration capability
+	 * between the Gx36 and Gx72 devices.
+	 *
+	 * The Gx36 has configuration capability for each of the 3 PCIe
+	 * interfaces (disable, auto endpoint, auto RC, etc.).
+	 * On the Gx72, you can only select one of the 3 PCIe interfaces per
+	 * TRIO to train automatically. Further, the allowable training modes
+	 * are reduced to four options (auto endpoint, auto RC, stream x1,
+	 * stream x4).
+	 *
+	 * For Gx36 ports, it must be allowed to be in RC mode by the
+	 * Board Information Block, and the hardware strapping pins must be
+	 * set to RC mode.
+	 *
+	 * For Gx72 ports, the port will operate in RC mode if either of the
+	 * following is true:
+	 * 1. It is allowed to be in RC mode by the Board Information Block,
+	 *    and the BIB doesn't allow the EP mode.
+	 * 2. It is allowed to be in either the RC or the EP mode by the BIB,
+	 *    and the hardware strapping pin is set to RC mode.
 	 */
 	for (i = 0; i < TILEGX_NUM_TRIO; i++) {
 		gxio_trio_context_t *context = &trio_contexts[i];
-		int ret;
 
 		if (context->fd < 0)
 			continue;
 
-		ret = hv_dev_pread(context->fd, 0,
-			(HV_VirtAddr)&pcie_ports[i][0],
-			sizeof(struct pcie_port_property) * TILEGX_TRIO_PCIES,
-			GXIO_TRIO_OP_GET_PORT_PROPERTY);
-		if (ret < 0) {
-			pr_err("PCI: PCIE_GET_PORT_PROPERTY failure, error %d,"
-				" on TRIO %d\n", ret, i);
-			continue;
-		}
-
 		for (j = 0; j < TILEGX_TRIO_PCIES; j++) {
-			if (pcie_ports[i][j].allow_rc) {
+			int is_rc = 0;
+
+			if (pcie_ports[i].is_gx72 &&
+			    pcie_ports[i].ports[j].allow_rc) {
+				if (!pcie_ports[i].ports[j].allow_ep ||
+				    strapped_for_rc(context, j))
+					is_rc = 1;
+			} else if (pcie_ports[i].ports[j].allow_rc &&
+				   strapped_for_rc(context, j)) {
+				is_rc = 1;
+			}
+			if (is_rc) {
 				pcie_rc[i][j] = 1;
 				num_rc_controllers++;
 			}
-			else if (pcie_ports[i][j].allow_ep) {
-				num_ep_controllers++;
-			}
 		}
 	}
 
-	/*
-	 * Return if no PCIe ports are configured to operate in RC mode.
-	 */
+	/* Return if no PCIe ports are configured to operate in RC mode. */
 	if (num_rc_controllers == 0)
 		return 0;
 
-	/*
-	 * Set the TRIO pointer and MAC index for each PCIe RC port.
-	 */
+	/* Set the TRIO pointer and MAC index for each PCIe RC port. */
 	for (i = 0; i < TILEGX_NUM_TRIO; i++) {
 		for (j = 0; j < TILEGX_TRIO_PCIES; j++) {
 			if (pcie_rc[i][j]) {
@@ -411,26 +502,32 @@ int __init tile_pci_init(void)
 	}
 
 out:
-	/*
-	 * Configure each PCIe RC port.
-	 */
+	/* Configure each PCIe RC port. */
 	for (i = 0; i < num_rc_controllers; i++) {
-		/*
-		 * Configure the PCIe MAC to run in RC mode.
-		 */
 
+		/* Configure the PCIe MAC to run in RC mode. */
 		struct pci_controller *controller = &pci_controllers[i];
 
 		controller->index = i;
 		controller->ops = &tile_cfg_ops;
 
+		controller->io_space.start = PCIBIOS_MIN_IO +
+			(i * IO_SPACE_SIZE);
+		controller->io_space.end = controller->io_space.start +
+			IO_SPACE_SIZE - 1;
+		BUG_ON(controller->io_space.end > IO_SPACE_LIMIT);
+		controller->io_space.flags = IORESOURCE_IO;
+		snprintf(controller->io_space_name,
+			 sizeof(controller->io_space_name),
+			 "PCI I/O domain %d", i);
+		controller->io_space.name = controller->io_space_name;
+
 		/*
 		 * The PCI memory resource is located above the PA space.
 		 * For every host bridge, the BAR window or the MMIO aperture
 		 * is in range [3GB, 4GB - 1] of a 4GB space beyond the
 		 * PA space.
 		 */
-
 		controller->mem_offset = TILE_PCI_MEM_START +
 			(i * TILE_PCI_BAR_WINDOW_TOP);
 		controller->mem_space.start = controller->mem_offset +
@@ -458,7 +555,6 @@ static int tile_map_irq(const struct pci_dev *dev, u8 device, u8 pin)
 	return controller->irq_intx_table[pin - 1];
 }
 
-
 static void fixup_read_and_payload_sizes(struct pci_controller *controller)
 {
 	gxio_trio_context_t *trio_context = controller->trio;
@@ -472,9 +568,7 @@ static void fixup_read_and_payload_sizes(struct pci_controller *controller)
 
 	mac = controller->mac;
 
-	/*
-	 * Set our max read request size to be 4KB.
-	 */
+	/* Set our max read request size to be 4KB. */
 	reg_offset =
 		(TRIO_PCIE_RC_DEVICE_CONTROL <<
 			TRIO_CFG_REGION_ADDR__REG_SHIFT) |
@@ -483,10 +577,10 @@ static void fixup_read_and_payload_sizes(struct pci_controller *controller)
 		(mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT);
 
 	dev_control.word = __gxio_mmio_read32(trio_context->mmio_base_mac +
-						reg_offset);
+					      reg_offset);
 	dev_control.max_read_req_sz = 5;
 	__gxio_mmio_write32(trio_context->mmio_base_mac + reg_offset,
-						dev_control.word);
+			    dev_control.word);
 
 	/*
 	 * Set the max payload size supported by this Gx PCIe MAC.
@@ -502,10 +596,10 @@ static void fixup_read_and_payload_sizes(struct pci_controller *controller)
 		(mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT);
 
 	rc_dev_cap.word = __gxio_mmio_read32(trio_context->mmio_base_mac +
-						reg_offset);
+					     reg_offset);
 	rc_dev_cap.mps_sup = 1;
 	__gxio_mmio_write32(trio_context->mmio_base_mac + reg_offset,
-						rc_dev_cap.word);
+			    rc_dev_cap.word);
 
 	/* Configure PCI Express MPS setting. */
 	list_for_each_entry(child, &root_bus->children, node)
@@ -528,7 +622,7 @@ static void fixup_read_and_payload_sizes(struct pci_controller *controller)
 				    dev_control.max_payload_size,
 				    dev_control.max_read_req_sz,
 				    mac);
-        if (err < 0) {
+	if (err < 0) {
 		pr_err("PCI: PCIE_CONFIGURE_MAC_MPS_MRS failure, "
 			"MAC %d on TRIO %d\n",
 			mac, controller->trio_index);
@@ -565,21 +659,14 @@ static int setup_pcie_rc_delay(char *str)
 		if (!isdigit(*str))
 			return -EINVAL;
 		delay = simple_strtoul(str, (char **)&str, 10);
-		if (delay > MAX_RC_DELAY)
-			return -EINVAL;
 	}
 
 	rc_delay[trio_index][mac] = delay ? : DEFAULT_RC_DELAY;
-	pr_info("Delaying PCIe RC link training for %u sec"
-		" on MAC %lu on TRIO %lu\n", rc_delay[trio_index][mac],
-		mac, trio_index);
 	return 0;
 }
 early_param("pcie_rc_delay", setup_pcie_rc_delay);
 
-/*
- * PCI initialization entry point, called by subsys_initcall.
- */
+/* PCI initialization entry point, called by subsys_initcall. */
 int __init pcibios_init(void)
 {
 	resource_size_t offset;
@@ -589,35 +676,10 @@ int __init pcibios_init(void)
 
 	tile_pci_init();
 
-	if (num_rc_controllers == 0 && num_ep_controllers == 0)
+	if (num_rc_controllers == 0)
 		return 0;
 
 	/*
-	 * We loop over all the TRIO shims and set up the MMIO mappings.
-	 */
-	for (i = 0; i < TILEGX_NUM_TRIO; i++) {
-		gxio_trio_context_t *context = &trio_contexts[i];
-
-		if (context->fd < 0)
-			continue;
-
-		/*
-		 * Map in the MMIO space for the MAC.
-		 */
-		offset = 0;
-		context->mmio_base_mac =
-			iorpc_ioremap(context->fd, offset,
-				      HV_TRIO_CONFIG_IOREMAP_SIZE);
-		if (context->mmio_base_mac == NULL) {
-			pr_err("PCI: MAC map failure on TRIO %d\n", i);
-
-			hv_dev_close(context->fd);
-			context->fd = -1;
-			continue;
-		}
-	}
-
-	/*
 	 * Delay a bit in case devices aren't ready.  Some devices are
 	 * known to require at least 20ms here, but we use a more
 	 * conservative value.
@@ -628,7 +690,6 @@ int __init pcibios_init(void)
 	for (next_busno = 0, i = 0; i < num_rc_controllers; i++) {
 		struct pci_controller *controller = &pci_controllers[i];
 		gxio_trio_context_t *trio_context = controller->trio;
-		TRIO_PCIE_INTFC_PORT_CONFIG_t port_config;
 		TRIO_PCIE_INTFC_PORT_STATUS_t port_status;
 		TRIO_PCIE_INTFC_TX_FIFO_CTL_t tx_fifo_ctl;
 		struct pci_bus *bus;
@@ -645,75 +706,64 @@ int __init pcibios_init(void)
 		mac = controller->mac;
 
 		/*
-		 * Check the port strap state which will override the BIB
-		 * setting.
+		 * Check for PCIe link-up status to decide if we need
+		 * to force the link to come up.
 		 */
-
 		reg_offset =
-			(TRIO_PCIE_INTFC_PORT_CONFIG <<
+			(TRIO_PCIE_INTFC_PORT_STATUS <<
 				TRIO_CFG_REGION_ADDR__REG_SHIFT) |
 			(TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_INTERFACE <<
-				TRIO_CFG_REGION_ADDR__INTFC_SHIFT ) |
+				TRIO_CFG_REGION_ADDR__INTFC_SHIFT) |
 			(mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT);
 
-		port_config.word =
+		port_status.word =
 			__gxio_mmio_read(trio_context->mmio_base_mac +
 					 reg_offset);
-
-		if ((port_config.strap_state !=
-			TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_CONFIG_RC) &&
-			(port_config.strap_state !=
-			TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_CONFIG_RC_G1)) {
-			/*
-			 * If this is really intended to be an EP port,
-			 * record it so that the endpoint driver will know about it.
-			 */
-			if (port_config.strap_state ==
-			TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_CONFIG_ENDPOINT ||
-			port_config.strap_state ==
-			TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_CONFIG_ENDPOINT_G1)
-				pcie_ports[trio_index][mac].allow_ep = 1;
-
-			continue;
+		if (!port_status.dl_up) {
+			if (rc_delay[trio_index][mac]) {
+				pr_info("Delaying PCIe RC TRIO init %d sec"
+					" on MAC %d on TRIO %d\n",
+					rc_delay[trio_index][mac], mac,
+					trio_index);
+				msleep(rc_delay[trio_index][mac] * 1000);
+			}
+			ret = gxio_trio_force_rc_link_up(trio_context, mac);
+			if (ret < 0)
+				pr_err("PCI: PCIE_FORCE_LINK_UP failure, "
+					"MAC %d on TRIO %d\n", mac, trio_index);
 		}
 
-		/*
-		 * Delay the RC link training if needed.
-		 */
-		if (rc_delay[trio_index][mac])
-			msleep(rc_delay[trio_index][mac] * 1000);
-
-		ret = gxio_trio_force_rc_link_up(trio_context, mac);
-		if (ret < 0)
-			pr_err("PCI: PCIE_FORCE_LINK_UP failure, "
-				"MAC %d on TRIO %d\n", mac, trio_index);
-
 		pr_info("PCI: Found PCI controller #%d on TRIO %d MAC %d\n", i,
 			trio_index, controller->mac);
 
-		/*
-		 * Wait a bit here because some EP devices take longer
-		 * to come up.
-		 */
-		msleep(1000);
-
-		/*
-		 * Check for PCIe link-up status.
-		 */
-
-		reg_offset =
-			(TRIO_PCIE_INTFC_PORT_STATUS <<
-				TRIO_CFG_REGION_ADDR__REG_SHIFT) |
-			(TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_INTERFACE <<
-				TRIO_CFG_REGION_ADDR__INTFC_SHIFT ) |
-			(mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT);
+		/* Delay the bus probe if needed. */
+		if (rc_delay[trio_index][mac]) {
+			pr_info("Delaying PCIe RC bus enumerating %d sec"
+				" on MAC %d on TRIO %d\n",
+				rc_delay[trio_index][mac], mac,
+				trio_index);
+			msleep(rc_delay[trio_index][mac] * 1000);
+		} else {
+			/*
+			 * Wait a bit here because some EP devices
+			 * take longer to come up.
+			 */
+			msleep(1000);
+		}
 
+		/* Check for PCIe link-up status again. */
 		port_status.word =
 			__gxio_mmio_read(trio_context->mmio_base_mac +
 					 reg_offset);
 		if (!port_status.dl_up) {
-			pr_err("PCI: link is down, MAC %d on TRIO %d\n",
-				mac, trio_index);
+			if (pcie_ports[trio_index].ports[mac].removable) {
+				pr_info("PCI: link is down, MAC %d on TRIO %d\n",
+					mac, trio_index);
+				pr_info("This is expected if no PCIe card"
+					" is connected to this link\n");
+			} else
+				pr_err("PCI: link is down, MAC %d on TRIO %d\n",
+					mac, trio_index);
 			continue;
 		}
 
@@ -739,7 +789,6 @@ int __init pcibios_init(void)
 		 * Change the device ID so that Linux bus crawl doesn't confuse
 		 * the internal bridge with any Tilera endpoints.
 		 */
-
 		reg_offset =
 			(TRIO_PCIE_RC_DEVICE_ID_VEN_ID <<
 				TRIO_CFG_REGION_ADDR__REG_SHIFT) |
@@ -752,10 +801,7 @@ int __init pcibios_init(void)
 				    TRIO_PCIE_RC_DEVICE_ID_VEN_ID__DEV_ID_SHIFT) |
 				    TILERA_VENDOR_ID);
 
-		/*
-		 * Set the internal P2P bridge class code.
-		 */
-
+		/* Set the internal P2P bridge class code. */
 		reg_offset =
 			(TRIO_PCIE_RC_REVISION_ID <<
 				TRIO_CFG_REGION_ADDR__REG_SHIFT) |
@@ -766,26 +812,22 @@ int __init pcibios_init(void)
 		class_code_revision =
 			__gxio_mmio_read32(trio_context->mmio_base_mac +
 					   reg_offset);
-		class_code_revision = (class_code_revision & 0xff ) |
-					(PCI_CLASS_BRIDGE_PCI << 16);
+		class_code_revision = (class_code_revision & 0xff) |
+			(PCI_CLASS_BRIDGE_PCI << 16);
 
 		__gxio_mmio_write32(trio_context->mmio_base_mac +
 				    reg_offset, class_code_revision);
 
 #ifdef USE_SHARED_PCIE_CONFIG_REGION
 
-		/*
-		 * Map in the MMIO space for the PIO region.
-		 */
+		/* Map in the MMIO space for the PIO region. */
 		offset = HV_TRIO_PIO_OFFSET(trio_context->pio_cfg_index) |
 			(((unsigned long long)mac) <<
 			TRIO_TILE_PIO_REGION_SETUP_CFG_ADDR__MAC_SHIFT);
 
 #else
 
-		/*
-		 * Alloc a PIO region for PCI config access per MAC.
-		 */
+		/* Alloc a PIO region for PCI config access per MAC. */
 		ret = gxio_trio_alloc_pio_regions(trio_context, 1, 0, 0);
 		if (ret < 0) {
 			pr_err("PCI: PCI CFG PIO alloc failure for mac %d "
@@ -796,9 +838,7 @@ int __init pcibios_init(void)
 
 		trio_context->pio_cfg_index[mac] = ret;
 
-		/*
-		 * For PIO CFG, the bus_address_hi parameter is 0.
-		 */
+		/* For PIO CFG, the bus_address_hi parameter is 0. */
 		ret = gxio_trio_init_pio_region_aux(trio_context,
 			trio_context->pio_cfg_index[mac],
 			mac, 0, HV_TRIO_PIO_FLAG_CONFIG_SPACE);
@@ -815,9 +855,15 @@ int __init pcibios_init(void)
 
 #endif
 
+		/*
+		 * To save VMALLOC space, we take advantage of the fact that
+		 * bit 29 in the PIO CFG address format is reserved 0. With
+		 * TRIO_TILE_PIO_REGION_SETUP_CFG_ADDR__MAC_SHIFT being 30,
+		 * this cuts VMALLOC space usage from 1GB to 512MB per mac.
+		 */
 		trio_context->mmio_base_pio_cfg[mac] =
-			iorpc_ioremap(trio_context->fd, offset,
-			(1 << TRIO_TILE_PIO_REGION_SETUP_CFG_ADDR__MAC_SHIFT));
+			iorpc_ioremap(trio_context->fd, offset, (1UL <<
+			(TRIO_TILE_PIO_REGION_SETUP_CFG_ADDR__MAC_SHIFT - 1)));
 		if (trio_context->mmio_base_pio_cfg[mac] == NULL) {
 			pr_err("PCI: PIO map failure for mac %d on TRIO %d\n",
 				mac, trio_index);
@@ -825,9 +871,7 @@ int __init pcibios_init(void)
 			continue;
 		}
 
-		/*
-		 * Initialize the PCIe interrupts.
-		 */
+		/* Initialize the PCIe interrupts. */
 		if (tile_init_irqs(controller)) {
 			pr_err("PCI: IRQs init failure for mac %d on TRIO %d\n",
 				mac, trio_index);
@@ -838,17 +882,16 @@ int __init pcibios_init(void)
 		/*
 		 * The PCI memory resource is located above the PA space.
 		 * The memory range for the PCI root bus should not overlap
-		 * with the physical RAM
+		 * with the physical RAM.
 		 */
 		pci_add_resource_offset(&resources, &controller->mem_space,
 					controller->mem_offset);
-
+		pci_add_resource(&resources, &controller->io_space);
 		controller->first_busno = next_busno;
 		bus = pci_scan_root_bus(NULL, next_busno, controller->ops,
 					controller, &resources);
 		controller->root_bus = bus;
 		next_busno = bus->busn_res.end + 1;
-
 	}
 
 	/* Do machine dependent PCI interrupt routing */
@@ -860,7 +903,6 @@ int __init pcibios_init(void)
 	 * It allocates all of the resources (I/O memory, etc)
 	 * associated with the devices read in above.
 	 */
-
 	pci_assign_unassigned_resources();
 
 	/* Record the I/O resources in the PCI controller structure. */
@@ -868,9 +910,6 @@ int __init pcibios_init(void)
 		struct pci_controller *controller = &pci_controllers[i];
 		gxio_trio_context_t *trio_context = controller->trio;
 		struct pci_bus *root_bus = pci_controllers[i].root_bus;
-		struct pci_bus *next_bus;
-		uint32_t bus_address_hi;
-		struct pci_dev *dev;
 		int ret;
 		int j;
 
@@ -884,43 +923,12 @@ int __init pcibios_init(void)
 		/* Configure the max_payload_size values for this domain. */
 		fixup_read_and_payload_sizes(controller);
 
-		list_for_each_entry(dev, &root_bus->devices, bus_list) {
-			/* Find the PCI host controller, ie. the 1st bridge. */
-			if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI &&
-				(PCI_SLOT(dev->devfn) == 0)) {
-				next_bus = dev->subordinate;
-				pci_controllers[i].mem_resources[0] =
-					*next_bus->resource[0];
-				pci_controllers[i].mem_resources[1] =
-					 *next_bus->resource[1];
-				pci_controllers[i].mem_resources[2] =
-					 *next_bus->resource[2];
-
-				break;
-			}
-		}
-
-		if (pci_controllers[i].mem_resources[1].flags & IORESOURCE_MEM)
-			bus_address_hi =
-				pci_controllers[i].mem_resources[1].start >> 32;
-		else if (pci_controllers[i].mem_resources[2].flags & IORESOURCE_PREFETCH)
-			bus_address_hi =
-				pci_controllers[i].mem_resources[2].start >> 32;
-		else {
-			/* This is unlikely. */
-			pr_err("PCI: no memory resources on TRIO %d mac %d\n",
-				controller->trio_index, controller->mac);
-			continue;
-		}
-
-		/*
-		 * Alloc a PIO region for PCI memory access for each RC port.
-		 */
+		/* Alloc a PIO region for PCI memory access for each RC port. */
 		ret = gxio_trio_alloc_pio_regions(trio_context, 1, 0, 0);
 		if (ret < 0) {
 			pr_err("PCI: MEM PIO alloc failure on TRIO %d mac %d, "
-				"give up\n", controller->trio_index,
-				controller->mac);
+			       "give up\n", controller->trio_index,
+			       controller->mac);
 
 			continue;
 		}
@@ -938,12 +946,45 @@ int __init pcibios_init(void)
 						    0);
 		if (ret < 0) {
 			pr_err("PCI: MEM PIO init failure on TRIO %d mac %d, "
-				"give up\n", controller->trio_index,
-				controller->mac);
+			       "give up\n", controller->trio_index,
+			       controller->mac);
 
 			continue;
 		}
 
+#ifdef CONFIG_TILE_PCI_IO
+		/*
+		 * Alloc a PIO region for PCI I/O space access for each RC port.
+		 */
+		ret = gxio_trio_alloc_pio_regions(trio_context, 1, 0, 0);
+		if (ret < 0) {
+			pr_err("PCI: I/O PIO alloc failure on TRIO %d mac %d, "
+			       "give up\n", controller->trio_index,
+			       controller->mac);
+
+			continue;
+		}
+
+		controller->pio_io_index = ret;
+
+		/*
+		 * For PIO IO, the bus_address_hi parameter is hard-coded 0
+		 * because PCI I/O address space is 32-bit.
+		 */
+		ret = gxio_trio_init_pio_region_aux(trio_context,
+						    controller->pio_io_index,
+						    controller->mac,
+						    0,
+						    HV_TRIO_PIO_FLAG_IO_SPACE);
+		if (ret < 0) {
+			pr_err("PCI: I/O PIO init failure on TRIO %d mac %d, "
+			       "give up\n", controller->trio_index,
+			       controller->mac);
+
+			continue;
+		}
+#endif
+
 		/*
 		 * Configure a Mem-Map region for each memory controller so
 		 * that Linux can map all of its PA space to the PCI bus.
@@ -958,9 +999,9 @@ int __init pcibios_init(void)
 							  0);
 			if (ret < 0) {
 				pr_err("PCI: Mem-Map alloc failure on TRIO %d "
-					"mac %d for MC %d, give up\n",
-					controller->trio_index,
-					controller->mac, j);
+				       "mac %d for MC %d, give up\n",
+				       controller->trio_index,
+				       controller->mac, j);
 
 				goto alloc_mem_map_failed;
 			}
@@ -991,9 +1032,9 @@ int __init pcibios_init(void)
 				GXIO_TRIO_ORDER_MODE_UNORDERED);
 			if (ret < 0) {
 				pr_err("PCI: Mem-Map init failure on TRIO %d "
-					"mac %d for MC %d, give up\n",
-					controller->trio_index,
-					controller->mac, j);
+				       "mac %d for MC %d, give up\n",
+				       controller->trio_index,
+				       controller->mac, j);
 
 				goto alloc_mem_map_failed;
 			}
@@ -1002,23 +1043,19 @@ int __init pcibios_init(void)
 alloc_mem_map_failed:
 			break;
 		}
-
 	}
 
 	return 0;
 }
 subsys_initcall(pcibios_init);
 
-/* Note: to be deleted after Linux 3.6 merge. */
+/* No bus fixups needed. */
 void pcibios_fixup_bus(struct pci_bus *bus)
 {
 }
 
-/*
- * This can be called from the generic PCI layer, but doesn't need to
- * do anything.
- */
-char *pcibios_setup(char *str)
+/* Process any "pci=" kernel boot arguments. */
+char *__init pcibios_setup(char *str)
 {
 	if (!strcmp(str, "off")) {
 		pci_probe = 0;
@@ -1029,8 +1066,7 @@ char *pcibios_setup(char *str)
 
 /*
  * Enable memory address decoding, as appropriate, for the
- * device described by the 'dev' struct. The I/O decoding
- * is disabled, though the TILE-Gx supports I/O addressing.
+ * device described by the 'dev' struct.
  *
  * This is called from the generic PCI layer, and can be called
  * for bridges or endpoints.
@@ -1040,13 +1076,24 @@ int pcibios_enable_device(struct pci_dev *dev, int mask)
 	return pci_enable_resources(dev, mask);
 }
 
-/* Called for each device after PCI setup is done. */
+/*
+ * Called for each device after PCI setup is done.
+ * We initialize the PCI device capabilities conservatively, assuming that
+ * all devices can only address the 32-bit DMA space. The exception here is
+ * that the device dma_offset is set to the value that matches the 64-bit
+ * capable devices. This is OK because dma_offset is not used by legacy
+ * dma_ops, nor by the hybrid dma_ops's streaming DMAs, which are 64-bit ops.
+ * This implementation matches the kernel design of setting PCI devices'
+ * coherent_dma_mask to 0xffffffffull by default, allowing the device drivers
+ * to skip calling pci_set_consistent_dma_mask(DMA_BIT_MASK(32)).
+ */
 static void pcibios_fixup_final(struct pci_dev *pdev)
 {
-	set_dma_ops(&pdev->dev, gx_pci_dma_map_ops);
+	set_dma_ops(&pdev->dev, gx_legacy_pci_dma_map_ops);
 	set_dma_offset(&pdev->dev, TILE_PCI_MEM_MAP_BASE_OFFSET);
 	pdev->dev.archdata.max_direct_dma_addr =
 		TILE_PCI_MAX_DIRECT_DMA_ADDRESS;
+	pdev->dev.coherent_dma_mask = TILE_PCI_MAX_DIRECT_DMA_ADDRESS;
 }
 DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pcibios_fixup_final);
 
@@ -1060,19 +1107,15 @@ void __iomem *ioremap(resource_size_t phys_addr, unsigned long size)
 	resource_size_t start;
 	resource_size_t end;
 	int trio_fd;
-	int i, j;
+	int i;
 
 	start = phys_addr;
 	end = phys_addr + size - 1;
 
 	/*
-	 * In the following, each PCI controller's mem_resources[1]
-	 * represents its (non-prefetchable) PCI memory resource and
-	 * mem_resources[2] refers to its prefetchable PCI memory resource.
-	 * By searching phys_addr in each controller's mem_resources[], we can
+	 * By searching phys_addr in each controller's mem_space, we can
 	 * determine the controller that should accept the PCI memory access.
 	 */
-
 	for (i = 0; i < num_rc_controllers; i++) {
 		/*
 		 * Skip controllers that are not properly initialized or
@@ -1081,25 +1124,18 @@ void __iomem *ioremap(resource_size_t phys_addr, unsigned long size)
 		if (pci_controllers[i].root_bus == NULL)
 			continue;
 
-		for (j = 1; j < 3; j++) {
-			bar_start =
-				pci_controllers[i].mem_resources[j].start;
-			bar_end =
-				pci_controllers[i].mem_resources[j].end;
-
-			if ((start >= bar_start) && (end <= bar_end)) {
+		bar_start = pci_controllers[i].mem_space.start;
+		bar_end = pci_controllers[i].mem_space.end;
 
-				controller = &pci_controllers[i];
-
-				goto got_it;
-			}
+		if ((start >= bar_start) && (end <= bar_end)) {
+			controller = &pci_controllers[i];
+			break;
 		}
 	}
 
 	if (controller == NULL)
 		return NULL;
 
-got_it:
 	trio_fd = controller->trio->fd;
 
 	/* Convert the resource start to the bus address offset. */
@@ -1107,14 +1143,71 @@ got_it:
 
 	offset = HV_TRIO_PIO_OFFSET(controller->pio_mem_index) + start;
 
-	/*
-	 * We need to keep the PCI bus address's in-page offset in the VA.
-	 */
+	/* We need to keep the PCI bus address's in-page offset in the VA. */
 	return iorpc_ioremap(trio_fd, offset, size) +
-		(phys_addr & (PAGE_SIZE - 1));
+		(start & (PAGE_SIZE - 1));
 }
 EXPORT_SYMBOL(ioremap);
 
+#ifdef CONFIG_TILE_PCI_IO
+/* Map a PCI I/O address into VA space. */
+void __iomem *ioport_map(unsigned long port, unsigned int size)
+{
+	struct pci_controller *controller = NULL;
+	resource_size_t bar_start;
+	resource_size_t bar_end;
+	resource_size_t offset;
+	resource_size_t start;
+	resource_size_t end;
+	int trio_fd;
+	int i;
+
+	start = port;
+	end = port + size - 1;
+
+	/*
+	 * By searching the port in each controller's io_space, we can
+	 * determine the controller that should accept the PCI I/O access.
+	 */
+	for (i = 0; i < num_rc_controllers; i++) {
+		/*
+		 * Skip controllers that are not properly initialized or
+		 * have down links.
+		 */
+		if (pci_controllers[i].root_bus == NULL)
+			continue;
+
+		bar_start = pci_controllers[i].io_space.start;
+		bar_end = pci_controllers[i].io_space.end;
+
+		if ((start >= bar_start) && (end <= bar_end)) {
+			controller = &pci_controllers[i];
+			break;
+		}
+	}
+
+	if (controller == NULL)
+		return NULL;
+
+	trio_fd = controller->trio->fd;
+
+	/* Convert the resource start to the bus address offset. */
+	port -= controller->io_space.start;
+
+	offset = HV_TRIO_PIO_OFFSET(controller->pio_io_index) + port;
+
+	/* We need to keep the PCI bus address's in-page offset in the VA. */
+	return iorpc_ioremap(trio_fd, offset, size) + (port & (PAGE_SIZE - 1));
+}
+EXPORT_SYMBOL(ioport_map);
+
+void ioport_unmap(void __iomem *addr)
+{
+	iounmap(addr);
+}
+EXPORT_SYMBOL(ioport_unmap);
+#endif
+
 void pci_iounmap(struct pci_dev *dev, void __iomem *addr)
 {
 	iounmap(addr);
@@ -1136,7 +1229,6 @@ EXPORT_SYMBOL(pci_iounmap);
  * offset is in bytes, from the start of config space for the
  * specified bus & device.
  */
-
 static int tile_cfg_read(struct pci_bus *bus, unsigned int devfn, int offset,
 			 int size, u32 *val)
 {
@@ -1186,7 +1278,6 @@ static int tile_cfg_read(struct pci_bus *bus, unsigned int devfn, int offset,
 	 * Accesses to the directly attached device have to be
 	 * sent as type-0 configs.
 	 */
-
 	if (busnum == (controller->first_busno + 1)) {
 		/*
 		 * There is only one device off of our built-in P2P bridge.
@@ -1208,9 +1299,8 @@ static int tile_cfg_read(struct pci_bus *bus, unsigned int devfn, int offset,
 	 * Note that we don't set the mac field in cfg_addr because the
 	 * mapping is per port.
 	 */
-
 	mmio_addr = trio_context->mmio_base_pio_cfg[controller->mac] +
-			cfg_addr.word;
+		cfg_addr.word;
 
 valid_device:
 
@@ -1314,7 +1404,6 @@ static int tile_cfg_write(struct pci_bus *bus, unsigned int devfn, int offset,
 	 * Accesses to the directly attached device have to be
 	 * sent as type-0 configs.
 	 */
-
 	if (busnum == (controller->first_busno + 1)) {
 		/*
 		 * There is only one device off of our built-in P2P bridge.
@@ -1336,7 +1425,6 @@ static int tile_cfg_write(struct pci_bus *bus, unsigned int devfn, int offset,
 	 * Note that we don't set the mac field in cfg_addr because the
 	 * mapping is per port.
 	 */
-
 	mmio_addr = trio_context->mmio_base_pio_cfg[controller->mac] +
 			cfg_addr.word;
 
@@ -1374,11 +1462,8 @@ static struct pci_ops tile_cfg_ops = {
 };
 
 
-/*
- * MSI support starts here.
- */
-static unsigned int
-tilegx_msi_startup(struct irq_data *d)
+/* MSI support starts here. */
+static unsigned int tilegx_msi_startup(struct irq_data *d)
 {
 	if (d->msi_desc)
 		unmask_msi_irq(d);
@@ -1386,21 +1471,18 @@ tilegx_msi_startup(struct irq_data *d)
 	return 0;
 }
 
-static void
-tilegx_msi_ack(struct irq_data *d)
+static void tilegx_msi_ack(struct irq_data *d)
 {
 	__insn_mtspr(SPR_IPI_EVENT_RESET_K, 1UL << d->irq);
 }
 
-static void
-tilegx_msi_mask(struct irq_data *d)
+static void tilegx_msi_mask(struct irq_data *d)
 {
 	mask_msi_irq(d);
 	__insn_mtspr(SPR_IPI_MASK_SET_K, 1UL << d->irq);
 }
 
-static void
-tilegx_msi_unmask(struct irq_data *d)
+static void tilegx_msi_unmask(struct irq_data *d)
 {
 	__insn_mtspr(SPR_IPI_MASK_RESET_K, 1UL << d->irq);
 	unmask_msi_irq(d);
@@ -1457,32 +1539,55 @@ int arch_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *desc)
 	trio_context = controller->trio;
 
 	/*
-	 * Allocate the Mem-Map that will accept the MSI write and
-	 * trigger the TILE-side interrupts.
+	 * Allocate a scatter-queue that will accept the MSI write and
+	 * trigger the TILE-side interrupts. We use the scatter-queue regions
+	 * before the mem map regions, because the latter are needed by more
+	 * applications.
 	 */
-	mem_map = gxio_trio_alloc_memory_maps(trio_context, 1, 0, 0);
-	if (mem_map < 0) {
-		dev_printk(KERN_INFO, &pdev->dev,
-			"%s Mem-Map alloc failure. "
-			"Failed to initialize MSI interrupts. "
-			"Falling back to legacy interrupts.\n",
-			desc->msi_attrib.is_msix ? "MSI-X" : "MSI");
+	mem_map = gxio_trio_alloc_scatter_queues(trio_context, 1, 0, 0);
+	if (mem_map >= 0) {
+		TRIO_MAP_SQ_DOORBELL_FMT_t doorbell_template = {{
+			.pop = 0,
+			.doorbell = 1,
+		}};
+
+		mem_map += TRIO_NUM_MAP_MEM_REGIONS;
+		mem_map_base = MEM_MAP_INTR_REGIONS_BASE +
+			mem_map * MEM_MAP_INTR_REGION_SIZE;
+		mem_map_limit = mem_map_base + MEM_MAP_INTR_REGION_SIZE - 1;
+
+		msi_addr = mem_map_base + MEM_MAP_INTR_REGION_SIZE - 8;
+		msg.data = (unsigned int)doorbell_template.word;
+	} else {
+		/* SQ regions are out, allocate from map mem regions. */
+		mem_map = gxio_trio_alloc_memory_maps(trio_context, 1, 0, 0);
+		if (mem_map < 0) {
+			dev_printk(KERN_INFO, &pdev->dev,
+				"%s Mem-Map alloc failure. "
+				"Failed to initialize MSI interrupts. "
+				"Falling back to legacy interrupts.\n",
+				desc->msi_attrib.is_msix ? "MSI-X" : "MSI");
+			ret = -ENOMEM;
+			goto msi_mem_map_alloc_failure;
+		}
 
-		ret = -ENOMEM;
-		goto msi_mem_map_alloc_failure;
+		mem_map_base = MEM_MAP_INTR_REGIONS_BASE +
+			mem_map * MEM_MAP_INTR_REGION_SIZE;
+		mem_map_limit = mem_map_base + MEM_MAP_INTR_REGION_SIZE - 1;
+
+		msi_addr = mem_map_base + TRIO_MAP_MEM_REG_INT3 -
+			TRIO_MAP_MEM_REG_INT0;
+
+		msg.data = mem_map;
 	}
 
 	/* We try to distribute different IRQs to different tiles. */
 	cpu = tile_irq_cpu(irq);
 
 	/*
-	 * Now call up to the HV to configure the Mem-Map interrupt and
+	 * Now call up to the HV to configure the MSI interrupt and
 	 * set up the IPI binding.
 	 */
-	mem_map_base = MEM_MAP_INTR_REGIONS_BASE +
-		mem_map * MEM_MAP_INTR_REGION_SIZE;
-	mem_map_limit = mem_map_base + MEM_MAP_INTR_REGION_SIZE - 1;
-
 	ret = gxio_trio_config_msi_intr(trio_context, cpu_x(cpu), cpu_y(cpu),
 					KERNEL_PL, irq, controller->mac,
 					mem_map, mem_map_base, mem_map_limit,
@@ -1495,13 +1600,9 @@ int arch_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *desc)
 
 	irq_set_msi_desc(irq, desc);
 
-	msi_addr = mem_map_base + TRIO_MAP_MEM_REG_INT3 - TRIO_MAP_MEM_REG_INT0;
-
 	msg.address_hi = msi_addr >> 32;
 	msg.address_lo = msi_addr & 0xffffffff;
 
-	msg.data = mem_map;
-
 	write_msi_msg(irq, &msg);
 	irq_set_chip_and_handler(irq, &tilegx_msi_chip, handle_level_irq);
 	irq_set_handler_data(irq, controller);
diff --git a/arch/tile/kernel/proc.c b/arch/tile/kernel/proc.c
index dafc447b5125..681100c59fda 100644
--- a/arch/tile/kernel/proc.c
+++ b/arch/tile/kernel/proc.c
@@ -113,7 +113,6 @@ arch_initcall(proc_tile_init);
  * Support /proc/sys/tile directory
  */
 
-#ifndef __tilegx__  /* FIXME: GX: no support for unaligned access yet */
 static ctl_table unaligned_subtable[] = {
 	{
 		.procname	= "enabled",
@@ -160,4 +159,3 @@ static int __init proc_sys_tile_init(void)
 }
 
 arch_initcall(proc_sys_tile_init);
-#endif
diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c
index 8ac304484f98..16ed58948757 100644
--- a/arch/tile/kernel/process.c
+++ b/arch/tile/kernel/process.c
@@ -33,6 +33,7 @@
 #include <asm/syscalls.h>
 #include <asm/traps.h>
 #include <asm/setup.h>
+#include <asm/uaccess.h>
 #ifdef CONFIG_HARDWALL
 #include <asm/hardwall.h>
 #endif
@@ -74,19 +75,6 @@ void arch_release_thread_info(struct thread_info *info)
 {
 	struct single_step_state *step_state = info->step_state;
 
-#ifdef CONFIG_HARDWALL
-	/*
-	 * We free a thread_info from the context of the task that has
-	 * been scheduled next, so the original task is already dead.
-	 * Calling deactivate here just frees up the data structures.
-	 * If the task we're freeing held the last reference to a
-	 * hardwall fd, it would have been released prior to this point
-	 * anyway via exit_files(), and the hardwall_task.info pointers
-	 * would be NULL by now.
-	 */
-	hardwall_deactivate_all(info->task);
-#endif
-
 	if (step_state) {
 
 		/*
@@ -160,6 +148,14 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 	 */
 	task_thread_info(p)->step_state = NULL;
 
+#ifdef __tilegx__
+	/*
+	 * Do not clone unalign jit fixup from the parent; each thread
+	 * must allocate its own on demand.
+	 */
+	task_thread_info(p)->unalign_jit_base = NULL;
+#endif
+
 	/*
 	 * Copy the registers onto the kernel stack so the
 	 * return-from-interrupt code will reload it into registers.
@@ -191,16 +187,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 	memset(&p->thread.dma_async_tlb, 0, sizeof(struct async_tlb));
 #endif
 
-#if CHIP_HAS_SN_PROC()
-	/* Likewise, the new thread is not running static processor code. */
-	p->thread.sn_proc_running = 0;
-	memset(&p->thread.sn_async_tlb, 0, sizeof(struct async_tlb));
-#endif
-
-#if CHIP_HAS_PROC_STATUS_SPR()
 	/* New thread has its miscellaneous processor state bits clear. */
 	p->thread.proc_status = 0;
-#endif
 
 #ifdef CONFIG_HARDWALL
 	/* New thread does not own any networks. */
@@ -218,19 +206,32 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 	return 0;
 }
 
+int set_unalign_ctl(struct task_struct *tsk, unsigned int val)
+{
+	task_thread_info(tsk)->align_ctl = val;
+	return 0;
+}
+
+int get_unalign_ctl(struct task_struct *tsk, unsigned long adr)
+{
+	return put_user(task_thread_info(tsk)->align_ctl,
+			(unsigned int __user *)adr);
+}
+
+static struct task_struct corrupt_current = { .comm = "<corrupt>" };
+
 /*
  * Return "current" if it looks plausible, or else a pointer to a dummy.
  * This can be helpful if we are just trying to emit a clean panic.
  */
 struct task_struct *validate_current(void)
 {
-	static struct task_struct corrupt = { .comm = "<corrupt>" };
 	struct task_struct *tsk = current;
 	if (unlikely((unsigned long)tsk < PAGE_OFFSET ||
 		     (high_memory && (void *)tsk > high_memory) ||
 		     ((unsigned long)tsk & (__alignof__(*tsk) - 1)) != 0)) {
 		pr_err("Corrupt 'current' %p (sp %#lx)\n", tsk, stack_pointer);
-		tsk = &corrupt;
+		tsk = &corrupt_current;
 	}
 	return tsk;
 }
@@ -369,15 +370,11 @@ static void save_arch_state(struct thread_struct *t)
 	t->system_save[2] = __insn_mfspr(SPR_SYSTEM_SAVE_0_2);
 	t->system_save[3] = __insn_mfspr(SPR_SYSTEM_SAVE_0_3);
 	t->intctrl_0 = __insn_mfspr(SPR_INTCTRL_0_STATUS);
-#if CHIP_HAS_PROC_STATUS_SPR()
 	t->proc_status = __insn_mfspr(SPR_PROC_STATUS);
-#endif
 #if !CHIP_HAS_FIXED_INTVEC_BASE()
 	t->interrupt_vector_base = __insn_mfspr(SPR_INTERRUPT_VECTOR_BASE_0);
 #endif
-#if CHIP_HAS_TILE_RTF_HWM()
 	t->tile_rtf_hwm = __insn_mfspr(SPR_TILE_RTF_HWM);
-#endif
 #if CHIP_HAS_DSTREAM_PF()
 	t->dstream_pf = __insn_mfspr(SPR_DSTREAM_PF);
 #endif
@@ -398,15 +395,11 @@ static void restore_arch_state(const struct thread_struct *t)
 	__insn_mtspr(SPR_SYSTEM_SAVE_0_2, t->system_save[2]);
 	__insn_mtspr(SPR_SYSTEM_SAVE_0_3, t->system_save[3]);
 	__insn_mtspr(SPR_INTCTRL_0_STATUS, t->intctrl_0);
-#if CHIP_HAS_PROC_STATUS_SPR()
 	__insn_mtspr(SPR_PROC_STATUS, t->proc_status);
-#endif
 #if !CHIP_HAS_FIXED_INTVEC_BASE()
 	__insn_mtspr(SPR_INTERRUPT_VECTOR_BASE_0, t->interrupt_vector_base);
 #endif
-#if CHIP_HAS_TILE_RTF_HWM()
 	__insn_mtspr(SPR_TILE_RTF_HWM, t->tile_rtf_hwm);
-#endif
 #if CHIP_HAS_DSTREAM_PF()
 	__insn_mtspr(SPR_DSTREAM_PF, t->dstream_pf);
 #endif
@@ -415,26 +408,11 @@ static void restore_arch_state(const struct thread_struct *t)
 
 void _prepare_arch_switch(struct task_struct *next)
 {
-#if CHIP_HAS_SN_PROC()
-	int snctl;
-#endif
 #if CHIP_HAS_TILE_DMA()
 	struct tile_dma_state *dma = &current->thread.tile_dma_state;
 	if (dma->enabled)
 		save_tile_dma_state(dma);
 #endif
-#if CHIP_HAS_SN_PROC()
-	/*
-	 * Suspend the static network processor if it was running.
-	 * We do not suspend the fabric itself, just like we don't
-	 * try to suspend the UDN.
-	 */
-	snctl = __insn_mfspr(SPR_SNCTL);
-	current->thread.sn_proc_running =
-		(snctl & SPR_SNCTL__FRZPROC_MASK) == 0;
-	if (current->thread.sn_proc_running)
-		__insn_mtspr(SPR_SNCTL, snctl | SPR_SNCTL__FRZPROC_MASK);
-#endif
 }
 
 
@@ -462,17 +440,6 @@ struct task_struct *__sched _switch_to(struct task_struct *prev,
 	/* Restore other arch state. */
 	restore_arch_state(&next->thread);
 
-#if CHIP_HAS_SN_PROC()
-	/*
-	 * Restart static network processor in the new process
-	 * if it was running before.
-	 */
-	if (next->thread.sn_proc_running) {
-		int snctl = __insn_mfspr(SPR_SNCTL);
-		__insn_mtspr(SPR_SNCTL, snctl & ~SPR_SNCTL__FRZPROC_MASK);
-	}
-#endif
-
 #ifdef CONFIG_HARDWALL
 	/* Enable or disable access to the network registers appropriately. */
 	hardwall_switch_tasks(prev, next);
@@ -514,7 +481,7 @@ int do_work_pending(struct pt_regs *regs, u32 thread_info_flags)
 		schedule();
 		return 1;
 	}
-#if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC()
+#if CHIP_HAS_TILE_DMA()
 	if (thread_info_flags & _TIF_ASYNC_TLB) {
 		do_async_page_fault(regs);
 		return 1;
@@ -564,7 +531,15 @@ void flush_thread(void)
  */
 void exit_thread(void)
 {
-	/* Nothing */
+#ifdef CONFIG_HARDWALL
+	/*
+	 * Remove the task from the list of tasks that are associated
+	 * with any live hardwalls.  (If the task that is exiting held
+	 * the last reference to a hardwall fd, it would already have
+	 * been released and deactivated at this point.)
+	 */
+	hardwall_deactivate_all(current);
+#endif
 }
 
 void show_regs(struct pt_regs *regs)
@@ -573,23 +548,24 @@ void show_regs(struct pt_regs *regs)
 	int i;
 
 	pr_err("\n");
-	show_regs_print_info(KERN_ERR);
+	if (tsk != &corrupt_current)
+		show_regs_print_info(KERN_ERR);
 #ifdef __tilegx__
-	for (i = 0; i < 51; i += 3)
+	for (i = 0; i < 17; i++)
 		pr_err(" r%-2d: "REGFMT" r%-2d: "REGFMT" r%-2d: "REGFMT"\n",
-		       i, regs->regs[i], i+1, regs->regs[i+1],
-		       i+2, regs->regs[i+2]);
-	pr_err(" r51: "REGFMT" r52: "REGFMT" tp : "REGFMT"\n",
-	       regs->regs[51], regs->regs[52], regs->tp);
+		       i, regs->regs[i], i+18, regs->regs[i+18],
+		       i+36, regs->regs[i+36]);
+	pr_err(" r17: "REGFMT" r35: "REGFMT" tp : "REGFMT"\n",
+	       regs->regs[17], regs->regs[35], regs->tp);
 	pr_err(" sp : "REGFMT" lr : "REGFMT"\n", regs->sp, regs->lr);
 #else
-	for (i = 0; i < 52; i += 4)
+	for (i = 0; i < 13; i++)
 		pr_err(" r%-2d: "REGFMT" r%-2d: "REGFMT
 		       " r%-2d: "REGFMT" r%-2d: "REGFMT"\n",
-		       i, regs->regs[i], i+1, regs->regs[i+1],
-		       i+2, regs->regs[i+2], i+3, regs->regs[i+3]);
-	pr_err(" r52: "REGFMT" tp : "REGFMT" sp : "REGFMT" lr : "REGFMT"\n",
-	       regs->regs[52], regs->tp, regs->sp, regs->lr);
+		       i, regs->regs[i], i+14, regs->regs[i+14],
+		       i+27, regs->regs[i+27], i+40, regs->regs[i+40]);
+	pr_err(" r13: "REGFMT" tp : "REGFMT" sp : "REGFMT" lr : "REGFMT"\n",
+	       regs->regs[13], regs->tp, regs->sp, regs->lr);
 #endif
 	pr_err(" pc : "REGFMT" ex1: %ld     faultnum: %ld\n",
 	       regs->pc, regs->ex1, regs->faultnum);
diff --git a/arch/tile/kernel/ptrace.c b/arch/tile/kernel/ptrace.c
index 0f83ed4602b2..de98c6ddf136 100644
--- a/arch/tile/kernel/ptrace.c
+++ b/arch/tile/kernel/ptrace.c
@@ -265,6 +265,21 @@ int do_syscall_trace_enter(struct pt_regs *regs)
 
 void do_syscall_trace_exit(struct pt_regs *regs)
 {
+	long errno;
+
+	/*
+	 * The standard tile calling convention returns the value (or negative
+	 * errno) in r0, and zero (or positive errno) in r1.
+	 * It saves a couple of cycles on the hot path to do this work in
+	 * registers only as we return, rather than updating the in-memory
+	 * struct ptregs.
+	 */
+	errno = (long) regs->regs[0];
+	if (errno < 0 && errno > -4096)
+		regs->regs[1] = -errno;
+	else
+		regs->regs[1] = 0;
+
 	if (test_thread_flag(TIF_SYSCALL_TRACE))
 		tracehook_report_syscall_exit(regs, 0);
 
@@ -272,7 +287,7 @@ void do_syscall_trace_exit(struct pt_regs *regs)
 		trace_sys_exit(regs, regs->regs[0]);
 }
 
-void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
+void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs)
 {
 	struct siginfo info;
 
@@ -288,5 +303,5 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
 /* Handle synthetic interrupt delivered only by the simulator. */
 void __kprobes do_breakpoint(struct pt_regs* regs, int fault_num)
 {
-	send_sigtrap(current, regs, fault_num);
+	send_sigtrap(current, regs);
 }
diff --git a/arch/tile/kernel/reboot.c b/arch/tile/kernel/reboot.c
index d1b5c913ae72..6c5d2c070a12 100644
--- a/arch/tile/kernel/reboot.c
+++ b/arch/tile/kernel/reboot.c
@@ -27,7 +27,6 @@
 
 void machine_halt(void)
 {
-	warn_early_printk();
 	arch_local_irq_disable_all();
 	smp_send_stop();
 	hv_halt();
@@ -35,7 +34,6 @@ void machine_halt(void)
 
 void machine_power_off(void)
 {
-	warn_early_printk();
 	arch_local_irq_disable_all();
 	smp_send_stop();
 	hv_power_off();
diff --git a/arch/tile/kernel/regs_32.S b/arch/tile/kernel/regs_32.S
index c12280c2d904..542cae17a93a 100644
--- a/arch/tile/kernel/regs_32.S
+++ b/arch/tile/kernel/regs_32.S
@@ -20,7 +20,7 @@
 #include <asm/switch_to.h>
 
 /*
- * See <asm/system.h>; called with prev and next task_struct pointers.
+ * See <asm/switch_to.h>; called with prev and next task_struct pointers.
  * "prev" is returned in r0 for _switch_to and also for ret_from_fork.
  *
  * We want to save pc/sp in "prev", and get the new pc/sp from "next".
@@ -39,7 +39,7 @@
  */
 
 #if CALLEE_SAVED_REGS_COUNT != 24
-# error Mismatch between <asm/system.h> and kernel/entry.S
+# error Mismatch between <asm/switch_to.h> and kernel/entry.S
 #endif
 #define FRAME_SIZE ((2 + CALLEE_SAVED_REGS_COUNT) * 4)
 
diff --git a/arch/tile/kernel/regs_64.S b/arch/tile/kernel/regs_64.S
index 0829fd01fa30..bbffcc6f340f 100644
--- a/arch/tile/kernel/regs_64.S
+++ b/arch/tile/kernel/regs_64.S
@@ -20,7 +20,7 @@
 #include <asm/switch_to.h>
 
 /*
- * See <asm/system.h>; called with prev and next task_struct pointers.
+ * See <asm/switch_to.h>; called with prev and next task_struct pointers.
  * "prev" is returned in r0 for _switch_to and also for ret_from_fork.
  *
  * We want to save pc/sp in "prev", and get the new pc/sp from "next".
@@ -39,7 +39,7 @@
  */
 
 #if CALLEE_SAVED_REGS_COUNT != 24
-# error Mismatch between <asm/system.h> and kernel/entry.S
+# error Mismatch between <asm/switch_to.h> and kernel/entry.S
 #endif
 #define FRAME_SIZE ((2 + CALLEE_SAVED_REGS_COUNT) * 8)
 
diff --git a/arch/tile/kernel/relocate_kernel_32.S b/arch/tile/kernel/relocate_kernel_32.S
index 010b418515f8..e44fbcf8cbd5 100644
--- a/arch/tile/kernel/relocate_kernel_32.S
+++ b/arch/tile/kernel/relocate_kernel_32.S
@@ -20,15 +20,6 @@
 #include <asm/page.h>
 #include <hv/hypervisor.h>
 
-#define ___hvb	MEM_SV_INTRPT + HV_GLUE_START_CPA
-
-#define ___hv_dispatch(f) (___hvb + (HV_DISPATCH_ENTRY_SIZE * f))
-
-#define ___hv_console_putc ___hv_dispatch(HV_DISPATCH_CONSOLE_PUTC)
-#define ___hv_halt         ___hv_dispatch(HV_DISPATCH_HALT)
-#define ___hv_reexec       ___hv_dispatch(HV_DISPATCH_REEXEC)
-#define ___hv_flush_remote ___hv_dispatch(HV_DISPATCH_FLUSH_REMOTE)
-
 #undef RELOCATE_NEW_KERNEL_VERBOSE
 
 STD_ENTRY(relocate_new_kernel)
@@ -43,8 +34,8 @@ STD_ENTRY(relocate_new_kernel)
 	addi	sp, sp, -8
 	/* we now have a stack (whether we need one or not) */
 
-	moveli	r40, lo16(___hv_console_putc)
-	auli	r40, r40, ha16(___hv_console_putc)
+	moveli	r40, lo16(hv_console_putc)
+	auli	r40, r40, ha16(hv_console_putc)
 
 #ifdef RELOCATE_NEW_KERNEL_VERBOSE
 	moveli	r0, 'r'
@@ -86,7 +77,6 @@ STD_ENTRY(relocate_new_kernel)
 	move	r30, sp
 	addi	sp, sp, -8
 
-#if CHIP_HAS_CBOX_HOME_MAP()
 	/*
 	 * On TILEPro, we need to flush all tiles' caches, since we may
 	 * have been doing hash-for-home caching there.  Note that we
@@ -114,15 +104,14 @@ STD_ENTRY(relocate_new_kernel)
 	}
 	{
 	 move	r8, zero	 /* asids */
-	 moveli	r20, lo16(___hv_flush_remote)
+	 moveli	r20, lo16(hv_flush_remote)
 	}
 	{
 	 move	r9, zero	 /* asidcount */
-	 auli	r20, r20, ha16(___hv_flush_remote)
+	 auli	r20, r20, ha16(hv_flush_remote)
 	}
 
 	jalr	r20
-#endif
 
 	/* r33 is destination pointer, default to zero */
 
@@ -175,8 +164,8 @@ STD_ENTRY(relocate_new_kernel)
 	move	r0, r32
 	moveli	r1, 0		/* arg to hv_reexec is 64 bits */
 
-	moveli	r41, lo16(___hv_reexec)
-	auli	r41, r41, ha16(___hv_reexec)
+	moveli	r41, lo16(hv_reexec)
+	auli	r41, r41, ha16(hv_reexec)
 
 	jalr	r41
 
@@ -267,8 +256,8 @@ STD_ENTRY(relocate_new_kernel)
 	moveli	r0, '\n'
 	jalr	r40
 .Lhalt:
-	moveli	r41, lo16(___hv_halt)
-	auli	r41, r41, ha16(___hv_halt)
+	moveli	r41, lo16(hv_halt)
+	auli	r41, r41, ha16(hv_halt)
 
 	jalr	r41
 	STD_ENDPROC(relocate_new_kernel)
diff --git a/arch/tile/kernel/relocate_kernel_64.S b/arch/tile/kernel/relocate_kernel_64.S
index 1c09a4f5a4ea..d9d8cf6176e8 100644
--- a/arch/tile/kernel/relocate_kernel_64.S
+++ b/arch/tile/kernel/relocate_kernel_64.S
@@ -34,11 +34,11 @@ STD_ENTRY(relocate_new_kernel)
 	addi	sp, sp, -8
 	/* we now have a stack (whether we need one or not) */
 
+#ifdef RELOCATE_NEW_KERNEL_VERBOSE
 	moveli	r40, hw2_last(hv_console_putc)
 	shl16insli r40, r40, hw1(hv_console_putc)
 	shl16insli r40, r40, hw0(hv_console_putc)
 
-#ifdef RELOCATE_NEW_KERNEL_VERBOSE
 	moveli	r0, 'r'
 	jalr	r40
 
@@ -78,7 +78,6 @@ STD_ENTRY(relocate_new_kernel)
 	move	r30, sp
 	addi	sp, sp, -16
 
-#if CHIP_HAS_CBOX_HOME_MAP()
 	/*
 	 * On TILE-GX, we need to flush all tiles' caches, since we may
 	 * have been doing hash-for-home caching there.  Note that we
@@ -116,7 +115,6 @@ STD_ENTRY(relocate_new_kernel)
 	shl16insli	r20, r20, hw0(hv_flush_remote)
 
 	jalr	r20
-#endif
 
 	/* r33 is destination pointer, default to zero */
 
@@ -176,10 +174,12 @@ STD_ENTRY(relocate_new_kernel)
 
 	/* we should not get here */
 
+#ifdef RELOCATE_NEW_KERNEL_VERBOSE
 	moveli	r0, '?'
 	jalr	r40
 	moveli	r0, '\n'
 	jalr	r40
+#endif
 
 	j	.Lhalt
 
@@ -237,7 +237,9 @@ STD_ENTRY(relocate_new_kernel)
 	j	.Lloop
 
 
-.Lerr:	moveli	r0, 'e'
+.Lerr:
+#ifdef RELOCATE_NEW_KERNEL_VERBOSE
+	moveli	r0, 'e'
 	jalr	r40
 	moveli	r0, 'r'
 	jalr	r40
@@ -245,6 +247,7 @@ STD_ENTRY(relocate_new_kernel)
 	jalr	r40
 	moveli	r0, '\n'
 	jalr	r40
+#endif
 .Lhalt:
 	moveli r41, hw2_last(hv_halt)
 	shl16insli r41, r41, hw1(hv_halt)
diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c
index eceb8344280f..4c34caea9dd3 100644
--- a/arch/tile/kernel/setup.c
+++ b/arch/tile/kernel/setup.c
@@ -154,6 +154,65 @@ static int __init setup_maxnodemem(char *str)
 }
 early_param("maxnodemem", setup_maxnodemem);
 
+struct memmap_entry {
+	u64 addr;	/* start of memory segment */
+	u64 size;	/* size of memory segment */
+};
+static struct memmap_entry memmap_map[64];
+static int memmap_nr;
+
+static void add_memmap_region(u64 addr, u64 size)
+{
+	if (memmap_nr >= ARRAY_SIZE(memmap_map)) {
+		pr_err("Ooops! Too many entries in the memory map!\n");
+		return;
+	}
+	memmap_map[memmap_nr].addr = addr;
+	memmap_map[memmap_nr].size = size;
+	memmap_nr++;
+}
+
+static int __init setup_memmap(char *p)
+{
+	char *oldp;
+	u64 start_at, mem_size;
+
+	if (!p)
+		return -EINVAL;
+
+	if (!strncmp(p, "exactmap", 8)) {
+		pr_err("\"memmap=exactmap\" not valid on tile\n");
+		return 0;
+	}
+
+	oldp = p;
+	mem_size = memparse(p, &p);
+	if (p == oldp)
+		return -EINVAL;
+
+	if (*p == '@') {
+		pr_err("\"memmap=nn@ss\" (force RAM) invalid on tile\n");
+	} else if (*p == '#') {
+		pr_err("\"memmap=nn#ss\" (force ACPI data) invalid on tile\n");
+	} else if (*p == '$') {
+		start_at = memparse(p+1, &p);
+		add_memmap_region(start_at, mem_size);
+	} else {
+		if (mem_size == 0)
+			return -EINVAL;
+		maxmem_pfn = (mem_size >> HPAGE_SHIFT) <<
+			(HPAGE_SHIFT - PAGE_SHIFT);
+	}
+	return *p == '\0' ? 0 : -EINVAL;
+}
+early_param("memmap", setup_memmap);
+
+static int __init setup_mem(char *str)
+{
+	return setup_maxmem(str);
+}
+early_param("mem", setup_mem);  /* compatibility with x86 */
+
 static int __init setup_isolnodes(char *str)
 {
 	char buf[MAX_NUMNODES * 5];
@@ -209,7 +268,7 @@ early_param("vmalloc", parse_vmalloc);
 /*
  * Determine for each controller where its lowmem is mapped and how much of
  * it is mapped there.  On controller zero, the first few megabytes are
- * already mapped in as code at MEM_SV_INTRPT, so in principle we could
+ * already mapped in as code at MEM_SV_START, so in principle we could
  * start our data mappings higher up, but for now we don't bother, to avoid
  * additional confusion.
  *
@@ -614,11 +673,12 @@ static void __init setup_bootmem_allocator_node(int i)
 	/*
 	 * Throw away any memory aliased by the PCI region.
 	 */
-	if (pci_reserve_start_pfn < end && pci_reserve_end_pfn > start)
-		reserve_bootmem(PFN_PHYS(pci_reserve_start_pfn),
-				PFN_PHYS(pci_reserve_end_pfn -
-					 pci_reserve_start_pfn),
+	if (pci_reserve_start_pfn < end && pci_reserve_end_pfn > start) {
+		start = max(pci_reserve_start_pfn, start);
+		end = min(pci_reserve_end_pfn, end);
+		reserve_bootmem(PFN_PHYS(start), PFN_PHYS(end - start),
 				BOOTMEM_EXCLUSIVE);
+	}
 #endif
 }
 
@@ -628,6 +688,31 @@ static void __init setup_bootmem_allocator(void)
 	for (i = 0; i < MAX_NUMNODES; ++i)
 		setup_bootmem_allocator_node(i);
 
+	/* Reserve any memory excluded by "memmap" arguments. */
+	for (i = 0; i < memmap_nr; ++i) {
+		struct memmap_entry *m = &memmap_map[i];
+		reserve_bootmem(m->addr, m->size, 0);
+	}
+
+#ifdef CONFIG_BLK_DEV_INITRD
+	if (initrd_start) {
+		/* Make sure the initrd memory region is not modified. */
+		if (reserve_bootmem(initrd_start, initrd_end - initrd_start,
+				    BOOTMEM_EXCLUSIVE)) {
+			pr_crit("The initrd memory region has been polluted. Disabling it.\n");
+			initrd_start = 0;
+			initrd_end = 0;
+		} else {
+			/*
+			 * Translate initrd_start & initrd_end from PA to VA for
+			 * future access.
+			 */
+			initrd_start += PAGE_OFFSET;
+			initrd_end += PAGE_OFFSET;
+		}
+	}
+#endif
+
 #ifdef CONFIG_KEXEC
 	if (crashk_res.start != crashk_res.end)
 		reserve_bootmem(crashk_res.start, resource_size(&crashk_res), 0);
@@ -961,9 +1046,6 @@ void setup_cpu(int boot)
 	arch_local_irq_unmask(INT_DMATLB_MISS);
 	arch_local_irq_unmask(INT_DMATLB_ACCESS);
 #endif
-#if CHIP_HAS_SN_PROC()
-	arch_local_irq_unmask(INT_SNITLB_MISS);
-#endif
 #ifdef __tilegx__
 	arch_local_irq_unmask(INT_SINGLE_STEP_K);
 #endif
@@ -978,10 +1060,6 @@ void setup_cpu(int boot)
 	/* Static network is not restricted. */
 	__insn_mtspr(SPR_MPL_SN_ACCESS_SET_0, 1);
 #endif
-#if CHIP_HAS_SN_PROC()
-	__insn_mtspr(SPR_MPL_SN_NOTIFY_SET_0, 1);
-	__insn_mtspr(SPR_MPL_SN_CPL_SET_0, 1);
-#endif
 
 	/*
 	 * Set the MPL for interrupt control 0 & 1 to the corresponding
@@ -1029,6 +1107,10 @@ static void __init load_hv_initrd(void)
 	int fd, rc;
 	void *initrd;
 
+	/* If initrd has already been set, skip initramfs file in hvfs. */
+	if (initrd_start)
+		return;
+
 	fd = hv_fs_findfile((HV_VirtAddr) initramfs_file);
 	if (fd == HV_ENOENT) {
 		if (set_initramfs_file) {
@@ -1067,6 +1149,25 @@ void __init free_initrd_mem(unsigned long begin, unsigned long end)
 	free_bootmem(__pa(begin), end - begin);
 }
 
+static int __init setup_initrd(char *str)
+{
+	char *endp;
+	unsigned long initrd_size;
+
+	initrd_size = str ? simple_strtoul(str, &endp, 0) : 0;
+	if (initrd_size == 0 || *endp != '@')
+		return -EINVAL;
+
+	initrd_start = simple_strtoul(endp+1, &endp, 0);
+	if (initrd_start == 0)
+		return -EINVAL;
+
+	initrd_end = initrd_start + initrd_size;
+
+	return 0;
+}
+early_param("initrd", setup_initrd);
+
 #else
 static inline void load_hv_initrd(void) {}
 #endif /* CONFIG_BLK_DEV_INITRD */
@@ -1134,7 +1235,7 @@ static void __init validate_va(void)
 #ifndef __tilegx__   /* FIXME: GX: probably some validation relevant here */
 	/*
 	 * Similarly, make sure we're only using allowed VAs.
-	 * We assume we can contiguously use MEM_USER_INTRPT .. MEM_HV_INTRPT,
+	 * We assume we can contiguously use MEM_USER_INTRPT .. MEM_HV_START,
 	 * and 0 .. KERNEL_HIGH_VADDR.
 	 * In addition, make sure we CAN'T use the end of memory, since
 	 * we use the last chunk of each pgd for the pgd_list.
@@ -1149,7 +1250,7 @@ static void __init validate_va(void)
 		if (range.size == 0)
 			break;
 		if (range.start <= MEM_USER_INTRPT &&
-		    range.start + range.size >= MEM_HV_INTRPT)
+		    range.start + range.size >= MEM_HV_START)
 			user_kernel_ok = 1;
 		if (range.start == 0)
 			max_va = range.size;
@@ -1183,7 +1284,6 @@ static void __init validate_va(void)
 struct cpumask __write_once cpu_lotar_map;
 EXPORT_SYMBOL(cpu_lotar_map);
 
-#if CHIP_HAS_CBOX_HOME_MAP()
 /*
  * hash_for_home_map lists all the tiles that hash-for-home data
  * will be cached on.  Note that this may includes tiles that are not
@@ -1193,7 +1293,6 @@ EXPORT_SYMBOL(cpu_lotar_map);
  */
 struct cpumask hash_for_home_map;
 EXPORT_SYMBOL(hash_for_home_map);
-#endif
 
 /*
  * cpu_cacheable_map lists all the cpus whose caches the hypervisor can
@@ -1286,7 +1385,6 @@ static void __init setup_cpu_maps(void)
 		cpu_lotar_map = *cpu_possible_mask;
 	}
 
-#if CHIP_HAS_CBOX_HOME_MAP()
 	/* Retrieve set of CPUs used for hash-for-home caching */
 	rc = hv_inquire_tiles(HV_INQ_TILES_HFH_CACHE,
 			      (HV_VirtAddr) hash_for_home_map.bits,
@@ -1294,9 +1392,6 @@ static void __init setup_cpu_maps(void)
 	if (rc < 0)
 		early_panic("hv_inquire_tiles(HFH_CACHE) failed: rc %d\n", rc);
 	cpumask_or(&cpu_cacheable_map, cpu_possible_mask, &hash_for_home_map);
-#else
-	cpu_cacheable_map = *cpu_possible_mask;
-#endif
 }
 
 
@@ -1492,7 +1587,7 @@ void __init setup_per_cpu_areas(void)
 
 			/* Update the vmalloc mapping and page home. */
 			unsigned long addr = (unsigned long)ptr + i;
-			pte_t *ptep = virt_to_pte(NULL, addr);
+			pte_t *ptep = virt_to_kpte(addr);
 			pte_t pte = *ptep;
 			BUG_ON(pfn != pte_pfn(pte));
 			pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_TILE_L3);
@@ -1501,12 +1596,12 @@ void __init setup_per_cpu_areas(void)
 
 			/* Update the lowmem mapping for consistency. */
 			lowmem_va = (unsigned long)pfn_to_kaddr(pfn);
-			ptep = virt_to_pte(NULL, lowmem_va);
+			ptep = virt_to_kpte(lowmem_va);
 			if (pte_huge(*ptep)) {
 				printk(KERN_DEBUG "early shatter of huge page"
 				       " at %#lx\n", lowmem_va);
 				shatter_pmd((pmd_t *)ptep);
-				ptep = virt_to_pte(NULL, lowmem_va);
+				ptep = virt_to_kpte(lowmem_va);
 				BUG_ON(pte_huge(*ptep));
 			}
 			BUG_ON(pfn != pte_pfn(*ptep));
@@ -1548,6 +1643,8 @@ insert_non_bus_resource(void)
 {
 	struct resource *res =
 		kzalloc(sizeof(struct resource), GFP_ATOMIC);
+	if (!res)
+		return NULL;
 	res->name = "Non-Bus Physical Address Space";
 	res->start = (1ULL << 32);
 	res->end = -1LL;
@@ -1561,11 +1658,13 @@ insert_non_bus_resource(void)
 #endif
 
 static struct resource* __init
-insert_ram_resource(u64 start_pfn, u64 end_pfn)
+insert_ram_resource(u64 start_pfn, u64 end_pfn, bool reserved)
 {
 	struct resource *res =
 		kzalloc(sizeof(struct resource), GFP_ATOMIC);
-	res->name = "System RAM";
+	if (!res)
+		return NULL;
+	res->name = reserved ? "Reserved" : "System RAM";
 	res->start = start_pfn << PAGE_SHIFT;
 	res->end = (end_pfn << PAGE_SHIFT) - 1;
 	res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
@@ -1585,7 +1684,7 @@ insert_ram_resource(u64 start_pfn, u64 end_pfn)
 static int __init request_standard_resources(void)
 {
 	int i;
-	enum { CODE_DELTA = MEM_SV_INTRPT - PAGE_OFFSET };
+	enum { CODE_DELTA = MEM_SV_START - PAGE_OFFSET };
 
 #if defined(CONFIG_PCI) && !defined(__tilegx__)
 	insert_non_bus_resource();
@@ -1600,11 +1699,11 @@ static int __init request_standard_resources(void)
 		    end_pfn > pci_reserve_start_pfn) {
 			if (end_pfn > pci_reserve_end_pfn)
 				insert_ram_resource(pci_reserve_end_pfn,
-						     end_pfn);
+						    end_pfn, 0);
 			end_pfn = pci_reserve_start_pfn;
 		}
 #endif
-		insert_ram_resource(start_pfn, end_pfn);
+		insert_ram_resource(start_pfn, end_pfn, 0);
 	}
 
 	code_resource.start = __pa(_text - CODE_DELTA);
@@ -1615,6 +1714,13 @@ static int __init request_standard_resources(void)
 	insert_resource(&iomem_resource, &code_resource);
 	insert_resource(&iomem_resource, &data_resource);
 
+	/* Mark any "memmap" regions busy for the resource manager. */
+	for (i = 0; i < memmap_nr; ++i) {
+		struct memmap_entry *m = &memmap_map[i];
+		insert_ram_resource(PFN_DOWN(m->addr),
+				    PFN_UP(m->addr + m->size - 1), 1);
+	}
+
 #ifdef CONFIG_KEXEC
 	insert_resource(&iomem_resource, &crashk_res);
 #endif
diff --git a/arch/tile/kernel/signal.c b/arch/tile/kernel/signal.c
index 9531845bf661..2d1dbf38a9ab 100644
--- a/arch/tile/kernel/signal.c
+++ b/arch/tile/kernel/signal.c
@@ -33,6 +33,7 @@
 #include <asm/ucontext.h>
 #include <asm/sigframe.h>
 #include <asm/syscalls.h>
+#include <asm/vdso.h>
 #include <arch/interrupts.h>
 
 #define DEBUG_SIG 0
@@ -190,7 +191,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 	if (err)
 		goto give_sigsegv;
 
-	restorer = VDSO_BASE;
+	restorer = VDSO_SYM(&__vdso_rt_sigreturn);
 	if (ka->sa.sa_flags & SA_RESTORER)
 		restorer = (unsigned long) ka->sa.sa_restorer;
 
diff --git a/arch/tile/kernel/single_step.c b/arch/tile/kernel/single_step.c
index 27742e87e255..de07fa7d1315 100644
--- a/arch/tile/kernel/single_step.c
+++ b/arch/tile/kernel/single_step.c
@@ -12,41 +12,30 @@
  *   more details.
  *
  * A code-rewriter that enables instruction single-stepping.
- * Derived from iLib's single-stepping code.
  */
 
-#ifndef __tilegx__   /* Hardware support for single step unavailable. */
-
-/* These functions are only used on the TILE platform */
+#include <linux/smp.h>
+#include <linux/ptrace.h>
 #include <linux/slab.h>
 #include <linux/thread_info.h>
 #include <linux/uaccess.h>
 #include <linux/mman.h>
 #include <linux/types.h>
 #include <linux/err.h>
+#include <linux/prctl.h>
 #include <asm/cacheflush.h>
+#include <asm/traps.h>
+#include <asm/uaccess.h>
 #include <asm/unaligned.h>
 #include <arch/abi.h>
+#include <arch/spr_def.h>
 #include <arch/opcode.h>
 
-#define signExtend17(val) sign_extend((val), 17)
-#define TILE_X1_MASK (0xffffffffULL << 31)
-
-int unaligned_printk;
 
-static int __init setup_unaligned_printk(char *str)
-{
-	long val;
-	if (strict_strtol(str, 0, &val) != 0)
-		return 0;
-	unaligned_printk = val;
-	pr_info("Printk for each unaligned data accesses is %s\n",
-		unaligned_printk ? "enabled" : "disabled");
-	return 1;
-}
-__setup("unaligned_printk=", setup_unaligned_printk);
+#ifndef __tilegx__   /* Hardware support for single step unavailable. */
 
-unsigned int unaligned_fixup_count;
+#define signExtend17(val) sign_extend((val), 17)
+#define TILE_X1_MASK (0xffffffffULL << 31)
 
 enum mem_op {
 	MEMOP_NONE,
@@ -56,12 +45,13 @@ enum mem_op {
 	MEMOP_STORE_POSTINCR
 };
 
-static inline tile_bundle_bits set_BrOff_X1(tile_bundle_bits n, s32 offset)
+static inline tilepro_bundle_bits set_BrOff_X1(tilepro_bundle_bits n,
+	s32 offset)
 {
-	tile_bundle_bits result;
+	tilepro_bundle_bits result;
 
 	/* mask out the old offset */
-	tile_bundle_bits mask = create_BrOff_X1(-1);
+	tilepro_bundle_bits mask = create_BrOff_X1(-1);
 	result = n & (~mask);
 
 	/* or in the new offset */
@@ -70,10 +60,11 @@ static inline tile_bundle_bits set_BrOff_X1(tile_bundle_bits n, s32 offset)
 	return result;
 }
 
-static inline tile_bundle_bits move_X1(tile_bundle_bits n, int dest, int src)
+static inline tilepro_bundle_bits move_X1(tilepro_bundle_bits n, int dest,
+	int src)
 {
-	tile_bundle_bits result;
-	tile_bundle_bits op;
+	tilepro_bundle_bits result;
+	tilepro_bundle_bits op;
 
 	result = n & (~TILE_X1_MASK);
 
@@ -87,13 +78,13 @@ static inline tile_bundle_bits move_X1(tile_bundle_bits n, int dest, int src)
 	return result;
 }
 
-static inline tile_bundle_bits nop_X1(tile_bundle_bits n)
+static inline tilepro_bundle_bits nop_X1(tilepro_bundle_bits n)
 {
 	return move_X1(n, TREG_ZERO, TREG_ZERO);
 }
 
-static inline tile_bundle_bits addi_X1(
-	tile_bundle_bits n, int dest, int src, int imm)
+static inline tilepro_bundle_bits addi_X1(
+	tilepro_bundle_bits n, int dest, int src, int imm)
 {
 	n &= ~TILE_X1_MASK;
 
@@ -107,15 +98,26 @@ static inline tile_bundle_bits addi_X1(
 	return n;
 }
 
-static tile_bundle_bits rewrite_load_store_unaligned(
+static tilepro_bundle_bits rewrite_load_store_unaligned(
 	struct single_step_state *state,
-	tile_bundle_bits bundle,
+	tilepro_bundle_bits bundle,
 	struct pt_regs *regs,
 	enum mem_op mem_op,
 	int size, int sign_ext)
 {
 	unsigned char __user *addr;
 	int val_reg, addr_reg, err, val;
+	int align_ctl;
+
+	align_ctl = unaligned_fixup;
+	switch (task_thread_info(current)->align_ctl) {
+	case PR_UNALIGN_NOPRINT:
+		align_ctl = 1;
+		break;
+	case PR_UNALIGN_SIGBUS:
+		align_ctl = 0;
+		break;
+	}
 
 	/* Get address and value registers */
 	if (bundle & TILEPRO_BUNDLE_Y_ENCODING_MASK) {
@@ -160,7 +162,7 @@ static tile_bundle_bits rewrite_load_store_unaligned(
 	 * tilepro hardware would be doing, if it could provide us with the
 	 * actual bad address in an SPR, which it doesn't.
 	 */
-	if (unaligned_fixup == 0) {
+	if (align_ctl == 0) {
 		siginfo_t info = {
 			.si_signo = SIGBUS,
 			.si_code = BUS_ADRALN,
@@ -209,14 +211,14 @@ static tile_bundle_bits rewrite_load_store_unaligned(
 
 	if (err) {
 		siginfo_t info = {
-			.si_signo = SIGSEGV,
-			.si_code = SEGV_MAPERR,
+			.si_signo = SIGBUS,
+			.si_code = BUS_ADRALN,
 			.si_addr = addr
 		};
-		trace_unhandled_signal("segfault", regs,
-				       (unsigned long)addr, SIGSEGV);
+		trace_unhandled_signal("bad address for unaligned fixup", regs,
+				       (unsigned long)addr, SIGBUS);
 		force_sig_info(info.si_signo, &info, current);
-		return (tile_bundle_bits) 0;
+		return (tilepro_bundle_bits) 0;
 	}
 
 	if (unaligned_printk || unaligned_fixup_count == 0) {
@@ -285,7 +287,7 @@ void single_step_execve(void)
 	ti->step_state = NULL;
 }
 
-/**
+/*
  * single_step_once() - entry point when single stepping has been triggered.
  * @regs: The machine register state
  *
@@ -304,20 +306,31 @@ void single_step_execve(void)
  */
 void single_step_once(struct pt_regs *regs)
 {
-	extern tile_bundle_bits __single_step_ill_insn;
-	extern tile_bundle_bits __single_step_j_insn;
-	extern tile_bundle_bits __single_step_addli_insn;
-	extern tile_bundle_bits __single_step_auli_insn;
+	extern tilepro_bundle_bits __single_step_ill_insn;
+	extern tilepro_bundle_bits __single_step_j_insn;
+	extern tilepro_bundle_bits __single_step_addli_insn;
+	extern tilepro_bundle_bits __single_step_auli_insn;
 	struct thread_info *info = (void *)current_thread_info();
 	struct single_step_state *state = info->step_state;
 	int is_single_step = test_ti_thread_flag(info, TIF_SINGLESTEP);
-	tile_bundle_bits __user *buffer, *pc;
-	tile_bundle_bits bundle;
+	tilepro_bundle_bits __user *buffer, *pc;
+	tilepro_bundle_bits bundle;
 	int temp_reg;
 	int target_reg = TREG_LR;
 	int err;
 	enum mem_op mem_op = MEMOP_NONE;
 	int size = 0, sign_ext = 0;  /* happy compiler */
+	int align_ctl;
+
+	align_ctl = unaligned_fixup;
+	switch (task_thread_info(current)->align_ctl) {
+	case PR_UNALIGN_NOPRINT:
+		align_ctl = 1;
+		break;
+	case PR_UNALIGN_SIGBUS:
+		align_ctl = 0;
+		break;
+	}
 
 	asm(
 "    .pushsection .rodata.single_step\n"
@@ -390,7 +403,7 @@ void single_step_once(struct pt_regs *regs)
 	if (regs->faultnum == INT_SWINT_1)
 		regs->pc -= 8;
 
-	pc = (tile_bundle_bits __user *)(regs->pc);
+	pc = (tilepro_bundle_bits __user *)(regs->pc);
 	if (get_user(bundle, pc) != 0) {
 		pr_err("Couldn't read instruction at %p trying to step\n", pc);
 		return;
@@ -533,7 +546,6 @@ void single_step_once(struct pt_regs *regs)
 			}
 			break;
 
-#if CHIP_HAS_WH64()
 		/* postincrement operations */
 		case IMM_0_OPCODE_X1:
 			switch (get_ImmOpcodeExtension_X1(bundle)) {
@@ -568,7 +580,6 @@ void single_step_once(struct pt_regs *regs)
 				break;
 			}
 			break;
-#endif /* CHIP_HAS_WH64() */
 		}
 
 		if (state->update) {
@@ -627,9 +638,9 @@ void single_step_once(struct pt_regs *regs)
 
 	/*
 	 * Check if we need to rewrite an unaligned load/store.
-	 * Returning zero is a special value meaning we need to SIGSEGV.
+	 * Returning zero is a special value meaning we generated a signal.
 	 */
-	if (mem_op != MEMOP_NONE && unaligned_fixup >= 0) {
+	if (mem_op != MEMOP_NONE && align_ctl >= 0) {
 		bundle = rewrite_load_store_unaligned(state, bundle, regs,
 						      mem_op, size, sign_ext);
 		if (bundle == 0)
@@ -668,9 +679,9 @@ void single_step_once(struct pt_regs *regs)
 		}
 
 		/* End with a jump back to the next instruction */
-		delta = ((regs->pc + TILE_BUNDLE_SIZE_IN_BYTES) -
+		delta = ((regs->pc + TILEPRO_BUNDLE_SIZE_IN_BYTES) -
 			(unsigned long)buffer) >>
-			TILE_LOG2_BUNDLE_ALIGNMENT_IN_BYTES;
+			TILEPRO_LOG2_BUNDLE_ALIGNMENT_IN_BYTES;
 		bundle = __single_step_j_insn;
 		bundle |= create_JOffLong_X1(delta);
 		err |= __put_user(bundle, buffer++);
@@ -698,9 +709,6 @@ void single_step_once(struct pt_regs *regs)
 }
 
 #else
-#include <linux/smp.h>
-#include <linux/ptrace.h>
-#include <arch/spr_def.h>
 
 static DEFINE_PER_CPU(unsigned long, ss_saved_pc);
 
@@ -743,10 +751,10 @@ void gx_singlestep_handle(struct pt_regs *regs, int fault_num)
 	} else if ((*ss_pc != regs->pc) ||
 		   (!(control & SPR_SINGLE_STEP_CONTROL_1__CANCELED_MASK))) {
 
-		ptrace_notify(SIGTRAP);
 		control |= SPR_SINGLE_STEP_CONTROL_1__CANCELED_MASK;
 		control |= SPR_SINGLE_STEP_CONTROL_1__INHIBIT_MASK;
 		__insn_mtspr(SPR_SINGLE_STEP_CONTROL_K, control);
+		send_sigtrap(current, regs);
 	}
 }
 
diff --git a/arch/tile/kernel/smp.c b/arch/tile/kernel/smp.c
index cbc73a8b8fe1..01e8ab29f43a 100644
--- a/arch/tile/kernel/smp.c
+++ b/arch/tile/kernel/smp.c
@@ -20,8 +20,13 @@
 #include <linux/irq.h>
 #include <linux/module.h>
 #include <asm/cacheflush.h>
+#include <asm/homecache.h>
 
-HV_Topology smp_topology __write_once;
+/*
+ * We write to width and height with a single store in head_NN.S,
+ * so make the variable aligned to "long".
+ */
+HV_Topology smp_topology __write_once __aligned(sizeof(long));
 EXPORT_SYMBOL(smp_topology);
 
 #if CHIP_HAS_IPI()
@@ -100,8 +105,8 @@ static void smp_start_cpu_interrupt(void)
 /* Handler to stop the current cpu. */
 static void smp_stop_cpu_interrupt(void)
 {
-	set_cpu_online(smp_processor_id(), 0);
 	arch_local_irq_disable_all();
+	set_cpu_online(smp_processor_id(), 0);
 	for (;;)
 		asm("nap; nop");
 }
@@ -167,9 +172,16 @@ static void ipi_flush_icache_range(void *info)
 void flush_icache_range(unsigned long start, unsigned long end)
 {
 	struct ipi_flush flush = { start, end };
-	preempt_disable();
-	on_each_cpu(ipi_flush_icache_range, &flush, 1);
-	preempt_enable();
+
+	/* If invoked with irqs disabled, we can not issue IPIs. */
+	if (irqs_disabled())
+		flush_remote(0, HV_FLUSH_EVICT_L1I, NULL, 0, 0, 0,
+			NULL, NULL, 0);
+	else {
+		preempt_disable();
+		on_each_cpu(ipi_flush_icache_range, &flush, 1);
+		preempt_enable();
+	}
 }
 
 
diff --git a/arch/tile/kernel/smpboot.c b/arch/tile/kernel/smpboot.c
index a535655b7089..732e9d138661 100644
--- a/arch/tile/kernel/smpboot.c
+++ b/arch/tile/kernel/smpboot.c
@@ -142,13 +142,15 @@ static struct cpumask cpu_started;
  */
 static void start_secondary(void)
 {
-	int cpuid = smp_processor_id();
+	int cpuid;
+
+	preempt_disable();
+
+	cpuid = smp_processor_id();
 
 	/* Set our thread pointer appropriately. */
 	set_my_cpu_offset(__per_cpu_offset[cpuid]);
 
-	preempt_disable();
-
 	/*
 	 * In large machines even this will slow us down, since we
 	 * will be contending for for the printk spinlock.
diff --git a/arch/tile/kernel/stack.c b/arch/tile/kernel/stack.c
index af8dfc9665f6..362284af3afd 100644
--- a/arch/tile/kernel/stack.c
+++ b/arch/tile/kernel/stack.c
@@ -29,6 +29,7 @@
 #include <asm/switch_to.h>
 #include <asm/sigframe.h>
 #include <asm/stack.h>
+#include <asm/vdso.h>
 #include <arch/abi.h>
 #include <arch/interrupts.h>
 
@@ -102,9 +103,8 @@ static struct pt_regs *valid_fault_handler(struct KBacktraceIterator* kbt)
 	    p->sp >= sp) {
 		if (kbt->verbose)
 			pr_err("  <%s while in kernel mode>\n", fault);
-	} else if (EX1_PL(p->ex1) == USER_PL &&
-	    p->pc < PAGE_OFFSET &&
-	    p->sp < PAGE_OFFSET) {
+	} else if (user_mode(p) &&
+		   p->sp < PAGE_OFFSET && p->sp != 0) {
 		if (kbt->verbose)
 			pr_err("  <%s while in user mode>\n", fault);
 	} else if (kbt->verbose) {
@@ -120,7 +120,7 @@ static struct pt_regs *valid_fault_handler(struct KBacktraceIterator* kbt)
 /* Is the pc pointing to a sigreturn trampoline? */
 static int is_sigreturn(unsigned long pc)
 {
-	return (pc == VDSO_BASE);
+	return current->mm && (pc == VDSO_SYM(&__vdso_rt_sigreturn));
 }
 
 /* Return a pt_regs pointer for a valid signal handler frame */
@@ -129,7 +129,7 @@ static struct pt_regs *valid_sigframe(struct KBacktraceIterator* kbt,
 {
 	BacktraceIterator *b = &kbt->it;
 
-	if (b->pc == VDSO_BASE && b->sp < PAGE_OFFSET &&
+	if (is_sigreturn(b->pc) && b->sp < PAGE_OFFSET &&
 	    b->sp % sizeof(long) == 0) {
 		int retval;
 		pagefault_disable();
@@ -195,21 +195,21 @@ static int KBacktraceIterator_next_item_inclusive(
  */
 static void validate_stack(struct pt_regs *regs)
 {
-	int cpu = smp_processor_id();
+	int cpu = raw_smp_processor_id();
 	unsigned long ksp0 = get_current_ksp0();
-	unsigned long ksp0_base = ksp0 - THREAD_SIZE;
+	unsigned long ksp0_base = ksp0 & -THREAD_SIZE;
 	unsigned long sp = stack_pointer;
 
 	if (EX1_PL(regs->ex1) == KERNEL_PL && regs->sp >= ksp0) {
-		pr_err("WARNING: cpu %d: kernel stack page %#lx underrun!\n"
+		pr_err("WARNING: cpu %d: kernel stack %#lx..%#lx underrun!\n"
 		       "  sp %#lx (%#lx in caller), caller pc %#lx, lr %#lx\n",
-		       cpu, ksp0_base, sp, regs->sp, regs->pc, regs->lr);
+		       cpu, ksp0_base, ksp0, sp, regs->sp, regs->pc, regs->lr);
 	}
 
 	else if (sp < ksp0_base + sizeof(struct thread_info)) {
-		pr_err("WARNING: cpu %d: kernel stack page %#lx overrun!\n"
+		pr_err("WARNING: cpu %d: kernel stack %#lx..%#lx overrun!\n"
 		       "  sp %#lx (%#lx in caller), caller pc %#lx, lr %#lx\n",
-		       cpu, ksp0_base, sp, regs->sp, regs->pc, regs->lr);
+		       cpu, ksp0_base, ksp0, sp, regs->sp, regs->pc, regs->lr);
 	}
 }
 
@@ -352,6 +352,26 @@ static void describe_addr(struct KBacktraceIterator *kbt,
 }
 
 /*
+ * Avoid possible crash recursion during backtrace.  If it happens, it
+ * makes it easy to lose the actual root cause of the failure, so we
+ * put a simple guard on all the backtrace loops.
+ */
+static bool start_backtrace(void)
+{
+	if (current->thread.in_backtrace) {
+		pr_err("Backtrace requested while in backtrace!\n");
+		return false;
+	}
+	current->thread.in_backtrace = true;
+	return true;
+}
+
+static void end_backtrace(void)
+{
+	current->thread.in_backtrace = false;
+}
+
+/*
  * This method wraps the backtracer's more generic support.
  * It is only invoked from the architecture-specific code; show_stack()
  * and dump_stack() (in entry.S) are architecture-independent entry points.
@@ -361,6 +381,8 @@ void tile_show_stack(struct KBacktraceIterator *kbt, int headers)
 	int i;
 	int have_mmap_sem = 0;
 
+	if (!start_backtrace())
+		return;
 	if (headers) {
 		/*
 		 * Add a blank line since if we are called from panic(),
@@ -371,7 +393,7 @@ void tile_show_stack(struct KBacktraceIterator *kbt, int headers)
 		pr_err("Starting stack dump of tid %d, pid %d (%s)"
 		       " on cpu %d at cycle %lld\n",
 		       kbt->task->pid, kbt->task->tgid, kbt->task->comm,
-		       smp_processor_id(), get_cycles());
+		       raw_smp_processor_id(), get_cycles());
 	}
 	kbt->verbose = 1;
 	i = 0;
@@ -402,6 +424,7 @@ void tile_show_stack(struct KBacktraceIterator *kbt, int headers)
 		pr_err("Stack dump complete\n");
 	if (have_mmap_sem)
 		up_read(&kbt->task->mm->mmap_sem);
+	end_backtrace();
 }
 EXPORT_SYMBOL(tile_show_stack);
 
@@ -463,6 +486,8 @@ void save_stack_trace_tsk(struct task_struct *task, struct stack_trace *trace)
 	int skip = trace->skip;
 	int i = 0;
 
+	if (!start_backtrace())
+		goto done;
 	if (task == NULL || task == current)
 		KBacktraceIterator_init_current(&kbt);
 	else
@@ -476,6 +501,8 @@ void save_stack_trace_tsk(struct task_struct *task, struct stack_trace *trace)
 			break;
 		trace->entries[i++] = kbt.it.pc;
 	}
+	end_backtrace();
+done:
 	trace->nr_entries = i;
 }
 EXPORT_SYMBOL(save_stack_trace_tsk);
diff --git a/arch/tile/kernel/sys.c b/arch/tile/kernel/sys.c
index b881a7be24bd..38debe706061 100644
--- a/arch/tile/kernel/sys.c
+++ b/arch/tile/kernel/sys.c
@@ -38,8 +38,10 @@
 SYSCALL_DEFINE3(cacheflush, unsigned long, addr, unsigned long, len,
 		unsigned long, flags)
 {
+	/* DCACHE is not particularly effective if not bound to one cpu. */
 	if (flags & DCACHE)
-		homecache_evict(cpumask_of(smp_processor_id()));
+		homecache_evict(cpumask_of(raw_smp_processor_id()));
+
 	if (flags & ICACHE)
 		flush_remote(0, HV_FLUSH_EVICT_L1I, mm_cpumask(current->mm),
 			     0, 0, 0, NULL, NULL, 0);
diff --git a/arch/tile/kernel/sysfs.c b/arch/tile/kernel/sysfs.c
index e25b0a89c18f..a3ed12f8f83b 100644
--- a/arch/tile/kernel/sysfs.c
+++ b/arch/tile/kernel/sysfs.c
@@ -157,6 +157,67 @@ hvconfig_bin_read(struct file *filp, struct kobject *kobj,
 	return count;
 }
 
+static ssize_t hv_stats_show(struct device *dev,
+			     struct device_attribute *attr,
+			     char *page)
+{
+	int cpu = dev->id;
+	long lotar = HV_XY_TO_LOTAR(cpu_x(cpu), cpu_y(cpu));
+
+	ssize_t n = hv_confstr(HV_CONFSTR_HV_STATS,
+			       (unsigned long)page, PAGE_SIZE - 1,
+			       lotar, 0);
+	n = n < 0 ? 0 : min(n, (ssize_t)PAGE_SIZE - 1);
+	page[n] = '\0';
+	return n;
+}
+
+static ssize_t hv_stats_store(struct device *dev,
+			      struct device_attribute *attr,
+			      const char *page,
+			      size_t count)
+{
+	int cpu = dev->id;
+	long lotar = HV_XY_TO_LOTAR(cpu_x(cpu), cpu_y(cpu));
+
+	ssize_t n = hv_confstr(HV_CONFSTR_HV_STATS, 0, 0, lotar, 1);
+	return n < 0 ? n : count;
+}
+
+static DEVICE_ATTR(hv_stats, 0644, hv_stats_show, hv_stats_store);
+
+static int hv_stats_device_add(struct device *dev, struct subsys_interface *sif)
+{
+	int err, cpu = dev->id;
+
+	if (!cpu_online(cpu))
+		return 0;
+
+	err = sysfs_create_file(&dev->kobj, &dev_attr_hv_stats.attr);
+
+	return err;
+}
+
+static int hv_stats_device_remove(struct device *dev,
+				  struct subsys_interface *sif)
+{
+	int cpu = dev->id;
+
+	if (!cpu_online(cpu))
+		return 0;
+
+	sysfs_remove_file(&dev->kobj, &dev_attr_hv_stats.attr);
+	return 0;
+}
+
+
+static struct subsys_interface hv_stats_interface = {
+	.name			= "hv_stats",
+	.subsys			= &cpu_subsys,
+	.add_dev		= hv_stats_device_add,
+	.remove_dev		= hv_stats_device_remove,
+};
+
 static int __init create_sysfs_entries(void)
 {
 	int err = 0;
@@ -188,6 +249,21 @@ static int __init create_sysfs_entries(void)
 		err = sysfs_create_bin_file(hypervisor_kobj, &hvconfig_bin);
 	}
 
+	if (!err) {
+		/*
+		 * Don't bother adding the hv_stats files on each CPU if
+		 * our hypervisor doesn't supply statistics.
+		 */
+		int cpu = raw_smp_processor_id();
+		long lotar = HV_XY_TO_LOTAR(cpu_x(cpu), cpu_y(cpu));
+		char dummy;
+		ssize_t n = hv_confstr(HV_CONFSTR_HV_STATS,
+				       (unsigned long) &dummy, 1,
+				       lotar, 0);
+		if (n >= 0)
+			err = subsys_interface_register(&hv_stats_interface);
+	}
+
 	return err;
 }
 subsys_initcall(create_sysfs_entries);
diff --git a/arch/tile/kernel/time.c b/arch/tile/kernel/time.c
index 7c353d8c2da9..5d10642db63e 100644
--- a/arch/tile/kernel/time.c
+++ b/arch/tile/kernel/time.c
@@ -23,8 +23,10 @@
 #include <linux/smp.h>
 #include <linux/delay.h>
 #include <linux/module.h>
+#include <linux/timekeeper_internal.h>
 #include <asm/irq_regs.h>
 #include <asm/traps.h>
+#include <asm/vdso.h>
 #include <hv/hypervisor.h>
 #include <arch/interrupts.h>
 #include <arch/spr_def.h>
@@ -110,7 +112,6 @@ void __init time_init(void)
 	setup_tile_timer();
 }
 
-
 /*
  * Define the tile timer clock event device.  The timer is driven by
  * the TILE_TIMER_CONTROL register, which consists of a 31-bit down
@@ -237,3 +238,37 @@ cycles_t ns2cycles(unsigned long nsecs)
 	struct clock_event_device *dev = &__raw_get_cpu_var(tile_timer);
 	return ((u64)nsecs * dev->mult) >> dev->shift;
 }
+
+void update_vsyscall_tz(void)
+{
+	/* Userspace gettimeofday will spin while this value is odd. */
+	++vdso_data->tz_update_count;
+	smp_wmb();
+	vdso_data->tz_minuteswest = sys_tz.tz_minuteswest;
+	vdso_data->tz_dsttime = sys_tz.tz_dsttime;
+	smp_wmb();
+	++vdso_data->tz_update_count;
+}
+
+void update_vsyscall(struct timekeeper *tk)
+{
+	struct timespec wall_time = tk_xtime(tk);
+	struct timespec *wtm = &tk->wall_to_monotonic;
+	struct clocksource *clock = tk->clock;
+
+	if (clock != &cycle_counter_cs)
+		return;
+
+	/* Userspace gettimeofday will spin while this value is odd. */
+	++vdso_data->tb_update_count;
+	smp_wmb();
+	vdso_data->xtime_tod_stamp = clock->cycle_last;
+	vdso_data->xtime_clock_sec = wall_time.tv_sec;
+	vdso_data->xtime_clock_nsec = wall_time.tv_nsec;
+	vdso_data->wtom_clock_sec = wtm->tv_sec;
+	vdso_data->wtom_clock_nsec = wtm->tv_nsec;
+	vdso_data->mult = clock->mult;
+	vdso_data->shift = clock->shift;
+	smp_wmb();
+	++vdso_data->tb_update_count;
+}
diff --git a/arch/tile/kernel/tlb.c b/arch/tile/kernel/tlb.c
index 3fd54d5bbd4c..f23b53515671 100644
--- a/arch/tile/kernel/tlb.c
+++ b/arch/tile/kernel/tlb.c
@@ -91,8 +91,14 @@ void flush_tlb_all(void)
 	}
 }
 
+/*
+ * Callers need to flush the L1I themselves if necessary, e.g. for
+ * kernel module unload.  Otherwise we assume callers are not using
+ * executable pgprot_t's.  Using EVICT_L1I means that dataplane cpus
+ * will get an unnecessary interrupt otherwise.
+ */
 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
-	flush_remote(0, HV_FLUSH_EVICT_L1I, cpu_online_mask,
+	flush_remote(0, 0, NULL,
 		     start, end - start, PAGE_SIZE, cpu_online_mask, NULL, 0);
 }
diff --git a/arch/tile/kernel/traps.c b/arch/tile/kernel/traps.c
index 5b19a23c8908..6b603d556ca6 100644
--- a/arch/tile/kernel/traps.c
+++ b/arch/tile/kernel/traps.c
@@ -15,6 +15,7 @@
 #include <linux/sched.h>
 #include <linux/kernel.h>
 #include <linux/kprobes.h>
+#include <linux/kdebug.h>
 #include <linux/module.h>
 #include <linux/reboot.h>
 #include <linux/uaccess.h>
@@ -29,7 +30,7 @@
 
 void __init trap_init(void)
 {
-	/* Nothing needed here since we link code at .intrpt1 */
+	/* Nothing needed here since we link code at .intrpt */
 }
 
 int unaligned_fixup = 1;
@@ -100,13 +101,7 @@ static int retry_gpv(unsigned int gpv_reason)
 
 #endif /* CHIP_HAS_TILE_DMA() */
 
-#ifdef __tilegx__
-#define bundle_bits tilegx_bundle_bits
-#else
-#define bundle_bits tile_bundle_bits
-#endif
-
-extern bundle_bits bpt_code;
+extern tile_bundle_bits bpt_code;
 
 asm(".pushsection .rodata.bpt_code,\"a\";"
     ".align 8;"
@@ -114,7 +109,7 @@ asm(".pushsection .rodata.bpt_code,\"a\";"
     ".size bpt_code,.-bpt_code;"
     ".popsection");
 
-static int special_ill(bundle_bits bundle, int *sigp, int *codep)
+static int special_ill(tile_bundle_bits bundle, int *sigp, int *codep)
 {
 	int sig, code, maxcode;
 
@@ -214,24 +209,73 @@ static const char *const int_name[] = {
 #endif
 };
 
+static int do_bpt(struct pt_regs *regs)
+{
+	unsigned long bundle, bcode, bpt;
+
+	bundle = *(unsigned long *)instruction_pointer(regs);
+
+	/*
+	 * bpt shoule be { bpt; nop }, which is 0x286a44ae51485000ULL.
+	 * we encode the unused least significant bits for other purpose.
+	 */
+	bpt = bundle & ~((1ULL << 12) - 1);
+	if (bpt != TILE_BPT_BUNDLE)
+		return 0;
+
+	bcode = bundle & ((1ULL << 12) - 1);
+	/*
+	 * notify the kprobe handlers, if instruction is likely to
+	 * pertain to them.
+	 */
+	switch (bcode) {
+	/* breakpoint_insn */
+	case 0:
+		notify_die(DIE_BREAK, "debug", regs, bundle,
+			INT_ILL, SIGTRAP);
+		break;
+	/* compiled_bpt */
+	case DIE_COMPILED_BPT:
+		notify_die(DIE_COMPILED_BPT, "debug", regs, bundle,
+			INT_ILL, SIGTRAP);
+		break;
+	/* breakpoint2_insn */
+	case DIE_SSTEPBP:
+		notify_die(DIE_SSTEPBP, "single_step", regs, bundle,
+			INT_ILL, SIGTRAP);
+		break;
+	default:
+		return 0;
+	}
+
+	return 1;
+}
+
 void __kprobes do_trap(struct pt_regs *regs, int fault_num,
 		       unsigned long reason)
 {
 	siginfo_t info = { 0 };
 	int signo, code;
 	unsigned long address = 0;
-	bundle_bits instr;
+	tile_bundle_bits instr;
+	int is_kernel = !user_mode(regs);
+
+	/* Handle breakpoints, etc. */
+	if (is_kernel && fault_num == INT_ILL && do_bpt(regs))
+		return;
 
-	/* Re-enable interrupts. */
-	local_irq_enable();
+	/* Re-enable interrupts, if they were previously enabled. */
+	if (!(regs->flags & PT_FLAGS_DISABLE_IRQ))
+		local_irq_enable();
 
 	/*
 	 * If it hits in kernel mode and we can't fix it up, just exit the
 	 * current process and hope for the best.
 	 */
-	if (!user_mode(regs)) {
+	if (is_kernel) {
 		const char *name;
-		if (fixup_exception(regs))  /* only UNALIGN_DATA in practice */
+		char buf[100];
+		if (fixup_exception(regs))  /* ILL_TRANS or UNALIGN_DATA */
 			return;
 		if (fault_num >= 0 &&
 		    fault_num < sizeof(int_name)/sizeof(int_name[0]) &&
@@ -239,10 +283,16 @@ void __kprobes do_trap(struct pt_regs *regs, int fault_num,
 			name = int_name[fault_num];
 		else
 			name = "Unknown interrupt";
-		pr_alert("Kernel took bad trap %d (%s) at PC %#lx\n",
-			 fault_num, name, regs->pc);
 		if (fault_num == INT_GPV)
-			pr_alert("GPV_REASON is %#lx\n", reason);
+			snprintf(buf, sizeof(buf), "; GPV_REASON %#lx", reason);
+#ifdef __tilegx__
+		else if (fault_num == INT_ILL_TRANS)
+			snprintf(buf, sizeof(buf), "; address %#lx", reason);
+#endif
+		else
+			buf[0] = '\0';
+		pr_alert("Kernel took bad trap %d (%s) at PC %#lx%s\n",
+			 fault_num, name, regs->pc, buf);
 		show_regs(regs);
 		do_exit(SIGKILL);  /* FIXME: implement i386 die() */
 		return;
@@ -324,11 +374,8 @@ void __kprobes do_trap(struct pt_regs *regs, int fault_num,
 		fill_ra_stack();
 
 		signo = SIGSEGV;
+		address = reason;
 		code = SEGV_MAPERR;
-		if (reason & SPR_ILL_TRANS_REASON__I_STREAM_VA_RMASK)
-			address = regs->pc;
-		else
-			address = 0;  /* FIXME: GX: single-step for address */
 		break;
 	}
 #endif
diff --git a/arch/tile/kernel/unaligned.c b/arch/tile/kernel/unaligned.c
new file mode 100644
index 000000000000..b425fb6a480d
--- /dev/null
+++ b/arch/tile/kernel/unaligned.c
@@ -0,0 +1,1609 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ * A code-rewriter that handles unaligned exception.
+ */
+
+#include <linux/smp.h>
+#include <linux/ptrace.h>
+#include <linux/slab.h>
+#include <linux/thread_info.h>
+#include <linux/uaccess.h>
+#include <linux/mman.h>
+#include <linux/types.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/compat.h>
+#include <linux/prctl.h>
+#include <asm/cacheflush.h>
+#include <asm/traps.h>
+#include <asm/uaccess.h>
+#include <asm/unaligned.h>
+#include <arch/abi.h>
+#include <arch/spr_def.h>
+#include <arch/opcode.h>
+
+
+/*
+ * This file handles unaligned exception for tile-Gx. The tilepro's unaligned
+ * exception is supported out of single_step.c
+ */
+
+int unaligned_printk;
+
+static int __init setup_unaligned_printk(char *str)
+{
+	long val;
+	if (kstrtol(str, 0, &val) != 0)
+		return 0;
+	unaligned_printk = val;
+	pr_info("Printk for each unaligned data accesses is %s\n",
+		unaligned_printk ? "enabled" : "disabled");
+	return 1;
+}
+__setup("unaligned_printk=", setup_unaligned_printk);
+
+unsigned int unaligned_fixup_count;
+
+#ifdef __tilegx__
+
+/*
+ * Unalign data jit fixup code fragement. Reserved space is 128 bytes.
+ * The 1st 64-bit word saves fault PC address, 2nd word is the fault
+ * instruction bundle followed by 14 JIT bundles.
+ */
+
+struct unaligned_jit_fragment {
+	unsigned long       pc;
+	tilegx_bundle_bits  bundle;
+	tilegx_bundle_bits  insn[14];
+};
+
+/*
+ * Check if a nop or fnop at bundle's pipeline X0.
+ */
+
+static bool is_bundle_x0_nop(tilegx_bundle_bits bundle)
+{
+	return (((get_UnaryOpcodeExtension_X0(bundle) ==
+		  NOP_UNARY_OPCODE_X0) &&
+		 (get_RRROpcodeExtension_X0(bundle) ==
+		  UNARY_RRR_0_OPCODE_X0) &&
+		 (get_Opcode_X0(bundle) ==
+		  RRR_0_OPCODE_X0)) ||
+		((get_UnaryOpcodeExtension_X0(bundle) ==
+		  FNOP_UNARY_OPCODE_X0) &&
+		 (get_RRROpcodeExtension_X0(bundle) ==
+		  UNARY_RRR_0_OPCODE_X0) &&
+		 (get_Opcode_X0(bundle) ==
+		  RRR_0_OPCODE_X0)));
+}
+
+/*
+ * Check if nop or fnop at bundle's pipeline X1.
+ */
+
+static bool is_bundle_x1_nop(tilegx_bundle_bits bundle)
+{
+	return (((get_UnaryOpcodeExtension_X1(bundle) ==
+		  NOP_UNARY_OPCODE_X1) &&
+		 (get_RRROpcodeExtension_X1(bundle) ==
+		  UNARY_RRR_0_OPCODE_X1) &&
+		 (get_Opcode_X1(bundle) ==
+		  RRR_0_OPCODE_X1)) ||
+		((get_UnaryOpcodeExtension_X1(bundle) ==
+		  FNOP_UNARY_OPCODE_X1) &&
+		 (get_RRROpcodeExtension_X1(bundle) ==
+		  UNARY_RRR_0_OPCODE_X1) &&
+		 (get_Opcode_X1(bundle) ==
+		  RRR_0_OPCODE_X1)));
+}
+
+/*
+ * Check if nop or fnop at bundle's Y0 pipeline.
+ */
+
+static bool is_bundle_y0_nop(tilegx_bundle_bits bundle)
+{
+	return (((get_UnaryOpcodeExtension_Y0(bundle) ==
+		  NOP_UNARY_OPCODE_Y0) &&
+		 (get_RRROpcodeExtension_Y0(bundle) ==
+		  UNARY_RRR_1_OPCODE_Y0) &&
+		 (get_Opcode_Y0(bundle) ==
+		  RRR_1_OPCODE_Y0)) ||
+		((get_UnaryOpcodeExtension_Y0(bundle) ==
+		  FNOP_UNARY_OPCODE_Y0) &&
+		 (get_RRROpcodeExtension_Y0(bundle) ==
+		  UNARY_RRR_1_OPCODE_Y0) &&
+		 (get_Opcode_Y0(bundle) ==
+		  RRR_1_OPCODE_Y0)));
+}
+
+/*
+ * Check if nop or fnop at bundle's pipeline Y1.
+ */
+
+static bool is_bundle_y1_nop(tilegx_bundle_bits bundle)
+{
+	return (((get_UnaryOpcodeExtension_Y1(bundle) ==
+		  NOP_UNARY_OPCODE_Y1) &&
+		 (get_RRROpcodeExtension_Y1(bundle) ==
+		  UNARY_RRR_1_OPCODE_Y1) &&
+		 (get_Opcode_Y1(bundle) ==
+		  RRR_1_OPCODE_Y1)) ||
+		((get_UnaryOpcodeExtension_Y1(bundle) ==
+		  FNOP_UNARY_OPCODE_Y1) &&
+		 (get_RRROpcodeExtension_Y1(bundle) ==
+		  UNARY_RRR_1_OPCODE_Y1) &&
+		 (get_Opcode_Y1(bundle) ==
+		  RRR_1_OPCODE_Y1)));
+}
+
+/*
+ * Test if a bundle's y0 and y1 pipelines are both nop or fnop.
+ */
+
+static bool is_y0_y1_nop(tilegx_bundle_bits bundle)
+{
+	return is_bundle_y0_nop(bundle) && is_bundle_y1_nop(bundle);
+}
+
+/*
+ * Test if a bundle's x0 and x1 pipelines are both nop or fnop.
+ */
+
+static bool is_x0_x1_nop(tilegx_bundle_bits bundle)
+{
+	return is_bundle_x0_nop(bundle) && is_bundle_x1_nop(bundle);
+}
+
+/*
+ * Find the destination, source registers of fault unalign access instruction
+ * at X1 or Y2. Also, allocate up to 3 scratch registers clob1, clob2 and
+ * clob3, which are guaranteed different from any register used in the fault
+ * bundle. r_alias is used to return if the other instructions other than the
+ * unalign load/store shares same register with ra, rb and rd.
+ */
+
+static void find_regs(tilegx_bundle_bits bundle, uint64_t *rd, uint64_t *ra,
+		      uint64_t *rb, uint64_t *clob1, uint64_t *clob2,
+		      uint64_t *clob3, bool *r_alias)
+{
+	int i;
+	uint64_t reg;
+	uint64_t reg_map = 0, alias_reg_map = 0, map;
+	bool alias;
+
+	*ra = -1;
+	*rb = -1;
+
+	if (rd)
+		*rd = -1;
+
+	*clob1 = -1;
+	*clob2 = -1;
+	*clob3 = -1;
+	alias = false;
+
+	/*
+	 * Parse fault bundle, find potential used registers and mark
+	 * corresponding bits in reg_map and alias_map. These 2 bit maps
+	 * are used to find the scratch registers and determine if there
+	 * is register alais.
+	 */
+	if (bundle & TILEGX_BUNDLE_MODE_MASK) {  /* Y Mode Bundle. */
+
+		reg = get_SrcA_Y2(bundle);
+		reg_map |= 1ULL << reg;
+		*ra = reg;
+		reg = get_SrcBDest_Y2(bundle);
+		reg_map |= 1ULL << reg;
+
+		if (rd) {
+			/* Load. */
+			*rd = reg;
+			alias_reg_map = (1ULL << *rd) | (1ULL << *ra);
+		} else {
+			/* Store. */
+			*rb = reg;
+			alias_reg_map = (1ULL << *ra) | (1ULL << *rb);
+		}
+
+		if (!is_bundle_y1_nop(bundle)) {
+			reg = get_SrcA_Y1(bundle);
+			reg_map |= (1ULL << reg);
+			map = (1ULL << reg);
+
+			reg = get_SrcB_Y1(bundle);
+			reg_map |= (1ULL << reg);
+			map |= (1ULL << reg);
+
+			reg = get_Dest_Y1(bundle);
+			reg_map |= (1ULL << reg);
+			map |= (1ULL << reg);
+
+			if (map & alias_reg_map)
+				alias = true;
+		}
+
+		if (!is_bundle_y0_nop(bundle)) {
+			reg = get_SrcA_Y0(bundle);
+			reg_map |= (1ULL << reg);
+			map = (1ULL << reg);
+
+			reg = get_SrcB_Y0(bundle);
+			reg_map |= (1ULL << reg);
+			map |= (1ULL << reg);
+
+			reg = get_Dest_Y0(bundle);
+			reg_map |= (1ULL << reg);
+			map |= (1ULL << reg);
+
+			if (map & alias_reg_map)
+				alias = true;
+		}
+	} else	{ /* X Mode Bundle. */
+
+		reg = get_SrcA_X1(bundle);
+		reg_map |= (1ULL << reg);
+		*ra = reg;
+		if (rd)	{
+			/* Load. */
+			reg = get_Dest_X1(bundle);
+			reg_map |= (1ULL << reg);
+			*rd = reg;
+			alias_reg_map = (1ULL << *rd) | (1ULL << *ra);
+		} else {
+			/* Store. */
+			reg = get_SrcB_X1(bundle);
+			reg_map |= (1ULL << reg);
+			*rb = reg;
+			alias_reg_map = (1ULL << *ra) | (1ULL << *rb);
+		}
+
+		if (!is_bundle_x0_nop(bundle)) {
+			reg = get_SrcA_X0(bundle);
+			reg_map |= (1ULL << reg);
+			map = (1ULL << reg);
+
+			reg = get_SrcB_X0(bundle);
+			reg_map |= (1ULL << reg);
+			map |= (1ULL << reg);
+
+			reg = get_Dest_X0(bundle);
+			reg_map |= (1ULL << reg);
+			map |= (1ULL << reg);
+
+			if (map & alias_reg_map)
+				alias = true;
+		}
+	}
+
+	/*
+	 * "alias" indicates if the unalign access registers have collision
+	 * with others in the same bundle. We jsut simply test all register
+	 * operands case (RRR), ignored the case with immidate. If a bundle
+	 * has no register alias, we may do fixup in a simple or fast manner.
+	 * So if an immidata field happens to hit with a register, we may end
+	 * up fall back to the generic handling.
+	 */
+
+	*r_alias = alias;
+
+	/* Flip bits on reg_map. */
+	reg_map ^= -1ULL;
+
+	/* Scan reg_map lower 54(TREG_SP) bits to find 3 set bits. */
+	for (i = 0; i < TREG_SP; i++) {
+		if (reg_map & (0x1ULL << i)) {
+			if (*clob1 == -1) {
+				*clob1 = i;
+			} else if (*clob2 == -1) {
+				*clob2 = i;
+			} else if (*clob3 == -1) {
+				*clob3 = i;
+				return;
+			}
+		}
+	}
+}
+
+/*
+ * Sanity check for register ra, rb, rd, clob1/2/3. Return true if any of them
+ * is unexpected.
+ */
+
+static bool check_regs(uint64_t rd, uint64_t ra, uint64_t rb,
+		       uint64_t clob1, uint64_t clob2,  uint64_t clob3)
+{
+	bool unexpected = false;
+	if ((ra >= 56) && (ra != TREG_ZERO))
+		unexpected = true;
+
+	if ((clob1 >= 56) || (clob2 >= 56) || (clob3 >= 56))
+		unexpected = true;
+
+	if (rd != -1) {
+		if ((rd >= 56) && (rd != TREG_ZERO))
+			unexpected = true;
+	} else {
+		if ((rb >= 56) && (rb != TREG_ZERO))
+			unexpected = true;
+	}
+	return unexpected;
+}
+
+
+#define  GX_INSN_X0_MASK   ((1ULL << 31) - 1)
+#define  GX_INSN_X1_MASK   (((1ULL << 31) - 1) << 31)
+#define  GX_INSN_Y0_MASK   ((0xFULL << 27) | (0xFFFFFULL))
+#define  GX_INSN_Y1_MASK   (GX_INSN_Y0_MASK << 31)
+#define  GX_INSN_Y2_MASK   ((0x7FULL << 51) | (0x7FULL << 20))
+
+#ifdef __LITTLE_ENDIAN
+#define  GX_INSN_BSWAP(_bundle_)    (_bundle_)
+#else
+#define  GX_INSN_BSWAP(_bundle_)    swab64(_bundle_)
+#endif /* __LITTLE_ENDIAN */
+
+/*
+ * __JIT_CODE(.) creates template bundles in .rodata.unalign_data section.
+ * The corresponding static function jix_x#_###(.) generates partial or
+ * whole bundle based on the template and given arguments.
+ */
+
+#define __JIT_CODE(_X_)						\
+	asm (".pushsection .rodata.unalign_data, \"a\"\n"	\
+	     _X_"\n"						\
+	     ".popsection\n")
+
+__JIT_CODE("__unalign_jit_x1_mtspr:   {mtspr 0,  r0}");
+static tilegx_bundle_bits jit_x1_mtspr(int spr, int reg)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x1_mtspr;
+	return (GX_INSN_BSWAP(__unalign_jit_x1_mtspr) & GX_INSN_X1_MASK) |
+		create_MT_Imm14_X1(spr) | create_SrcA_X1(reg);
+}
+
+__JIT_CODE("__unalign_jit_x1_mfspr:   {mfspr r0, 0}");
+static tilegx_bundle_bits  jit_x1_mfspr(int reg, int spr)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x1_mfspr;
+	return (GX_INSN_BSWAP(__unalign_jit_x1_mfspr) & GX_INSN_X1_MASK) |
+		create_MF_Imm14_X1(spr) | create_Dest_X1(reg);
+}
+
+__JIT_CODE("__unalign_jit_x0_addi:   {addi  r0, r0, 0; iret}");
+static tilegx_bundle_bits  jit_x0_addi(int rd, int ra, int imm8)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x0_addi;
+	return (GX_INSN_BSWAP(__unalign_jit_x0_addi) & GX_INSN_X0_MASK) |
+		create_Dest_X0(rd) | create_SrcA_X0(ra) |
+		create_Imm8_X0(imm8);
+}
+
+__JIT_CODE("__unalign_jit_x1_ldna:   {ldna  r0, r0}");
+static tilegx_bundle_bits  jit_x1_ldna(int rd, int ra)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x1_ldna;
+	return (GX_INSN_BSWAP(__unalign_jit_x1_ldna) &  GX_INSN_X1_MASK) |
+		create_Dest_X1(rd) | create_SrcA_X1(ra);
+}
+
+__JIT_CODE("__unalign_jit_x0_dblalign:   {dblalign r0, r0 ,r0}");
+static tilegx_bundle_bits  jit_x0_dblalign(int rd, int ra, int rb)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x0_dblalign;
+	return (GX_INSN_BSWAP(__unalign_jit_x0_dblalign) & GX_INSN_X0_MASK) |
+		create_Dest_X0(rd) | create_SrcA_X0(ra) |
+		create_SrcB_X0(rb);
+}
+
+__JIT_CODE("__unalign_jit_x1_iret:   {iret}");
+static tilegx_bundle_bits  jit_x1_iret(void)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x1_iret;
+	return GX_INSN_BSWAP(__unalign_jit_x1_iret) & GX_INSN_X1_MASK;
+}
+
+__JIT_CODE("__unalign_jit_x01_fnop:   {fnop;fnop}");
+static tilegx_bundle_bits  jit_x0_fnop(void)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x01_fnop;
+	return GX_INSN_BSWAP(__unalign_jit_x01_fnop) & GX_INSN_X0_MASK;
+}
+
+static tilegx_bundle_bits  jit_x1_fnop(void)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x01_fnop;
+	return GX_INSN_BSWAP(__unalign_jit_x01_fnop) & GX_INSN_X1_MASK;
+}
+
+__JIT_CODE("__unalign_jit_y2_dummy:   {fnop; fnop; ld zero, sp}");
+static tilegx_bundle_bits  jit_y2_dummy(void)
+{
+	extern  tilegx_bundle_bits __unalign_jit_y2_dummy;
+	return GX_INSN_BSWAP(__unalign_jit_y2_dummy) & GX_INSN_Y2_MASK;
+}
+
+static tilegx_bundle_bits  jit_y1_fnop(void)
+{
+	extern  tilegx_bundle_bits __unalign_jit_y2_dummy;
+	return GX_INSN_BSWAP(__unalign_jit_y2_dummy) & GX_INSN_Y1_MASK;
+}
+
+__JIT_CODE("__unalign_jit_x1_st1_add:  {st1_add r1, r0, 0}");
+static tilegx_bundle_bits  jit_x1_st1_add(int ra, int rb, int imm8)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x1_st1_add;
+	return (GX_INSN_BSWAP(__unalign_jit_x1_st1_add) &
+		(~create_SrcA_X1(-1)) &
+		GX_INSN_X1_MASK) | create_SrcA_X1(ra) |
+		create_SrcB_X1(rb) | create_Dest_Imm8_X1(imm8);
+}
+
+__JIT_CODE("__unalign_jit_x1_st:  {crc32_8 r1, r0, r0; st  r0, r0}");
+static tilegx_bundle_bits  jit_x1_st(int ra, int rb)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x1_st;
+	return (GX_INSN_BSWAP(__unalign_jit_x1_st) & GX_INSN_X1_MASK) |
+		create_SrcA_X1(ra) | create_SrcB_X1(rb);
+}
+
+__JIT_CODE("__unalign_jit_x1_st_add:  {st_add  r1, r0, 0}");
+static tilegx_bundle_bits  jit_x1_st_add(int ra, int rb, int imm8)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x1_st_add;
+	return (GX_INSN_BSWAP(__unalign_jit_x1_st_add) &
+		(~create_SrcA_X1(-1)) &
+		GX_INSN_X1_MASK) | create_SrcA_X1(ra) |
+		create_SrcB_X1(rb) | create_Dest_Imm8_X1(imm8);
+}
+
+__JIT_CODE("__unalign_jit_x1_ld:  {crc32_8 r1, r0, r0; ld  r0, r0}");
+static tilegx_bundle_bits  jit_x1_ld(int rd, int ra)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x1_ld;
+	return (GX_INSN_BSWAP(__unalign_jit_x1_ld) & GX_INSN_X1_MASK) |
+		create_Dest_X1(rd) | create_SrcA_X1(ra);
+}
+
+__JIT_CODE("__unalign_jit_x1_ld_add:  {ld_add  r1, r0, 0}");
+static tilegx_bundle_bits  jit_x1_ld_add(int rd, int ra, int imm8)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x1_ld_add;
+	return (GX_INSN_BSWAP(__unalign_jit_x1_ld_add) &
+		(~create_Dest_X1(-1)) &
+		GX_INSN_X1_MASK) | create_Dest_X1(rd) |
+		create_SrcA_X1(ra) | create_Imm8_X1(imm8);
+}
+
+__JIT_CODE("__unalign_jit_x0_bfexts:  {bfexts r0, r0, 0, 0}");
+static tilegx_bundle_bits  jit_x0_bfexts(int rd, int ra, int bfs, int bfe)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x0_bfexts;
+	return (GX_INSN_BSWAP(__unalign_jit_x0_bfexts) &
+		GX_INSN_X0_MASK) |
+		create_Dest_X0(rd) | create_SrcA_X0(ra) |
+		create_BFStart_X0(bfs) | create_BFEnd_X0(bfe);
+}
+
+__JIT_CODE("__unalign_jit_x0_bfextu:  {bfextu r0, r0, 0, 0}");
+static tilegx_bundle_bits  jit_x0_bfextu(int rd, int ra, int bfs, int bfe)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x0_bfextu;
+	return (GX_INSN_BSWAP(__unalign_jit_x0_bfextu) &
+		GX_INSN_X0_MASK) |
+		create_Dest_X0(rd) | create_SrcA_X0(ra) |
+		create_BFStart_X0(bfs) | create_BFEnd_X0(bfe);
+}
+
+__JIT_CODE("__unalign_jit_x1_addi:  {bfextu r1, r1, 0, 0; addi r0, r0, 0}");
+static tilegx_bundle_bits  jit_x1_addi(int rd, int ra, int imm8)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x1_addi;
+	return (GX_INSN_BSWAP(__unalign_jit_x1_addi) & GX_INSN_X1_MASK) |
+		create_Dest_X1(rd) | create_SrcA_X1(ra) |
+		create_Imm8_X1(imm8);
+}
+
+__JIT_CODE("__unalign_jit_x0_shrui:  {shrui r0, r0, 0; iret}");
+static tilegx_bundle_bits  jit_x0_shrui(int rd, int ra, int imm6)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x0_shrui;
+	return (GX_INSN_BSWAP(__unalign_jit_x0_shrui) &
+		GX_INSN_X0_MASK) |
+		create_Dest_X0(rd) | create_SrcA_X0(ra) |
+		create_ShAmt_X0(imm6);
+}
+
+__JIT_CODE("__unalign_jit_x0_rotli:  {rotli r0, r0, 0; iret}");
+static tilegx_bundle_bits  jit_x0_rotli(int rd, int ra, int imm6)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x0_rotli;
+	return (GX_INSN_BSWAP(__unalign_jit_x0_rotli) &
+		GX_INSN_X0_MASK) |
+		create_Dest_X0(rd) | create_SrcA_X0(ra) |
+		create_ShAmt_X0(imm6);
+}
+
+__JIT_CODE("__unalign_jit_x1_bnezt:  {bnezt r0, __unalign_jit_x1_bnezt}");
+static tilegx_bundle_bits  jit_x1_bnezt(int ra, int broff)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x1_bnezt;
+	return (GX_INSN_BSWAP(__unalign_jit_x1_bnezt) &
+		GX_INSN_X1_MASK) |
+		create_SrcA_X1(ra) | create_BrOff_X1(broff);
+}
+
+#undef __JIT_CODE
+
+/*
+ * This function generates unalign fixup JIT.
+ *
+ * We fist find unalign load/store instruction's destination, source
+ * reguisters: ra, rb and rd. and 3 scratch registers by calling
+ * find_regs(...). 3 scratch clobbers should not alias with any register
+ * used in the fault bundle. Then analyze the fault bundle to determine
+ * if it's a load or store, operand width, branch or address increment etc.
+ * At last generated JIT is copied into JIT code area in user space.
+ */
+
+static
+void jit_bundle_gen(struct pt_regs *regs, tilegx_bundle_bits bundle,
+		    int align_ctl)
+{
+	struct thread_info *info = current_thread_info();
+	struct unaligned_jit_fragment frag;
+	struct unaligned_jit_fragment *jit_code_area;
+	tilegx_bundle_bits bundle_2 = 0;
+	/* If bundle_2_enable = false, bundle_2 is fnop/nop operation. */
+	bool     bundle_2_enable = true;
+	uint64_t ra, rb, rd = -1, clob1, clob2, clob3;
+	/*
+	 * Indicate if the unalign access
+	 * instruction's registers hit with
+	 * others in the same bundle.
+	 */
+	bool     alias = false;
+	bool     load_n_store = true;
+	bool     load_store_signed = false;
+	unsigned int  load_store_size = 8;
+	bool     y1_br = false;  /* True, for a branch in same bundle at Y1.*/
+	int      y1_br_reg = 0;
+	/* True for link operation. i.e. jalr or lnk at Y1 */
+	bool     y1_lr = false;
+	int      y1_lr_reg = 0;
+	bool     x1_add = false;/* True, for load/store ADD instruction at X1*/
+	int      x1_add_imm8 = 0;
+	bool     unexpected = false;
+	int      n = 0, k;
+
+	jit_code_area =
+		(struct unaligned_jit_fragment *)(info->unalign_jit_base);
+
+	memset((void *)&frag, 0, sizeof(frag));
+
+	/* 0: X mode, Otherwise: Y mode. */
+	if (bundle & TILEGX_BUNDLE_MODE_MASK) {
+		unsigned int mod, opcode;
+
+		if (get_Opcode_Y1(bundle) == RRR_1_OPCODE_Y1 &&
+		    get_RRROpcodeExtension_Y1(bundle) ==
+		    UNARY_RRR_1_OPCODE_Y1) {
+
+			opcode = get_UnaryOpcodeExtension_Y1(bundle);
+
+			/*
+			 * Test "jalr", "jalrp", "jr", "jrp" instruction at Y1
+			 * pipeline.
+			 */
+			switch (opcode) {
+			case JALR_UNARY_OPCODE_Y1:
+			case JALRP_UNARY_OPCODE_Y1:
+				y1_lr = true;
+				y1_lr_reg = 55; /* Link register. */
+				/* FALLTHROUGH */
+			case JR_UNARY_OPCODE_Y1:
+			case JRP_UNARY_OPCODE_Y1:
+				y1_br = true;
+				y1_br_reg = get_SrcA_Y1(bundle);
+				break;
+			case LNK_UNARY_OPCODE_Y1:
+				/* "lnk" at Y1 pipeline. */
+				y1_lr = true;
+				y1_lr_reg = get_Dest_Y1(bundle);
+				break;
+			}
+		}
+
+		opcode = get_Opcode_Y2(bundle);
+		mod = get_Mode(bundle);
+
+		/*
+		 *  bundle_2 is bundle after making Y2 as a dummy operation
+		 *  - ld zero, sp
+		 */
+		bundle_2 = (bundle & (~GX_INSN_Y2_MASK)) | jit_y2_dummy();
+
+		/* Make Y1 as fnop if Y1 is a branch or lnk operation. */
+		if (y1_br || y1_lr) {
+			bundle_2 &= ~(GX_INSN_Y1_MASK);
+			bundle_2 |= jit_y1_fnop();
+		}
+
+		if (is_y0_y1_nop(bundle_2))
+			bundle_2_enable = false;
+
+		if (mod == MODE_OPCODE_YC2) {
+			/* Store. */
+			load_n_store = false;
+			load_store_size = 1 << opcode;
+			load_store_signed = false;
+			find_regs(bundle, 0, &ra, &rb, &clob1, &clob2,
+				  &clob3, &alias);
+			if (load_store_size > 8)
+				unexpected = true;
+		} else {
+			/* Load. */
+			load_n_store = true;
+			if (mod == MODE_OPCODE_YB2) {
+				switch (opcode) {
+				case LD_OPCODE_Y2:
+					load_store_signed = false;
+					load_store_size = 8;
+					break;
+				case LD4S_OPCODE_Y2:
+					load_store_signed = true;
+					load_store_size = 4;
+					break;
+				case LD4U_OPCODE_Y2:
+					load_store_signed = false;
+					load_store_size = 4;
+					break;
+				default:
+					unexpected = true;
+				}
+			} else if (mod == MODE_OPCODE_YA2) {
+				if (opcode == LD2S_OPCODE_Y2) {
+					load_store_signed = true;
+					load_store_size = 2;
+				} else if (opcode == LD2U_OPCODE_Y2) {
+					load_store_signed = false;
+					load_store_size = 2;
+				} else
+					unexpected = true;
+			} else
+				unexpected = true;
+			find_regs(bundle, &rd, &ra, &rb, &clob1, &clob2,
+				  &clob3, &alias);
+		}
+	} else {
+		unsigned int opcode;
+
+		/* bundle_2 is bundle after making X1 as "fnop". */
+		bundle_2 = (bundle & (~GX_INSN_X1_MASK)) | jit_x1_fnop();
+
+		if (is_x0_x1_nop(bundle_2))
+			bundle_2_enable = false;
+
+		if (get_Opcode_X1(bundle) == RRR_0_OPCODE_X1) {
+			opcode = get_UnaryOpcodeExtension_X1(bundle);
+
+			if (get_RRROpcodeExtension_X1(bundle) ==
+			    UNARY_RRR_0_OPCODE_X1) {
+				load_n_store = true;
+				find_regs(bundle, &rd, &ra, &rb, &clob1,
+					  &clob2, &clob3, &alias);
+
+				switch (opcode) {
+				case LD_UNARY_OPCODE_X1:
+					load_store_signed = false;
+					load_store_size = 8;
+					break;
+				case LD4S_UNARY_OPCODE_X1:
+					load_store_signed = true;
+					/* FALLTHROUGH */
+				case LD4U_UNARY_OPCODE_X1:
+					load_store_size = 4;
+					break;
+
+				case LD2S_UNARY_OPCODE_X1:
+					load_store_signed = true;
+					/* FALLTHROUGH */
+				case LD2U_UNARY_OPCODE_X1:
+					load_store_size = 2;
+					break;
+				default:
+					unexpected = true;
+				}
+			} else {
+				load_n_store = false;
+				load_store_signed = false;
+				find_regs(bundle, 0, &ra, &rb,
+					  &clob1, &clob2, &clob3,
+					  &alias);
+
+				opcode = get_RRROpcodeExtension_X1(bundle);
+				switch (opcode)	{
+				case ST_RRR_0_OPCODE_X1:
+					load_store_size = 8;
+					break;
+				case ST4_RRR_0_OPCODE_X1:
+					load_store_size = 4;
+					break;
+				case ST2_RRR_0_OPCODE_X1:
+					load_store_size = 2;
+					break;
+				default:
+					unexpected = true;
+				}
+			}
+		} else if (get_Opcode_X1(bundle) == IMM8_OPCODE_X1) {
+			load_n_store = true;
+			opcode = get_Imm8OpcodeExtension_X1(bundle);
+			switch (opcode)	{
+			case LD_ADD_IMM8_OPCODE_X1:
+				load_store_size = 8;
+				break;
+
+			case LD4S_ADD_IMM8_OPCODE_X1:
+				load_store_signed = true;
+				/* FALLTHROUGH */
+			case LD4U_ADD_IMM8_OPCODE_X1:
+				load_store_size = 4;
+				break;
+
+			case LD2S_ADD_IMM8_OPCODE_X1:
+				load_store_signed = true;
+				/* FALLTHROUGH */
+			case LD2U_ADD_IMM8_OPCODE_X1:
+				load_store_size = 2;
+				break;
+
+			case ST_ADD_IMM8_OPCODE_X1:
+				load_n_store = false;
+				load_store_size = 8;
+				break;
+			case ST4_ADD_IMM8_OPCODE_X1:
+				load_n_store = false;
+				load_store_size = 4;
+				break;
+			case ST2_ADD_IMM8_OPCODE_X1:
+				load_n_store = false;
+				load_store_size = 2;
+				break;
+			default:
+				unexpected = true;
+			}
+
+			if (!unexpected) {
+				x1_add = true;
+				if (load_n_store)
+					x1_add_imm8 = get_Imm8_X1(bundle);
+				else
+					x1_add_imm8 = get_Dest_Imm8_X1(bundle);
+			}
+
+			find_regs(bundle, load_n_store ? (&rd) : NULL,
+				  &ra, &rb, &clob1, &clob2, &clob3, &alias);
+		} else
+			unexpected = true;
+	}
+
+	/*
+	 * Some sanity check for register numbers extracted from fault bundle.
+	 */
+	if (check_regs(rd, ra, rb, clob1, clob2, clob3) == true)
+		unexpected = true;
+
+	/* Give warning if register ra has an aligned address. */
+	if (!unexpected)
+		WARN_ON(!((load_store_size - 1) & (regs->regs[ra])));
+
+
+	/*
+	 * Fault came from kernel space, here we only need take care of
+	 * unaligned "get_user/put_user" macros defined in "uaccess.h".
+	 * Basically, we will handle bundle like this:
+	 * {ld/2u/4s rd, ra; movei rx, 0} or {st/2/4 ra, rb; movei rx, 0}
+	 * (Refer to file "arch/tile/include/asm/uaccess.h" for details).
+	 * For either load or store, byte-wise operation is performed by calling
+	 * get_user() or put_user(). If the macro returns non-zero value,
+	 * set the value to rx, otherwise set zero to rx. Finally make pc point
+	 * to next bundle and return.
+	 */
+
+	if (EX1_PL(regs->ex1) != USER_PL) {
+
+		unsigned long rx = 0;
+		unsigned long x = 0, ret = 0;
+
+		if (y1_br || y1_lr || x1_add ||
+		    (load_store_signed !=
+		     (load_n_store && load_store_size == 4))) {
+			/* No branch, link, wrong sign-ext or load/store add. */
+			unexpected = true;
+		} else if (!unexpected) {
+			if (bundle & TILEGX_BUNDLE_MODE_MASK) {
+				/*
+				 * Fault bundle is Y mode.
+				 * Check if the Y1 and Y0 is the form of
+				 * { movei rx, 0; nop/fnop }, if yes,
+				 * find the rx.
+				 */
+
+				if ((get_Opcode_Y1(bundle) == ADDI_OPCODE_Y1)
+				    && (get_SrcA_Y1(bundle) == TREG_ZERO) &&
+				    (get_Imm8_Y1(bundle) == 0) &&
+				    is_bundle_y0_nop(bundle)) {
+					rx = get_Dest_Y1(bundle);
+				} else if ((get_Opcode_Y0(bundle) ==
+					    ADDI_OPCODE_Y0) &&
+					   (get_SrcA_Y0(bundle) == TREG_ZERO) &&
+					   (get_Imm8_Y0(bundle) == 0) &&
+					   is_bundle_y1_nop(bundle)) {
+					rx = get_Dest_Y0(bundle);
+				} else {
+					unexpected = true;
+				}
+			} else {
+				/*
+				 * Fault bundle is X mode.
+				 * Check if the X0 is 'movei rx, 0',
+				 * if yes, find the rx.
+				 */
+
+				if ((get_Opcode_X0(bundle) == IMM8_OPCODE_X0)
+				    && (get_Imm8OpcodeExtension_X0(bundle) ==
+					ADDI_IMM8_OPCODE_X0) &&
+				    (get_SrcA_X0(bundle) == TREG_ZERO) &&
+				    (get_Imm8_X0(bundle) == 0)) {
+					rx = get_Dest_X0(bundle);
+				} else {
+					unexpected = true;
+				}
+			}
+
+			/* rx should be less than 56. */
+			if (!unexpected && (rx >= 56))
+				unexpected = true;
+		}
+
+		if (!search_exception_tables(regs->pc)) {
+			/* No fixup in the exception tables for the pc. */
+			unexpected = true;
+		}
+
+		if (unexpected) {
+			/* Unexpected unalign kernel fault. */
+			struct task_struct *tsk = validate_current();
+
+			bust_spinlocks(1);
+
+			show_regs(regs);
+
+			if (unlikely(tsk->pid < 2)) {
+				panic("Kernel unalign fault running %s!",
+				      tsk->pid ? "init" : "the idle task");
+			}
+#ifdef SUPPORT_DIE
+			die("Oops", regs);
+#endif
+			bust_spinlocks(1);
+
+			do_group_exit(SIGKILL);
+
+		} else {
+			unsigned long i, b = 0;
+			unsigned char *ptr =
+				(unsigned char *)regs->regs[ra];
+			if (load_n_store) {
+				/* handle get_user(x, ptr) */
+				for (i = 0; i < load_store_size; i++) {
+					ret = get_user(b, ptr++);
+					if (!ret) {
+						/* Success! update x. */
+#ifdef __LITTLE_ENDIAN
+						x |= (b << (8 * i));
+#else
+						x <<= 8;
+						x |= b;
+#endif /* __LITTLE_ENDIAN */
+					} else {
+						x = 0;
+						break;
+					}
+				}
+
+				/* Sign-extend 4-byte loads. */
+				if (load_store_size == 4)
+					x = (long)(int)x;
+
+				/* Set register rd. */
+				regs->regs[rd] = x;
+
+				/* Set register rx. */
+				regs->regs[rx] = ret;
+
+				/* Bump pc. */
+				regs->pc += 8;
+
+			} else {
+				/* Handle put_user(x, ptr) */
+				x = regs->regs[rb];
+#ifdef __LITTLE_ENDIAN
+				b = x;
+#else
+				/*
+				 * Swap x in order to store x from low
+				 * to high memory same as the
+				 * little-endian case.
+				 */
+				switch (load_store_size) {
+				case 8:
+					b = swab64(x);
+					break;
+				case 4:
+					b = swab32(x);
+					break;
+				case 2:
+					b = swab16(x);
+					break;
+				}
+#endif /* __LITTLE_ENDIAN */
+				for (i = 0; i < load_store_size; i++) {
+					ret = put_user(b, ptr++);
+					if (ret)
+						break;
+					/* Success! shift 1 byte. */
+					b >>= 8;
+				}
+				/* Set register rx. */
+				regs->regs[rx] = ret;
+
+				/* Bump pc. */
+				regs->pc += 8;
+			}
+		}
+
+		unaligned_fixup_count++;
+
+		if (unaligned_printk) {
+			pr_info("%s/%d. Unalign fixup for kernel access "
+				"to userspace %lx.",
+				current->comm, current->pid, regs->regs[ra]);
+		}
+
+		/* Done! Return to the exception handler. */
+		return;
+	}
+
+	if ((align_ctl == 0) || unexpected) {
+		siginfo_t info = {
+			.si_signo = SIGBUS,
+			.si_code = BUS_ADRALN,
+			.si_addr = (unsigned char __user *)0
+		};
+		if (unaligned_printk)
+			pr_info("Unalign bundle: unexp @%llx, %llx",
+				(unsigned long long)regs->pc,
+				(unsigned long long)bundle);
+
+		if (ra < 56) {
+			unsigned long uaa = (unsigned long)regs->regs[ra];
+			/* Set bus Address. */
+			info.si_addr = (unsigned char __user *)uaa;
+		}
+
+		unaligned_fixup_count++;
+
+		trace_unhandled_signal("unaligned fixup trap", regs,
+				       (unsigned long)info.si_addr, SIGBUS);
+		force_sig_info(info.si_signo, &info, current);
+		return;
+	}
+
+#ifdef __LITTLE_ENDIAN
+#define UA_FIXUP_ADDR_DELTA          1
+#define UA_FIXUP_BFEXT_START(_B_)    0
+#define UA_FIXUP_BFEXT_END(_B_)     (8 * (_B_) - 1)
+#else /* __BIG_ENDIAN */
+#define UA_FIXUP_ADDR_DELTA          -1
+#define UA_FIXUP_BFEXT_START(_B_)   (64 - 8 * (_B_))
+#define UA_FIXUP_BFEXT_END(_B_)      63
+#endif /* __LITTLE_ENDIAN */
+
+
+
+	if ((ra != rb) && (rd != TREG_SP) && !alias &&
+	    !y1_br && !y1_lr && !x1_add) {
+		/*
+		 * Simple case: ra != rb and no register alias found,
+		 * and no branch or link. This will be the majority.
+		 * We can do a little better for simplae case than the
+		 * generic scheme below.
+		 */
+		if (!load_n_store) {
+			/*
+			 * Simple store: ra != rb, no need for scratch register.
+			 * Just store and rotate to right bytewise.
+			 */
+#ifdef __BIG_ENDIAN
+			frag.insn[n++] =
+				jit_x0_addi(ra, ra, load_store_size - 1) |
+				jit_x1_fnop();
+#endif /* __BIG_ENDIAN */
+			for (k = 0; k < load_store_size; k++) {
+				/* Store a byte. */
+				frag.insn[n++] =
+					jit_x0_rotli(rb, rb, 56) |
+					jit_x1_st1_add(ra, rb,
+						       UA_FIXUP_ADDR_DELTA);
+			}
+#ifdef __BIG_ENDIAN
+			frag.insn[n] = jit_x1_addi(ra, ra, 1);
+#else
+			frag.insn[n] = jit_x1_addi(ra, ra,
+						   -1 * load_store_size);
+#endif /* __LITTLE_ENDIAN */
+
+			if (load_store_size == 8) {
+				frag.insn[n] |= jit_x0_fnop();
+			} else if (load_store_size == 4) {
+				frag.insn[n] |= jit_x0_rotli(rb, rb, 32);
+			} else { /* = 2 */
+				frag.insn[n] |= jit_x0_rotli(rb, rb, 16);
+			}
+			n++;
+			if (bundle_2_enable)
+				frag.insn[n++] = bundle_2;
+			frag.insn[n++] = jit_x0_fnop() | jit_x1_iret();
+		} else {
+			if (rd == ra) {
+				/* Use two clobber registers: clob1/2. */
+				frag.insn[n++] =
+					jit_x0_addi(TREG_SP, TREG_SP, -16) |
+					jit_x1_fnop();
+				frag.insn[n++] =
+					jit_x0_addi(clob1, ra, 7) |
+					jit_x1_st_add(TREG_SP, clob1, -8);
+				frag.insn[n++] =
+					jit_x0_addi(clob2, ra, 0) |
+					jit_x1_st(TREG_SP, clob2);
+				frag.insn[n++] =
+					jit_x0_fnop() |
+					jit_x1_ldna(rd, ra);
+				frag.insn[n++] =
+					jit_x0_fnop() |
+					jit_x1_ldna(clob1, clob1);
+				/*
+				 * Note: we must make sure that rd must not
+				 * be sp. Recover clob1/2 from stack.
+				 */
+				frag.insn[n++] =
+					jit_x0_dblalign(rd, clob1, clob2) |
+					jit_x1_ld_add(clob2, TREG_SP, 8);
+				frag.insn[n++] =
+					jit_x0_fnop() |
+					jit_x1_ld_add(clob1, TREG_SP, 16);
+			} else {
+				/* Use one clobber register: clob1 only. */
+				frag.insn[n++] =
+					jit_x0_addi(TREG_SP, TREG_SP, -16) |
+					jit_x1_fnop();
+				frag.insn[n++] =
+					jit_x0_addi(clob1, ra, 7) |
+					jit_x1_st(TREG_SP, clob1);
+				frag.insn[n++] =
+					jit_x0_fnop() |
+					jit_x1_ldna(rd, ra);
+				frag.insn[n++] =
+					jit_x0_fnop() |
+					jit_x1_ldna(clob1, clob1);
+				/*
+				 * Note: we must make sure that rd must not
+				 * be sp. Recover clob1 from stack.
+				 */
+				frag.insn[n++] =
+					jit_x0_dblalign(rd, clob1, ra) |
+					jit_x1_ld_add(clob1, TREG_SP, 16);
+			}
+
+			if (bundle_2_enable)
+				frag.insn[n++] = bundle_2;
+			/*
+			 * For non 8-byte load, extract corresponding bytes and
+			 * signed extension.
+			 */
+			if (load_store_size == 4) {
+				if (load_store_signed)
+					frag.insn[n++] =
+						jit_x0_bfexts(
+							rd, rd,
+							UA_FIXUP_BFEXT_START(4),
+							UA_FIXUP_BFEXT_END(4)) |
+						jit_x1_fnop();
+				else
+					frag.insn[n++] =
+						jit_x0_bfextu(
+							rd, rd,
+							UA_FIXUP_BFEXT_START(4),
+							UA_FIXUP_BFEXT_END(4)) |
+						jit_x1_fnop();
+			} else if (load_store_size == 2) {
+				if (load_store_signed)
+					frag.insn[n++] =
+						jit_x0_bfexts(
+							rd, rd,
+							UA_FIXUP_BFEXT_START(2),
+							UA_FIXUP_BFEXT_END(2)) |
+						jit_x1_fnop();
+				else
+					frag.insn[n++] =
+						jit_x0_bfextu(
+							rd, rd,
+							UA_FIXUP_BFEXT_START(2),
+							UA_FIXUP_BFEXT_END(2)) |
+						jit_x1_fnop();
+			}
+
+			frag.insn[n++] =
+				jit_x0_fnop()  |
+				jit_x1_iret();
+		}
+	} else if (!load_n_store) {
+
+		/*
+		 * Generic memory store cases: use 3 clobber registers.
+		 *
+		 * Alloc space for saveing clob2,1,3 on user's stack.
+		 * register clob3 points to where clob2 saved, followed by
+		 * clob1 and 3 from high to low memory.
+		 */
+		frag.insn[n++] =
+			jit_x0_addi(TREG_SP, TREG_SP, -32)    |
+			jit_x1_fnop();
+		frag.insn[n++] =
+			jit_x0_addi(clob3, TREG_SP, 16)  |
+			jit_x1_st_add(TREG_SP, clob3, 8);
+#ifdef __LITTLE_ENDIAN
+		frag.insn[n++] =
+			jit_x0_addi(clob1, ra, 0)   |
+			jit_x1_st_add(TREG_SP, clob1, 8);
+#else
+		frag.insn[n++] =
+			jit_x0_addi(clob1, ra, load_store_size - 1)   |
+			jit_x1_st_add(TREG_SP, clob1, 8);
+#endif
+		if (load_store_size == 8) {
+			/*
+			 * We save one byte a time, not for fast, but compact
+			 * code. After each store, data source register shift
+			 * right one byte. unchanged after 8 stores.
+			 */
+			frag.insn[n++] =
+				jit_x0_addi(clob2, TREG_ZERO, 7)     |
+				jit_x1_st_add(TREG_SP, clob2, 16);
+			frag.insn[n++] =
+				jit_x0_rotli(rb, rb, 56)      |
+				jit_x1_st1_add(clob1, rb, UA_FIXUP_ADDR_DELTA);
+			frag.insn[n++] =
+				jit_x0_addi(clob2, clob2, -1) |
+				jit_x1_bnezt(clob2, -1);
+			frag.insn[n++] =
+				jit_x0_fnop()                 |
+				jit_x1_addi(clob2, y1_br_reg, 0);
+		} else if (load_store_size == 4) {
+			frag.insn[n++] =
+				jit_x0_addi(clob2, TREG_ZERO, 3)     |
+				jit_x1_st_add(TREG_SP, clob2, 16);
+			frag.insn[n++] =
+				jit_x0_rotli(rb, rb, 56)      |
+				jit_x1_st1_add(clob1, rb, UA_FIXUP_ADDR_DELTA);
+			frag.insn[n++] =
+				jit_x0_addi(clob2, clob2, -1) |
+				jit_x1_bnezt(clob2, -1);
+			/*
+			 * same as 8-byte case, but need shift another 4
+			 * byte to recover rb for 4-byte store.
+			 */
+			frag.insn[n++] = jit_x0_rotli(rb, rb, 32)      |
+				jit_x1_addi(clob2, y1_br_reg, 0);
+		} else { /* =2 */
+			frag.insn[n++] =
+				jit_x0_addi(clob2, rb, 0)     |
+				jit_x1_st_add(TREG_SP, clob2, 16);
+			for (k = 0; k < 2; k++) {
+				frag.insn[n++] =
+					jit_x0_shrui(rb, rb, 8)  |
+					jit_x1_st1_add(clob1, rb,
+						       UA_FIXUP_ADDR_DELTA);
+			}
+			frag.insn[n++] =
+				jit_x0_addi(rb, clob2, 0)       |
+				jit_x1_addi(clob2, y1_br_reg, 0);
+		}
+
+		if (bundle_2_enable)
+			frag.insn[n++] = bundle_2;
+
+		if (y1_lr) {
+			frag.insn[n++] =
+				jit_x0_fnop()                    |
+				jit_x1_mfspr(y1_lr_reg,
+					     SPR_EX_CONTEXT_0_0);
+		}
+		if (y1_br) {
+			frag.insn[n++] =
+				jit_x0_fnop()                    |
+				jit_x1_mtspr(SPR_EX_CONTEXT_0_0,
+					     clob2);
+		}
+		if (x1_add) {
+			frag.insn[n++] =
+				jit_x0_addi(ra, ra, x1_add_imm8) |
+				jit_x1_ld_add(clob2, clob3, -8);
+		} else {
+			frag.insn[n++] =
+				jit_x0_fnop()                    |
+				jit_x1_ld_add(clob2, clob3, -8);
+		}
+		frag.insn[n++] =
+			jit_x0_fnop()   |
+			jit_x1_ld_add(clob1, clob3, -8);
+		frag.insn[n++] = jit_x0_fnop()   | jit_x1_ld(clob3, clob3);
+		frag.insn[n++] = jit_x0_fnop()   | jit_x1_iret();
+
+	} else {
+		/*
+		 * Generic memory load cases.
+		 *
+		 * Alloc space for saveing clob1,2,3 on user's stack.
+		 * register clob3 points to where clob1 saved, followed
+		 * by clob2 and 3 from high to low memory.
+		 */
+
+		frag.insn[n++] =
+			jit_x0_addi(TREG_SP, TREG_SP, -32) |
+			jit_x1_fnop();
+		frag.insn[n++] =
+			jit_x0_addi(clob3, TREG_SP, 16) |
+			jit_x1_st_add(TREG_SP, clob3, 8);
+		frag.insn[n++] =
+			jit_x0_addi(clob2, ra, 0) |
+			jit_x1_st_add(TREG_SP, clob2, 8);
+
+		if (y1_br) {
+			frag.insn[n++] =
+				jit_x0_addi(clob1, y1_br_reg, 0) |
+				jit_x1_st_add(TREG_SP, clob1, 16);
+		} else {
+			frag.insn[n++] =
+				jit_x0_fnop() |
+				jit_x1_st_add(TREG_SP, clob1, 16);
+		}
+
+		if (bundle_2_enable)
+			frag.insn[n++] = bundle_2;
+
+		if (y1_lr) {
+			frag.insn[n++] =
+				jit_x0_fnop()  |
+				jit_x1_mfspr(y1_lr_reg,
+					     SPR_EX_CONTEXT_0_0);
+		}
+
+		if (y1_br) {
+			frag.insn[n++] =
+				jit_x0_fnop() |
+				jit_x1_mtspr(SPR_EX_CONTEXT_0_0,
+					     clob1);
+		}
+
+		frag.insn[n++] =
+			jit_x0_addi(clob1, clob2, 7)      |
+			jit_x1_ldna(rd, clob2);
+		frag.insn[n++] =
+			jit_x0_fnop()                     |
+			jit_x1_ldna(clob1, clob1);
+		frag.insn[n++] =
+			jit_x0_dblalign(rd, clob1, clob2) |
+			jit_x1_ld_add(clob1, clob3, -8);
+		if (x1_add) {
+			frag.insn[n++] =
+				jit_x0_addi(ra, ra, x1_add_imm8) |
+				jit_x1_ld_add(clob2, clob3, -8);
+		} else {
+			frag.insn[n++] =
+				jit_x0_fnop()  |
+				jit_x1_ld_add(clob2, clob3, -8);
+		}
+
+		frag.insn[n++] =
+			jit_x0_fnop() |
+			jit_x1_ld(clob3, clob3);
+
+		if (load_store_size == 4) {
+			if (load_store_signed)
+				frag.insn[n++] =
+					jit_x0_bfexts(
+						rd, rd,
+						UA_FIXUP_BFEXT_START(4),
+						UA_FIXUP_BFEXT_END(4)) |
+					jit_x1_fnop();
+			else
+				frag.insn[n++] =
+					jit_x0_bfextu(
+						rd, rd,
+						UA_FIXUP_BFEXT_START(4),
+						UA_FIXUP_BFEXT_END(4)) |
+					jit_x1_fnop();
+		} else if (load_store_size == 2) {
+			if (load_store_signed)
+				frag.insn[n++] =
+					jit_x0_bfexts(
+						rd, rd,
+						UA_FIXUP_BFEXT_START(2),
+						UA_FIXUP_BFEXT_END(2)) |
+					jit_x1_fnop();
+			else
+				frag.insn[n++] =
+					jit_x0_bfextu(
+						rd, rd,
+						UA_FIXUP_BFEXT_START(2),
+						UA_FIXUP_BFEXT_END(2)) |
+					jit_x1_fnop();
+		}
+
+		frag.insn[n++] = jit_x0_fnop() | jit_x1_iret();
+	}
+
+	/* Max JIT bundle count is 14. */
+	WARN_ON(n > 14);
+
+	if (!unexpected) {
+		int status = 0;
+		int idx = (regs->pc >> 3) &
+			((1ULL << (PAGE_SHIFT - UNALIGN_JIT_SHIFT)) - 1);
+
+		frag.pc = regs->pc;
+		frag.bundle = bundle;
+
+		if (unaligned_printk) {
+			pr_info("%s/%d, Unalign fixup: pc=%lx "
+				"bundle=%lx %d %d %d %d %d %d %d %d.",
+				current->comm, current->pid,
+				(unsigned long)frag.pc,
+				(unsigned long)frag.bundle,
+				(int)alias, (int)rd, (int)ra,
+				(int)rb, (int)bundle_2_enable,
+				(int)y1_lr, (int)y1_br, (int)x1_add);
+
+			for (k = 0; k < n; k += 2)
+				pr_info("[%d] %016llx %016llx", k,
+					(unsigned long long)frag.insn[k],
+					(unsigned long long)frag.insn[k+1]);
+		}
+
+		/* Swap bundle byte order for big endian sys. */
+#ifdef __BIG_ENDIAN
+		frag.bundle = GX_INSN_BSWAP(frag.bundle);
+		for (k = 0; k < n; k++)
+			frag.insn[k] = GX_INSN_BSWAP(frag.insn[k]);
+#endif /* __BIG_ENDIAN */
+
+		status = copy_to_user((void __user *)&jit_code_area[idx],
+				      &frag, sizeof(frag));
+		if (status) {
+			/* Fail to copy JIT into user land. send SIGSEGV. */
+			siginfo_t info = {
+				.si_signo = SIGSEGV,
+				.si_code = SEGV_MAPERR,
+				.si_addr = (void __user *)&jit_code_area[idx]
+			};
+
+			pr_warn("Unalign fixup: pid=%d %s jit_code_area=%llx",
+				current->pid, current->comm,
+				(unsigned long long)&jit_code_area[idx]);
+
+			trace_unhandled_signal("segfault in unalign fixup",
+					       regs,
+					       (unsigned long)info.si_addr,
+					       SIGSEGV);
+			force_sig_info(info.si_signo, &info, current);
+			return;
+		}
+
+
+		/* Do a cheaper increment, not accurate. */
+		unaligned_fixup_count++;
+		__flush_icache_range((unsigned long)&jit_code_area[idx],
+				     (unsigned long)&jit_code_area[idx] +
+				     sizeof(frag));
+
+		/* Setup SPR_EX_CONTEXT_0_0/1 for returning to user program.*/
+		__insn_mtspr(SPR_EX_CONTEXT_0_0, regs->pc + 8);
+		__insn_mtspr(SPR_EX_CONTEXT_0_1, PL_ICS_EX1(USER_PL, 0));
+
+		/* Modify pc at the start of new JIT. */
+		regs->pc = (unsigned long)&jit_code_area[idx].insn[0];
+		/* Set ICS in SPR_EX_CONTEXT_K_1. */
+		regs->ex1 = PL_ICS_EX1(USER_PL, 1);
+	}
+}
+
+
+/*
+ * C function to generate unalign data JIT. Called from unalign data
+ * interrupt handler.
+ *
+ * First check if unalign fix is disabled or exception did not not come from
+ * user space or sp register points to unalign address, if true, generate a
+ * SIGBUS. Then map a page into user space as JIT area if it is not mapped
+ * yet. Genenerate JIT code by calling jit_bundle_gen(). After that return
+ * back to exception handler.
+ *
+ * The exception handler will "iret" to new generated JIT code after
+ * restoring caller saved registers. In theory, the JIT code will perform
+ * another "iret" to resume user's program.
+ */
+
+void do_unaligned(struct pt_regs *regs, int vecnum)
+{
+	tilegx_bundle_bits __user  *pc;
+	tilegx_bundle_bits bundle;
+	struct thread_info *info = current_thread_info();
+	int align_ctl;
+
+	/* Checks the per-process unaligned JIT flags */
+	align_ctl = unaligned_fixup;
+	switch (task_thread_info(current)->align_ctl) {
+	case PR_UNALIGN_NOPRINT:
+		align_ctl = 1;
+		break;
+	case PR_UNALIGN_SIGBUS:
+		align_ctl = 0;
+		break;
+	}
+
+	/* Enable iterrupt in order to access user land. */
+	local_irq_enable();
+
+	/*
+	 * The fault came from kernel space. Two choices:
+	 * (a) unaligned_fixup < 1, we will first call get/put_user fixup
+	 *     to return -EFAULT. If no fixup, simply panic the kernel.
+	 * (b) unaligned_fixup >=1, we will try to fix the unaligned access
+	 *     if it was triggered by get_user/put_user() macros. Panic the
+	 *     kernel if it is not fixable.
+	 */
+
+	if (EX1_PL(regs->ex1) != USER_PL) {
+
+		if (align_ctl < 1) {
+			unaligned_fixup_count++;
+			/* If exception came from kernel, try fix it up. */
+			if (fixup_exception(regs)) {
+				if (unaligned_printk)
+					pr_info("Unalign fixup: %d %llx @%llx",
+						(int)unaligned_fixup,
+						(unsigned long long)regs->ex1,
+						(unsigned long long)regs->pc);
+				return;
+			}
+			/* Not fixable. Go panic. */
+			panic("Unalign exception in Kernel. pc=%lx",
+			      regs->pc);
+			return;
+		} else {
+			/*
+			 * Try to fix the exception. If we can't, panic the
+			 * kernel.
+			 */
+			bundle = GX_INSN_BSWAP(
+				*((tilegx_bundle_bits *)(regs->pc)));
+			jit_bundle_gen(regs, bundle, align_ctl);
+			return;
+		}
+	}
+
+	/*
+	 * Fault came from user with ICS or stack is not aligned.
+	 * If so, we will trigger SIGBUS.
+	 */
+	if ((regs->sp & 0x7) || (regs->ex1) || (align_ctl < 0)) {
+		siginfo_t info = {
+			.si_signo = SIGBUS,
+			.si_code = BUS_ADRALN,
+			.si_addr = (unsigned char __user *)0
+		};
+
+		if (unaligned_printk)
+			pr_info("Unalign fixup: %d %llx @%llx",
+				(int)unaligned_fixup,
+				(unsigned long long)regs->ex1,
+				(unsigned long long)regs->pc);
+
+		unaligned_fixup_count++;
+
+		trace_unhandled_signal("unaligned fixup trap", regs, 0, SIGBUS);
+		force_sig_info(info.si_signo, &info, current);
+		return;
+	}
+
+
+	/* Read the bundle casued the exception! */
+	pc = (tilegx_bundle_bits __user *)(regs->pc);
+	if (get_user(bundle, pc) != 0) {
+		/* Probably never be here since pc is valid user address.*/
+		siginfo_t info = {
+			.si_signo = SIGSEGV,
+			.si_code = SEGV_MAPERR,
+			.si_addr = (void __user *)pc
+		};
+		pr_err("Couldn't read instruction at %p trying to step\n", pc);
+		trace_unhandled_signal("segfault in unalign fixup", regs,
+				       (unsigned long)info.si_addr, SIGSEGV);
+		force_sig_info(info.si_signo, &info, current);
+		return;
+	}
+
+	if (!info->unalign_jit_base) {
+		void __user *user_page;
+
+		/*
+		 * Allocate a page in userland.
+		 * For 64-bit processes we try to place the mapping far
+		 * from anything else that might be going on (specifically
+		 * 64 GB below the top of the user address space).  If it
+		 * happens not to be possible to put it there, it's OK;
+		 * the kernel will choose another location and we'll
+		 * remember it for later.
+		 */
+		if (is_compat_task())
+			user_page = NULL;
+		else
+			user_page = (void __user *)(TASK_SIZE - (1UL << 36)) +
+				(current->pid << PAGE_SHIFT);
+
+		user_page = (void __user *) vm_mmap(NULL,
+						    (unsigned long)user_page,
+						    PAGE_SIZE,
+						    PROT_EXEC | PROT_READ |
+						    PROT_WRITE,
+#ifdef CONFIG_HOMECACHE
+						    MAP_CACHE_HOME_TASK |
+#endif
+						    MAP_PRIVATE |
+						    MAP_ANONYMOUS,
+						    0);
+
+		if (IS_ERR((void __force *)user_page)) {
+			pr_err("Out of kernel pages trying do_mmap.\n");
+			return;
+		}
+
+		/* Save the address in the thread_info struct */
+		info->unalign_jit_base = user_page;
+		if (unaligned_printk)
+			pr_info("Unalign bundle: %d:%d, allocate page @%llx",
+				raw_smp_processor_id(), current->pid,
+				(unsigned long long)user_page);
+	}
+
+	/* Generate unalign JIT */
+	jit_bundle_gen(regs, GX_INSN_BSWAP(bundle), align_ctl);
+}
+
+#endif /* __tilegx__ */
diff --git a/arch/tile/kernel/vdso.c b/arch/tile/kernel/vdso.c
new file mode 100644
index 000000000000..1533af24106e
--- /dev/null
+++ b/arch/tile/kernel/vdso.c
@@ -0,0 +1,212 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/binfmts.h>
+#include <linux/compat.h>
+#include <linux/elf.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+
+#include <asm/vdso.h>
+#include <asm/mman.h>
+#include <asm/sections.h>
+
+#include <arch/sim.h>
+
+/* The alignment of the vDSO. */
+#define VDSO_ALIGNMENT  PAGE_SIZE
+
+
+static unsigned int vdso_pages;
+static struct page **vdso_pagelist;
+
+#ifdef CONFIG_COMPAT
+static unsigned int vdso32_pages;
+static struct page **vdso32_pagelist;
+#endif
+static int vdso_ready;
+
+/*
+ * The vdso data page.
+ */
+static union {
+	struct vdso_data	data;
+	u8			page[PAGE_SIZE];
+} vdso_data_store __page_aligned_data;
+
+struct vdso_data *vdso_data = &vdso_data_store.data;
+
+static unsigned int __read_mostly vdso_enabled = 1;
+
+static struct page **vdso_setup(void *vdso_kbase, unsigned int pages)
+{
+	int i;
+	struct page **pagelist;
+
+	pagelist = kzalloc(sizeof(struct page *) * (pages + 1), GFP_KERNEL);
+	BUG_ON(pagelist == NULL);
+	for (i = 0; i < pages - 1; i++) {
+		struct page *pg = virt_to_page(vdso_kbase + i*PAGE_SIZE);
+		ClearPageReserved(pg);
+		pagelist[i] = pg;
+	}
+	pagelist[pages - 1] = virt_to_page(vdso_data);
+	pagelist[pages] = NULL;
+
+	return pagelist;
+}
+
+static int __init vdso_init(void)
+{
+	int data_pages = sizeof(vdso_data_store) >> PAGE_SHIFT;
+
+	/*
+	 * We can disable vDSO support generally, but we need to retain
+	 * one page to support the two-bundle (16-byte) rt_sigreturn path.
+	 */
+	if (!vdso_enabled) {
+		size_t offset = (unsigned long)&__vdso_rt_sigreturn;
+		static struct page *sigret_page;
+		sigret_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+		BUG_ON(sigret_page == NULL);
+		vdso_pagelist = &sigret_page;
+		vdso_pages = 1;
+		BUG_ON(offset >= PAGE_SIZE);
+		memcpy(page_address(sigret_page) + offset,
+		       vdso_start + offset, 16);
+#ifdef CONFIG_COMPAT
+		vdso32_pages = vdso_pages;
+		vdso32_pagelist = vdso_pagelist;
+#endif
+		vdso_ready = 1;
+		return 0;
+	}
+
+	vdso_pages = (vdso_end - vdso_start) >> PAGE_SHIFT;
+	vdso_pages += data_pages;
+	vdso_pagelist = vdso_setup(vdso_start, vdso_pages);
+
+#ifdef CONFIG_COMPAT
+	vdso32_pages = (vdso32_end - vdso32_start) >> PAGE_SHIFT;
+	vdso32_pages += data_pages;
+	vdso32_pagelist = vdso_setup(vdso32_start, vdso32_pages);
+#endif
+
+	smp_wmb();
+	vdso_ready = 1;
+
+	return 0;
+}
+arch_initcall(vdso_init);
+
+const char *arch_vma_name(struct vm_area_struct *vma)
+{
+	if (vma->vm_mm && vma->vm_start == VDSO_BASE)
+		return "[vdso]";
+#ifndef __tilegx__
+	if (vma->vm_start == MEM_USER_INTRPT)
+		return "[intrpt]";
+#endif
+	return NULL;
+}
+
+struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
+{
+	return NULL;
+}
+
+int in_gate_area(struct mm_struct *mm, unsigned long address)
+{
+	return 0;
+}
+
+int in_gate_area_no_mm(unsigned long address)
+{
+	return 0;
+}
+
+int setup_vdso_pages(void)
+{
+	struct page **pagelist;
+	unsigned long pages;
+	struct mm_struct *mm = current->mm;
+	unsigned long vdso_base = 0;
+	int retval = 0;
+
+	if (!vdso_ready)
+		return 0;
+
+	mm->context.vdso_base = 0;
+
+	pagelist = vdso_pagelist;
+	pages = vdso_pages;
+#ifdef CONFIG_COMPAT
+	if (is_compat_task()) {
+		pagelist = vdso32_pagelist;
+		pages = vdso32_pages;
+	}
+#endif
+
+	/*
+	 * vDSO has a problem and was disabled, just don't "enable" it for the
+	 * process.
+	 */
+	if (pages == 0)
+		return 0;
+
+	vdso_base = get_unmapped_area(NULL, vdso_base,
+				      (pages << PAGE_SHIFT) +
+				      ((VDSO_ALIGNMENT - 1) & PAGE_MASK),
+				      0, 0);
+	if (IS_ERR_VALUE(vdso_base)) {
+		retval = vdso_base;
+		return retval;
+	}
+
+	/* Add required alignment. */
+	vdso_base = ALIGN(vdso_base, VDSO_ALIGNMENT);
+
+	/*
+	 * Put vDSO base into mm struct. We need to do this before calling
+	 * install_special_mapping or the perf counter mmap tracking code
+	 * will fail to recognise it as a vDSO (since arch_vma_name fails).
+	 */
+	mm->context.vdso_base = vdso_base;
+
+	/*
+	 * our vma flags don't have VM_WRITE so by default, the process isn't
+	 * allowed to write those pages.
+	 * gdb can break that with ptrace interface, and thus trigger COW on
+	 * those pages but it's then your responsibility to never do that on
+	 * the "data" page of the vDSO or you'll stop getting kernel updates
+	 * and your nice userland gettimeofday will be totally dead.
+	 * It's fine to use that for setting breakpoints in the vDSO code
+	 * pages though
+	 */
+	retval = install_special_mapping(mm, vdso_base,
+					 pages << PAGE_SHIFT,
+					 VM_READ|VM_EXEC |
+					 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC,
+					 pagelist);
+	if (retval)
+		mm->context.vdso_base = 0;
+
+	return retval;
+}
+
+static __init int vdso_func(char *s)
+{
+	return kstrtouint(s, 0, &vdso_enabled);
+}
+__setup("vdso=", vdso_func);
diff --git a/arch/tile/kernel/vdso/Makefile b/arch/tile/kernel/vdso/Makefile
new file mode 100644
index 000000000000..e2b7a2f4ee41
--- /dev/null
+++ b/arch/tile/kernel/vdso/Makefile
@@ -0,0 +1,118 @@
+# Symbols present in the vdso
+vdso-syms = rt_sigreturn gettimeofday
+
+# Files to link into the vdso
+obj-vdso = $(patsubst %, v%.o, $(vdso-syms))
+
+# Build rules
+targets := $(obj-vdso) vdso.so vdso.so.dbg vdso.lds
+obj-vdso := $(addprefix $(obj)/, $(obj-vdso))
+
+# vdso32 is only for tilegx -m32 compat task.
+VDSO32-$(CONFIG_COMPAT) := y
+
+obj-y += vdso.o
+obj-$(VDSO32-y) += vdso32.o
+extra-y += vdso.lds
+CPPFLAGS_vdso.lds += -P -C -U$(ARCH)
+
+# vDSO code runs in userspace and -pg doesn't help with profiling anyway.
+CFLAGS_REMOVE_vdso.o = -pg
+CFLAGS_REMOVE_vdso32.o = -pg
+CFLAGS_REMOVE_vrt_sigreturn.o = -pg
+CFLAGS_REMOVE_vrt_sigreturn32.o = -pg
+CFLAGS_REMOVE_vgettimeofday.o = -pg
+CFLAGS_REMOVE_vgettimeofday32.o = -pg
+
+ifdef CONFIG_FEEDBACK_COLLECT
+# vDSO code runs in userspace, not collecting feedback data.
+CFLAGS_REMOVE_vdso.o = -ffeedback-generate
+CFLAGS_REMOVE_vdso32.o = -ffeedback-generate
+CFLAGS_REMOVE_vrt_sigreturn.o = -ffeedback-generate
+CFLAGS_REMOVE_vrt_sigreturn32.o = -ffeedback-generate
+CFLAGS_REMOVE_vgettimeofday.o = -ffeedback-generate
+CFLAGS_REMOVE_vgettimeofday32.o = -ffeedback-generate
+endif
+
+# Disable gcov profiling for VDSO code
+GCOV_PROFILE := n
+
+# Force dependency
+$(obj)/vdso.o: $(obj)/vdso.so
+
+# link rule for the .so file, .lds has to be first
+SYSCFLAGS_vdso.so.dbg = $(c_flags)
+$(obj)/vdso.so.dbg: $(src)/vdso.lds $(obj-vdso)
+	$(call if_changed,vdsold)
+
+
+# We also create a special relocatable object that should mirror the symbol
+# table and layout of the linked DSO.  With ld -R we can then refer to
+# these symbols in the kernel code rather than hand-coded addresses.
+extra-y += vdso-syms.o
+$(obj)/built-in.o: $(obj)/vdso-syms.o
+$(obj)/built-in.o: ld_flags += -R $(obj)/vdso-syms.o
+
+SYSCFLAGS_vdso.so.dbg = -shared -s -Wl,-soname=linux-vdso.so.1 \
+                            $(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
+SYSCFLAGS_vdso_syms.o = -r
+$(obj)/vdso-syms.o: $(src)/vdso.lds $(obj)/vrt_sigreturn.o FORCE
+	$(call if_changed,vdsold)
+
+
+# strip rule for the .so file
+$(obj)/%.so: OBJCOPYFLAGS := -S
+$(obj)/%.so: $(obj)/%.so.dbg FORCE
+	$(call if_changed,objcopy)
+
+# actual build commands
+# The DSO images are built using a special linker script
+# Add -lgcc so tilepro gets static muldi3 and lshrdi3 definitions.
+# Make sure only to export the intended __vdso_xxx symbol offsets.
+quiet_cmd_vdsold = VDSOLD  $@
+      cmd_vdsold = $(CC) $(KCFLAGS) -nostdlib $(SYSCFLAGS_$(@F)) \
+                           -Wl,-T,$(filter-out FORCE,$^) -o $@.tmp -lgcc && \
+                   $(CROSS_COMPILE)objcopy \
+                           $(patsubst %, -G __vdso_%, $(vdso-syms)) $@.tmp $@
+
+# install commands for the unstripped file
+quiet_cmd_vdso_install = INSTALL $@
+      cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@
+
+vdso.so: $(obj)/vdso.so.dbg
+	@mkdir -p $(MODLIB)/vdso
+	$(call cmd,vdso_install)
+
+vdso32.so: $(obj)/vdso32.so.dbg
+	$(call cmd,vdso_install)
+
+vdso_install: vdso.so
+vdso32_install: vdso32.so
+
+
+KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS))
+KBUILD_AFLAGS_32 += -m32 -s
+KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS))
+KBUILD_CFLAGS_32 += -m32 -fPIC -shared
+
+obj-vdso32 = $(patsubst %, v%32.o, $(vdso-syms))
+obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32))
+
+targets += $(obj-vdso32) vdso32.so vdso32.so.dbg
+
+$(obj-vdso32:%=%): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32)
+$(obj-vdso32:%=%): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32)
+
+$(obj)/vgettimeofday32.o: $(obj)/vgettimeofday.c
+	$(call if_changed,cc_o_c)
+
+$(obj)/vrt_sigreturn32.o: $(obj)/vrt_sigreturn.S
+	$(call if_changed,as_o_S)
+
+# Force dependency
+$(obj)/vdso32.o: $(obj)/vdso32.so
+
+SYSCFLAGS_vdso32.so.dbg = -m32 -shared -s -Wl,-soname=linux-vdso32.so.1 \
+			    $(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
+$(obj)/vdso32.so.dbg: $(src)/vdso.lds $(obj-vdso32)
+	$(call if_changed,vdsold)
diff --git a/arch/tile/kernel/vdso/vdso.S b/arch/tile/kernel/vdso/vdso.S
new file mode 100644
index 000000000000..3467adb41630
--- /dev/null
+++ b/arch/tile/kernel/vdso/vdso.S
@@ -0,0 +1,28 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <asm/page.h>
+
+	__PAGE_ALIGNED_DATA
+
+	.global vdso_start, vdso_end
+	.align PAGE_SIZE
+vdso_start:
+	.incbin "arch/tile/kernel/vdso/vdso.so"
+	.align PAGE_SIZE
+vdso_end:
+
+	.previous
diff --git a/arch/tile/kernel/vdso/vdso.lds.S b/arch/tile/kernel/vdso/vdso.lds.S
new file mode 100644
index 000000000000..041cd6c39c83
--- /dev/null
+++ b/arch/tile/kernel/vdso/vdso.lds.S
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#define VDSO_VERSION_STRING	LINUX_2.6
+
+
+OUTPUT_ARCH(tile)
+
+/* The ELF entry point can be used to set the AT_SYSINFO value. */
+ENTRY(__vdso_rt_sigreturn);
+
+
+SECTIONS
+{
+	. = SIZEOF_HEADERS;
+
+	.hash		: { *(.hash) }			:text
+	.gnu.hash	: { *(.gnu.hash) }
+	.dynsym		: { *(.dynsym) }
+	.dynstr		: { *(.dynstr) }
+	.gnu.version	: { *(.gnu.version) }
+	.gnu.version_d	: { *(.gnu.version_d) }
+	.gnu.version_r	: { *(.gnu.version_r) }
+
+	.note		: { *(.note.*) }		:text	:note
+	.dynamic	: { *(.dynamic) }		:text	:dynamic
+
+	.eh_frame_hdr	: { *(.eh_frame_hdr) }		:text	:eh_frame_hdr
+	.eh_frame	: { KEEP (*(.eh_frame)) }	:text
+
+	.rodata	 : { *(.rodata .rodata.* .gnu.linkonce.r.*) }
+
+	/*
+	 * This linker script is used both with -r and with -shared.
+	 * For the layouts to match, we need to skip more than enough
+	 * space for the dynamic symbol table et al. If this amount
+	 * is insufficient, ld -shared will barf. Just increase it here.
+	 */
+	. = 0x1000;
+	.text		: { *(.text .text.*) }		:text
+
+	.data		: {
+		*(.got.plt) *(.got)
+		*(.data .data.* .gnu.linkonce.d.*)
+		*(.dynbss)
+		*(.bss .bss.* .gnu.linkonce.b.*)
+	}
+}
+
+
+/*
+ * We must supply the ELF program headers explicitly to get just one
+ * PT_LOAD segment, and set the flags explicitly to make segments read-only.
+ */
+PHDRS
+{
+	text		PT_LOAD		FLAGS(5) FILEHDR PHDRS;	/* PF_R|PF_X */
+	dynamic		PT_DYNAMIC	FLAGS(4);		/* PF_R */
+	note		PT_NOTE		FLAGS(4);		/* PF_R */
+	eh_frame_hdr	PT_GNU_EH_FRAME;
+}
+
+
+/*
+ * This controls what userland symbols we export from the vDSO.
+ */
+VERSION
+{
+	VDSO_VERSION_STRING {
+	global:
+		__vdso_rt_sigreturn;
+		__vdso_gettimeofday;
+		gettimeofday;
+	local:*;
+	};
+}
diff --git a/arch/tile/kernel/vdso/vdso32.S b/arch/tile/kernel/vdso/vdso32.S
new file mode 100644
index 000000000000..1d1ac3257e11
--- /dev/null
+++ b/arch/tile/kernel/vdso/vdso32.S
@@ -0,0 +1,28 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <asm/page.h>
+
+	__PAGE_ALIGNED_DATA
+
+	.global vdso32_start, vdso32_end
+	.align PAGE_SIZE
+vdso32_start:
+	.incbin "arch/tile/kernel/vdso/vdso32.so"
+	.align PAGE_SIZE
+vdso32_end:
+
+	.previous
diff --git a/arch/tile/kernel/vdso/vgettimeofday.c b/arch/tile/kernel/vdso/vgettimeofday.c
new file mode 100644
index 000000000000..51ec8e46f5f9
--- /dev/null
+++ b/arch/tile/kernel/vdso/vgettimeofday.c
@@ -0,0 +1,107 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#define VDSO_BUILD  /* avoid some shift warnings for -m32 in <asm/page.h> */
+#include <linux/time.h>
+#include <asm/timex.h>
+#include <asm/vdso.h>
+
+#if CHIP_HAS_SPLIT_CYCLE()
+static inline cycles_t get_cycles_inline(void)
+{
+	unsigned int high = __insn_mfspr(SPR_CYCLE_HIGH);
+	unsigned int low = __insn_mfspr(SPR_CYCLE_LOW);
+	unsigned int high2 = __insn_mfspr(SPR_CYCLE_HIGH);
+
+	while (unlikely(high != high2)) {
+		low = __insn_mfspr(SPR_CYCLE_LOW);
+		high = high2;
+		high2 = __insn_mfspr(SPR_CYCLE_HIGH);
+	}
+
+	return (((cycles_t)high) << 32) | low;
+}
+#define get_cycles get_cycles_inline
+#endif
+
+/*
+ * Find out the vDSO data page address in the process address space.
+ */
+inline unsigned long get_datapage(void)
+{
+	unsigned long ret;
+
+	/* vdso data page located in the 2nd vDSO page. */
+	asm volatile ("lnk %0" : "=r"(ret));
+	ret &= ~(PAGE_SIZE - 1);
+	ret += PAGE_SIZE;
+
+	return ret;
+}
+
+int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
+{
+	cycles_t cycles;
+	unsigned long count, sec, ns;
+	volatile struct vdso_data *vdso_data;
+
+	vdso_data = (struct vdso_data *)get_datapage();
+	/* The use of the timezone is obsolete, normally tz is NULL. */
+	if (unlikely(tz != NULL)) {
+		while (1) {
+			/* Spin until the update finish. */
+			count = vdso_data->tz_update_count;
+			if (count & 1)
+				continue;
+
+			tz->tz_minuteswest = vdso_data->tz_minuteswest;
+			tz->tz_dsttime = vdso_data->tz_dsttime;
+
+			/* Check whether updated, read again if so. */
+			if (count == vdso_data->tz_update_count)
+				break;
+		}
+	}
+
+	if (unlikely(tv == NULL))
+		return 0;
+
+	while (1) {
+		/* Spin until the update finish. */
+		count = vdso_data->tb_update_count;
+		if (count & 1)
+			continue;
+
+		cycles = (get_cycles() - vdso_data->xtime_tod_stamp);
+		ns = (cycles * vdso_data->mult) >> vdso_data->shift;
+		sec = vdso_data->xtime_clock_sec;
+		ns += vdso_data->xtime_clock_nsec;
+		if (ns >= NSEC_PER_SEC) {
+			ns -= NSEC_PER_SEC;
+			sec += 1;
+		}
+
+		/* Check whether updated, read again if so. */
+		if (count == vdso_data->tb_update_count)
+			break;
+	}
+
+	tv->tv_sec = sec;
+	tv->tv_usec = ns / 1000;
+
+	return 0;
+}
+
+int gettimeofday(struct timeval *tv, struct timezone *tz)
+	__attribute__((weak, alias("__vdso_gettimeofday")));
diff --git a/arch/tile/kernel/vdso/vrt_sigreturn.S b/arch/tile/kernel/vdso/vrt_sigreturn.S
new file mode 100644
index 000000000000..6326caf4a039
--- /dev/null
+++ b/arch/tile/kernel/vdso/vrt_sigreturn.S
@@ -0,0 +1,30 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/linkage.h>
+#include <arch/abi.h>
+#include <asm/unistd.h>
+
+/*
+ * Note that libc has a copy of this function that it uses to compare
+ * against the PC when a stack backtrace ends, so if this code is
+ * changed, the libc implementation(s) should also be updated.
+ */
+ENTRY(__vdso_rt_sigreturn)
+	moveli TREG_SYSCALL_NR_NAME, __NR_rt_sigreturn
+	swint1
+	/* We don't use ENDPROC to avoid tagging this symbol as FUNC,
+	 * which confuses the perf tool.
+	 */
+	END(__vdso_rt_sigreturn)
diff --git a/arch/tile/kernel/vmlinux.lds.S b/arch/tile/kernel/vmlinux.lds.S
index a13ed902afbb..f1819423ffc9 100644
--- a/arch/tile/kernel/vmlinux.lds.S
+++ b/arch/tile/kernel/vmlinux.lds.S
@@ -5,7 +5,7 @@
 #include <hv/hypervisor.h>
 
 /* Text loads starting from the supervisor interrupt vector address. */
-#define TEXT_OFFSET MEM_SV_INTRPT
+#define TEXT_OFFSET MEM_SV_START
 
 OUTPUT_ARCH(tile)
 ENTRY(_start)
@@ -13,7 +13,7 @@ jiffies = jiffies_64;
 
 PHDRS
 {
-  intrpt1 PT_LOAD ;
+  intrpt PT_LOAD ;
   text PT_LOAD ;
   data PT_LOAD ;
 }
@@ -24,14 +24,17 @@ SECTIONS
   #define LOAD_OFFSET TEXT_OFFSET
 
   /* Interrupt vectors */
-  .intrpt1 (LOAD_OFFSET) : AT ( 0 )   /* put at the start of physical memory */
+  .intrpt (LOAD_OFFSET) : AT ( 0 )   /* put at the start of physical memory */
   {
     _text = .;
-    *(.intrpt1)
-  } :intrpt1 =0
+    *(.intrpt)
+  } :intrpt =0
 
   /* Hypervisor call vectors */
-  #include "hvglue.lds"
+  . = ALIGN(0x10000);
+  .hvglue : AT (ADDR(.hvglue) - LOAD_OFFSET) {
+    *(.hvglue)
+  } :NONE
 
   /* Now the real code */
   . = ALIGN(0x20000);
@@ -40,7 +43,11 @@ SECTIONS
     HEAD_TEXT
     SCHED_TEXT
     LOCK_TEXT
+    KPROBES_TEXT
+    IRQENTRY_TEXT
     __fix_text_end = .;   /* tile-cpack won't rearrange before this */
+    ALIGN_FUNCTION();
+    *(.hottext*)
     TEXT_TEXT
     *(.text.*)
     *(.coldtext*)
@@ -67,20 +74,8 @@ SECTIONS
   __init_end = .;
 
   _sdata = .;                   /* Start of data section */
-
   RO_DATA_SECTION(PAGE_SIZE)
-
-  /* initially writeable, then read-only */
-  . = ALIGN(PAGE_SIZE);
-  __w1data_begin = .;
-  .w1data : AT(ADDR(.w1data) - LOAD_OFFSET) {
-    VMLINUX_SYMBOL(__w1data_begin) = .;
-    *(.w1data)
-    VMLINUX_SYMBOL(__w1data_end) = .;
-  }
-
   RW_DATA_SECTION(L2_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE)
-
   _edata = .;
 
   EXCEPTION_TABLE(L2_CACHE_BYTES)
diff --git a/arch/tile/lib/Makefile b/arch/tile/lib/Makefile
index 985f59858234..c4211cbb2021 100644
--- a/arch/tile/lib/Makefile
+++ b/arch/tile/lib/Makefile
@@ -4,15 +4,15 @@
 
 lib-y = cacheflush.o checksum.o cpumask.o delay.o uaccess.o \
 	memmove.o memcpy_$(BITS).o memchr_$(BITS).o memset_$(BITS).o \
-	strchr_$(BITS).o strlen_$(BITS).o
-
-ifeq ($(CONFIG_TILEGX),y)
-CFLAGS_REMOVE_memcpy_user_64.o = -fno-omit-frame-pointer
-lib-y += memcpy_user_64.o
-else
-lib-y += atomic_32.o atomic_asm_32.o memcpy_tile64.o
-endif
+	strchr_$(BITS).o strlen_$(BITS).o strnlen_$(BITS).o
 
+lib-$(CONFIG_TILEGX) += memcpy_user_64.o
+lib-$(CONFIG_TILEPRO) += atomic_32.o atomic_asm_32.o
 lib-$(CONFIG_SMP) += spinlock_$(BITS).o usercopy_$(BITS).o
 
 obj-$(CONFIG_MODULES) += exports.o
+
+# The finv_buffer_remote() and copy_{to,from}_user() routines can't
+# have -pg added, since they both rely on being leaf functions.
+CFLAGS_REMOVE_cacheflush.o = -pg
+CFLAGS_REMOVE_memcpy_user_64.o = -pg
diff --git a/arch/tile/lib/atomic_32.c b/arch/tile/lib/atomic_32.c
index f5cada70c3c8..759efa337be8 100644
--- a/arch/tile/lib/atomic_32.c
+++ b/arch/tile/lib/atomic_32.c
@@ -20,50 +20,12 @@
 #include <linux/atomic.h>
 #include <arch/chip.h>
 
-/* See <asm/atomic_32.h> */
-#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
-
-/*
- * A block of memory containing locks for atomic ops. Each instance of this
- * struct will be homed on a different CPU.
- */
-struct atomic_locks_on_cpu {
-	int lock[ATOMIC_HASH_L2_SIZE];
-} __attribute__((aligned(ATOMIC_HASH_L2_SIZE * 4)));
-
-static DEFINE_PER_CPU(struct atomic_locks_on_cpu, atomic_lock_pool);
-
-/* The locks we'll use until __init_atomic_per_cpu is called. */
-static struct atomic_locks_on_cpu __initdata initial_atomic_locks;
-
-/* Hash into this vector to get a pointer to lock for the given atomic. */
-struct atomic_locks_on_cpu *atomic_lock_ptr[ATOMIC_HASH_L1_SIZE]
-	__write_once = {
-	[0 ... ATOMIC_HASH_L1_SIZE-1] (&initial_atomic_locks)
-};
-
-#else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
-
 /* This page is remapped on startup to be hash-for-home. */
 int atomic_locks[PAGE_SIZE / sizeof(int)] __page_aligned_bss;
 
-#endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
-
 int *__atomic_hashed_lock(volatile void *v)
 {
 	/* NOTE: this code must match "sys_cmpxchg" in kernel/intvec_32.S */
-#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
-	unsigned long i =
-		(unsigned long) v & ((PAGE_SIZE-1) & -sizeof(long long));
-	unsigned long n = __insn_crc32_32(0, i);
-
-	/* Grab high bits for L1 index. */
-	unsigned long l1_index = n >> ((sizeof(n) * 8) - ATOMIC_HASH_L1_SHIFT);
-	/* Grab low bits for L2 index. */
-	unsigned long l2_index = n & (ATOMIC_HASH_L2_SIZE - 1);
-
-	return &atomic_lock_ptr[l1_index]->lock[l2_index];
-#else
 	/*
 	 * Use bits [3, 3 + ATOMIC_HASH_SHIFT) as the lock index.
 	 * Using mm works here because atomic_locks is page aligned.
@@ -72,26 +34,13 @@ int *__atomic_hashed_lock(volatile void *v)
 				      (unsigned long)atomic_locks,
 				      2, (ATOMIC_HASH_SHIFT + 2) - 1);
 	return (int *)ptr;
-#endif
 }
 
 #ifdef CONFIG_SMP
 /* Return whether the passed pointer is a valid atomic lock pointer. */
 static int is_atomic_lock(int *p)
 {
-#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
-	int i;
-	for (i = 0; i < ATOMIC_HASH_L1_SIZE; ++i) {
-
-		if (p >= &atomic_lock_ptr[i]->lock[0] &&
-		    p < &atomic_lock_ptr[i]->lock[ATOMIC_HASH_L2_SIZE]) {
-			return 1;
-		}
-	}
-	return 0;
-#else
 	return p >= &atomic_locks[0] && p < &atomic_locks[ATOMIC_HASH_SIZE];
-#endif
 }
 
 void __atomic_fault_unlock(int *irqlock_word)
@@ -110,33 +59,32 @@ static inline int *__atomic_setup(volatile void *v)
 	return __atomic_hashed_lock(v);
 }
 
-int _atomic_xchg(atomic_t *v, int n)
+int _atomic_xchg(int *v, int n)
 {
-	return __atomic_xchg(&v->counter, __atomic_setup(v), n).val;
+	return __atomic_xchg(v, __atomic_setup(v), n).val;
 }
 EXPORT_SYMBOL(_atomic_xchg);
 
-int _atomic_xchg_add(atomic_t *v, int i)
+int _atomic_xchg_add(int *v, int i)
 {
-	return __atomic_xchg_add(&v->counter, __atomic_setup(v), i).val;
+	return __atomic_xchg_add(v, __atomic_setup(v), i).val;
 }
 EXPORT_SYMBOL(_atomic_xchg_add);
 
-int _atomic_xchg_add_unless(atomic_t *v, int a, int u)
+int _atomic_xchg_add_unless(int *v, int a, int u)
 {
 	/*
 	 * Note: argument order is switched here since it is easier
 	 * to use the first argument consistently as the "old value"
 	 * in the assembly, as is done for _atomic_cmpxchg().
 	 */
-	return __atomic_xchg_add_unless(&v->counter, __atomic_setup(v), u, a)
-		.val;
+	return __atomic_xchg_add_unless(v, __atomic_setup(v), u, a).val;
 }
 EXPORT_SYMBOL(_atomic_xchg_add_unless);
 
-int _atomic_cmpxchg(atomic_t *v, int o, int n)
+int _atomic_cmpxchg(int *v, int o, int n)
 {
-	return __atomic_cmpxchg(&v->counter, __atomic_setup(v), o, n).val;
+	return __atomic_cmpxchg(v, __atomic_setup(v), o, n).val;
 }
 EXPORT_SYMBOL(_atomic_cmpxchg);
 
@@ -159,33 +107,32 @@ unsigned long _atomic_xor(volatile unsigned long *p, unsigned long mask)
 EXPORT_SYMBOL(_atomic_xor);
 
 
-u64 _atomic64_xchg(atomic64_t *v, u64 n)
+u64 _atomic64_xchg(u64 *v, u64 n)
 {
-	return __atomic64_xchg(&v->counter, __atomic_setup(v), n);
+	return __atomic64_xchg(v, __atomic_setup(v), n);
 }
 EXPORT_SYMBOL(_atomic64_xchg);
 
-u64 _atomic64_xchg_add(atomic64_t *v, u64 i)
+u64 _atomic64_xchg_add(u64 *v, u64 i)
 {
-	return __atomic64_xchg_add(&v->counter, __atomic_setup(v), i);
+	return __atomic64_xchg_add(v, __atomic_setup(v), i);
 }
 EXPORT_SYMBOL(_atomic64_xchg_add);
 
-u64 _atomic64_xchg_add_unless(atomic64_t *v, u64 a, u64 u)
+u64 _atomic64_xchg_add_unless(u64 *v, u64 a, u64 u)
 {
 	/*
 	 * Note: argument order is switched here since it is easier
 	 * to use the first argument consistently as the "old value"
 	 * in the assembly, as is done for _atomic_cmpxchg().
 	 */
-	return __atomic64_xchg_add_unless(&v->counter, __atomic_setup(v),
-					  u, a);
+	return __atomic64_xchg_add_unless(v, __atomic_setup(v), u, a);
 }
 EXPORT_SYMBOL(_atomic64_xchg_add_unless);
 
-u64 _atomic64_cmpxchg(atomic64_t *v, u64 o, u64 n)
+u64 _atomic64_cmpxchg(u64 *v, u64 o, u64 n)
 {
-	return __atomic64_cmpxchg(&v->counter, __atomic_setup(v), o, n);
+	return __atomic64_cmpxchg(v, __atomic_setup(v), o, n);
 }
 EXPORT_SYMBOL(_atomic64_cmpxchg);
 
@@ -208,54 +155,8 @@ struct __get_user __atomic_bad_address(int __user *addr)
 }
 
 
-#if CHIP_HAS_CBOX_HOME_MAP()
-static int __init noatomichash(char *str)
-{
-	pr_warning("noatomichash is deprecated.\n");
-	return 1;
-}
-__setup("noatomichash", noatomichash);
-#endif
-
 void __init __init_atomic_per_cpu(void)
 {
-#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
-
-	unsigned int i;
-	int actual_cpu;
-
-	/*
-	 * Before this is called from setup, we just have one lock for
-	 * all atomic objects/operations.  Here we replace the
-	 * elements of atomic_lock_ptr so that they point at per_cpu
-	 * integers.  This seemingly over-complex approach stems from
-	 * the fact that DEFINE_PER_CPU defines an entry for each cpu
-	 * in the grid, not each cpu from 0..ATOMIC_HASH_SIZE-1.  But
-	 * for efficient hashing of atomics to their locks we want a
-	 * compile time constant power of 2 for the size of this
-	 * table, so we use ATOMIC_HASH_SIZE.
-	 *
-	 * Here we populate atomic_lock_ptr from the per cpu
-	 * atomic_lock_pool, interspersing by actual cpu so that
-	 * subsequent elements are homed on consecutive cpus.
-	 */
-
-	actual_cpu = cpumask_first(cpu_possible_mask);
-
-	for (i = 0; i < ATOMIC_HASH_L1_SIZE; ++i) {
-		/*
-		 * Preincrement to slightly bias against using cpu 0,
-		 * which has plenty of stuff homed on it already.
-		 */
-		actual_cpu = cpumask_next(actual_cpu, cpu_possible_mask);
-		if (actual_cpu >= nr_cpu_ids)
-			actual_cpu = cpumask_first(cpu_possible_mask);
-
-		atomic_lock_ptr[i] = &per_cpu(atomic_lock_pool, actual_cpu);
-	}
-
-#else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
-
 	/* Validate power-of-two and "bigger than cpus" assumption */
 	BUILD_BUG_ON(ATOMIC_HASH_SIZE & (ATOMIC_HASH_SIZE-1));
 	BUG_ON(ATOMIC_HASH_SIZE < nr_cpu_ids);
@@ -279,6 +180,4 @@ void __init __init_atomic_per_cpu(void)
 	 * That should not produce more indices than ATOMIC_HASH_SIZE.
 	 */
 	BUILD_BUG_ON((PAGE_SIZE >> 3) > ATOMIC_HASH_SIZE);
-
-#endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
 }
diff --git a/arch/tile/lib/atomic_asm_32.S b/arch/tile/lib/atomic_asm_32.S
index 30638042691d..6bda3132cd61 100644
--- a/arch/tile/lib/atomic_asm_32.S
+++ b/arch/tile/lib/atomic_asm_32.S
@@ -164,6 +164,7 @@ STD_ENTRY_SECTION(__atomic\name, .text.atomic)
 	STD_ENDPROC(__atomic\name)
 	.ifc \bitwidth,32
 	.pushsection __ex_table,"a"
+	.align  4
 	.word   1b, __atomic\name
 	.word   2b, __atomic\name
 	.word   __atomic\name, __atomic_bad_address
diff --git a/arch/tile/lib/cacheflush.c b/arch/tile/lib/cacheflush.c
index 8f8ad814b139..9c0ec22009a5 100644
--- a/arch/tile/lib/cacheflush.c
+++ b/arch/tile/lib/cacheflush.c
@@ -36,7 +36,8 @@ static inline void force_load(char *p)
  * core (if "!hfh") or homed via hash-for-home (if "hfh"), waiting
  * until the memory controller holds the flushed values.
  */
-void finv_buffer_remote(void *buffer, size_t size, int hfh)
+void __attribute__((optimize("omit-frame-pointer")))
+finv_buffer_remote(void *buffer, size_t size, int hfh)
 {
 	char *p, *base;
 	size_t step_size, load_count;
@@ -147,18 +148,21 @@ void finv_buffer_remote(void *buffer, size_t size, int hfh)
 		force_load(p);
 
 	/*
-	 * Repeat, but with inv's instead of loads, to get rid of the
+	 * Repeat, but with finv's instead of loads, to get rid of the
 	 * data we just loaded into our own cache and the old home L3.
-	 * No need to unroll since inv's don't target a register.
+	 * No need to unroll since finv's don't target a register.
+	 * The finv's are guaranteed not to actually flush the data in
+	 * the buffer back to their home, since we just read it, so the
+	 * lines are clean in cache; we will only invalidate those lines.
 	 */
 	p = (char *)buffer + size - 1;
-	__insn_inv(p);
+	__insn_finv(p);
 	p -= step_size;
 	p = (char *)((unsigned long)p | (step_size - 1));
 	for (; p >= base; p -= step_size)
-		__insn_inv(p);
+		__insn_finv(p);
 
-	/* Wait for the load+inv's (and thus finvs) to have completed. */
+	/* Wait for these finv's (and thus the first finvs) to be done. */
 	__insn_mf();
 
 #ifdef __tilegx__
diff --git a/arch/tile/lib/exports.c b/arch/tile/lib/exports.c
index a93b02a25222..82733c87d67e 100644
--- a/arch/tile/lib/exports.c
+++ b/arch/tile/lib/exports.c
@@ -22,7 +22,6 @@ EXPORT_SYMBOL(strnlen_user_asm);
 EXPORT_SYMBOL(strncpy_from_user_asm);
 EXPORT_SYMBOL(clear_user_asm);
 EXPORT_SYMBOL(flush_user_asm);
-EXPORT_SYMBOL(inv_user_asm);
 EXPORT_SYMBOL(finv_user_asm);
 
 /* arch/tile/kernel/entry.S */
@@ -34,6 +33,12 @@ EXPORT_SYMBOL(dump_stack);
 /* arch/tile/kernel/head.S */
 EXPORT_SYMBOL(empty_zero_page);
 
+#ifdef CONFIG_FUNCTION_TRACER
+/* arch/tile/kernel/mcount_64.S */
+#include <asm/ftrace.h>
+EXPORT_SYMBOL(__mcount);
+#endif /* CONFIG_FUNCTION_TRACER */
+
 /* arch/tile/lib/, various memcpy files */
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(__copy_to_user_inatomic);
diff --git a/arch/tile/lib/memchr_64.c b/arch/tile/lib/memchr_64.c
index 6f867dbf7c56..f8196b3a950e 100644
--- a/arch/tile/lib/memchr_64.c
+++ b/arch/tile/lib/memchr_64.c
@@ -36,7 +36,7 @@ void *memchr(const void *s, int c, size_t n)
 	p = (const uint64_t *)(s_int & -8);
 
 	/* Create eight copies of the byte for which we are looking. */
-	goal = 0x0101010101010101ULL * (uint8_t) c;
+	goal = copy_byte(c);
 
 	/* Read the first word, but munge it so that bytes before the array
 	 * will not match goal.
diff --git a/arch/tile/lib/memcpy_32.S b/arch/tile/lib/memcpy_32.S
index 2a419a6122db..a2771ae5da53 100644
--- a/arch/tile/lib/memcpy_32.S
+++ b/arch/tile/lib/memcpy_32.S
@@ -22,14 +22,6 @@
 
 #include <linux/linkage.h>
 
-/* On TILE64, we wrap these functions via arch/tile/lib/memcpy_tile64.c */
-#if !CHIP_HAS_COHERENT_LOCAL_CACHE()
-#define memcpy __memcpy_asm
-#define __copy_to_user_inatomic __copy_to_user_inatomic_asm
-#define __copy_from_user_inatomic __copy_from_user_inatomic_asm
-#define __copy_from_user_zeroing __copy_from_user_zeroing_asm
-#endif
-
 #define IS_MEMCPY	  0
 #define IS_COPY_FROM_USER  1
 #define IS_COPY_FROM_USER_ZEROING  2
@@ -44,6 +36,7 @@
  */
 #define EX \
 	.pushsection __ex_table, "a"; \
+	.align 4; \
 	.word 9f, memcpy_common_fixup; \
 	.popsection; \
 	9
@@ -158,12 +151,9 @@ EX:	{ sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 }
 
 	{ addi r3, r1, 60; andi r9, r9, -64 }
 
-#if CHIP_HAS_WH64()
 	/* No need to prefetch dst, we'll just do the wh64
 	 * right before we copy a line.
 	 */
-#endif
-
 EX:	{ lw r5, r3; addi r3, r3, 64; movei r4, 1 }
 	/* Intentionally stall for a few cycles to leave L2 cache alone. */
 	{ bnzt zero, .; move r27, lr }
@@ -171,21 +161,6 @@ EX:	{ lw r6, r3; addi r3, r3, 64 }
 	/* Intentionally stall for a few cycles to leave L2 cache alone. */
 	{ bnzt zero, . }
 EX:	{ lw r7, r3; addi r3, r3, 64 }
-#if !CHIP_HAS_WH64()
-	/* Prefetch the dest */
-	/* Intentionally stall for a few cycles to leave L2 cache alone. */
-	{ bnzt zero, . }
-	/* Use a real load to cause a TLB miss if necessary.  We aren't using
-	 * r28, so this should be fine.
-	 */
-EX:	{ lw r28, r9; addi r9, r9, 64 }
-	/* Intentionally stall for a few cycles to leave L2 cache alone. */
-	{ bnzt zero, . }
-	{ prefetch r9; addi r9, r9, 64 }
-	/* Intentionally stall for a few cycles to leave L2 cache alone. */
-	{ bnzt zero, . }
-	{ prefetch r9; addi r9, r9, 64 }
-#endif
 	/* Intentionally stall for a few cycles to leave L2 cache alone. */
 	{ bz zero, .Lbig_loop2 }
 
@@ -286,13 +261,8 @@ EX:	{ lw r7, r3; addi r3, r3, 64 }
 	/* Fill second L1D line. */
 EX:	{ lw r17, r17; addi r1, r1, 48; mvz r3, r13, r1 } /* r17 = WORD_4 */
 
-#if CHIP_HAS_WH64()
 	/* Prepare destination line for writing. */
 EX:	{ wh64 r9; addi r9, r9, 64 }
-#else
-	/* Prefetch dest line */
-	{ prefetch r9; addi r9, r9, 64 }
-#endif
 	/* Load seven words that are L1D hits to cover wh64 L2 usage. */
 
 	/* Load the three remaining words from the last L1D line, which
@@ -330,16 +300,7 @@ EX:	{ lw r18, r1; addi r1, r1, 4 }                  /* r18 = WORD_8 */
 EX:	{ sw r0, r16; addi r0, r0, 4; add r16, r0, r2 } /* store(WORD_0) */
 EX:	{ sw r0, r13; addi r0, r0, 4; andi r16, r16, -64 } /* store(WORD_1) */
 EX:	{ sw r0, r14; addi r0, r0, 4; slt_u r16, r9, r16 } /* store(WORD_2) */
-#if CHIP_HAS_WH64()
 EX:	{ sw r0, r15; addi r0, r0, 4; addi r13, sp, -64 } /* store(WORD_3) */
-#else
-	/* Back up the r9 to a cache line we are already storing to
-	 * if it gets past the end of the dest vector.  Strictly speaking,
-	 * we don't need to back up to the start of a cache line, but it's free
-	 * and tidy, so why not?
-	 */
-EX:	{ sw r0, r15; addi r0, r0, 4; andi r13, r0, -64 } /* store(WORD_3) */
-#endif
 	/* Store second L1D line. */
 EX:	{ sw r0, r17; addi r0, r0, 4; mvz r9, r16, r13 }/* store(WORD_4) */
 EX:	{ sw r0, r19; addi r0, r0, 4 }                  /* store(WORD_5) */
@@ -403,7 +364,6 @@ EX:	{ sb r0, r3;   addi r0, r0, 1; addi r2, r2, -1 }
 
 .Ldest_is_word_aligned:
 
-#if CHIP_HAS_DWORD_ALIGN()
 EX:	{ andi r8, r0, 63; lwadd_na r6, r1, 4}
 	{ slti_u r9, r2, 64; bz r8, .Ldest_is_L2_line_aligned }
 
@@ -511,26 +471,6 @@ EX:	{ swadd r0, r13, 4; addi r2, r2, -32 }
 	/* Move r1 back to the point where it corresponds to r0. */
 	{ addi r1, r1, -4 }
 
-#else /* !CHIP_HAS_DWORD_ALIGN() */
-
-	/* Compute right/left shift counts and load initial source words. */
-	{ andi r5, r1, -4; andi r3, r1, 3 }
-EX:	{ lw r6, r5; addi r5, r5, 4; shli r3, r3, 3 }
-EX:	{ lw r7, r5; addi r5, r5, 4; sub r4, zero, r3 }
-
-	/* Load and store one word at a time, using shifts and ORs
-	 * to correct for the misaligned src.
-	 */
-.Lcopy_unaligned_src_loop:
-	{ shr r6, r6, r3; shl r8, r7, r4 }
-EX:	{ lw r7, r5; or r8, r8, r6; move r6, r7 }
-EX:	{ sw r0, r8; addi r0, r0, 4; addi r2, r2, -4 }
-	{ addi r5, r5, 4; slti_u r8, r2, 8 }
-	{ bzt r8, .Lcopy_unaligned_src_loop; addi r1, r1, 4 }
-
-	{ bz r2, .Lcopy_unaligned_done }
-#endif /* !CHIP_HAS_DWORD_ALIGN() */
-
 	/* Fall through */
 
 /*
@@ -614,5 +554,6 @@ memcpy_fixup_loop:
 	.size memcpy_common_fixup, . - memcpy_common_fixup
 
 	.section __ex_table,"a"
+	.align 4
 	.word .Lcfu, .Lcopy_from_user_fixup_zero_remainder
 	.word .Lctu, .Lcopy_to_user_fixup_done
diff --git a/arch/tile/lib/memcpy_64.c b/arch/tile/lib/memcpy_64.c
index c79b8e7c6828..4815354b8cd2 100644
--- a/arch/tile/lib/memcpy_64.c
+++ b/arch/tile/lib/memcpy_64.c
@@ -18,14 +18,17 @@
 /* EXPORT_SYMBOL() is in arch/tile/lib/exports.c since this should be asm. */
 
 /* Must be 8 bytes in size. */
-#define word_t uint64_t
+#define op_t uint64_t
 
-#if CHIP_L2_LINE_SIZE() != 64 && CHIP_L2_LINE_SIZE() != 128
-#error "Assumes 64 or 128 byte line size"
+/* Threshold value for when to enter the unrolled loops. */
+#define	OP_T_THRES	16
+
+#if CHIP_L2_LINE_SIZE() != 64
+#error "Assumes 64 byte line size"
 #endif
 
 /* How many cache lines ahead should we prefetch? */
-#define PREFETCH_LINES_AHEAD 3
+#define PREFETCH_LINES_AHEAD 4
 
 /*
  * Provide "base versions" of load and store for the normal code path.
@@ -51,15 +54,16 @@ void *memcpy(void *__restrict dstv, const void *__restrict srcv, size_t n)
  * macros to return a count of uncopied bytes due to mm fault.
  */
 #define RETVAL 0
-int USERCOPY_FUNC(void *__restrict dstv, const void *__restrict srcv, size_t n)
+int __attribute__((optimize("omit-frame-pointer")))
+USERCOPY_FUNC(void *__restrict dstv, const void *__restrict srcv, size_t n)
 #endif
 {
 	char *__restrict dst1 = (char *)dstv;
 	const char *__restrict src1 = (const char *)srcv;
 	const char *__restrict src1_end;
 	const char *__restrict prefetch;
-	word_t *__restrict dst8;    /* 8-byte pointer to destination memory. */
-	word_t final; /* Final bytes to write to trailing word, if any */
+	op_t *__restrict dst8;    /* 8-byte pointer to destination memory. */
+	op_t final; /* Final bytes to write to trailing word, if any */
 	long i;
 
 	if (n < 16) {
@@ -79,104 +83,228 @@ int USERCOPY_FUNC(void *__restrict dstv, const void *__restrict srcv, size_t n)
 	for (i = 0; i < PREFETCH_LINES_AHEAD; i++) {
 		__insn_prefetch(prefetch);
 		prefetch += CHIP_L2_LINE_SIZE();
-		prefetch = (prefetch > src1_end) ? prefetch : src1;
+		prefetch = (prefetch < src1_end) ? prefetch : src1;
 	}
 
 	/* Copy bytes until dst is word-aligned. */
-	for (; (uintptr_t)dst1 & (sizeof(word_t) - 1); n--)
+	for (; (uintptr_t)dst1 & (sizeof(op_t) - 1); n--)
 		ST1(dst1++, LD1(src1++));
 
 	/* 8-byte pointer to destination memory. */
-	dst8 = (word_t *)dst1;
-
-	if (__builtin_expect((uintptr_t)src1 & (sizeof(word_t) - 1), 0)) {
-		/*
-		 * Misaligned copy.  Copy 8 bytes at a time, but don't
-		 * bother with other fanciness.
-		 *
-		 * TODO: Consider prefetching and using wh64 as well.
-		 */
-
-		/* Create an aligned src8. */
-		const word_t *__restrict src8 =
-			(const word_t *)((uintptr_t)src1 & -sizeof(word_t));
-		word_t b;
-
-		word_t a = LD8(src8++);
-		for (; n >= sizeof(word_t); n -= sizeof(word_t)) {
-			b = LD8(src8++);
-			a = __insn_dblalign(a, b, src1);
-			ST8(dst8++, a);
-			a = b;
+	dst8 = (op_t *)dst1;
+
+	if (__builtin_expect((uintptr_t)src1 & (sizeof(op_t) - 1), 0)) {
+		/* Unaligned copy. */
+
+		op_t  tmp0 = 0, tmp1 = 0, tmp2, tmp3;
+		const op_t *src8 = (const op_t *) ((uintptr_t)src1 &
+						   -sizeof(op_t));
+		const void *srci = (void *)src1;
+		int m;
+
+		m = (CHIP_L2_LINE_SIZE() << 2) -
+			(((uintptr_t)dst8) & ((CHIP_L2_LINE_SIZE() << 2) - 1));
+		m = (n < m) ? n : m;
+		m /= sizeof(op_t);
+
+		/* Copy until 'dst' is cache-line-aligned. */
+		n -= (sizeof(op_t) * m);
+
+		switch (m % 4) {
+		case 0:
+			if (__builtin_expect(!m, 0))
+				goto _M0;
+			tmp1 = LD8(src8++);
+			tmp2 = LD8(src8++);
+			goto _8B3;
+		case 2:
+			m += 2;
+			tmp3 = LD8(src8++);
+			tmp0 = LD8(src8++);
+			goto _8B1;
+		case 3:
+			m += 1;
+			tmp2 = LD8(src8++);
+			tmp3 = LD8(src8++);
+			goto _8B2;
+		case 1:
+			m--;
+			tmp0 = LD8(src8++);
+			tmp1 = LD8(src8++);
+			if (__builtin_expect(!m, 0))
+				goto _8B0;
+		}
+
+		do {
+			tmp2 = LD8(src8++);
+			tmp0 =  __insn_dblalign(tmp0, tmp1, srci);
+			ST8(dst8++, tmp0);
+_8B3:
+			tmp3 = LD8(src8++);
+			tmp1 = __insn_dblalign(tmp1, tmp2, srci);
+			ST8(dst8++, tmp1);
+_8B2:
+			tmp0 = LD8(src8++);
+			tmp2 = __insn_dblalign(tmp2, tmp3, srci);
+			ST8(dst8++, tmp2);
+_8B1:
+			tmp1 = LD8(src8++);
+			tmp3 = __insn_dblalign(tmp3, tmp0, srci);
+			ST8(dst8++, tmp3);
+			m -= 4;
+		} while (m);
+
+_8B0:
+		tmp0 = __insn_dblalign(tmp0, tmp1, srci);
+		ST8(dst8++, tmp0);
+		src8--;
+
+_M0:
+		if (__builtin_expect(n >= CHIP_L2_LINE_SIZE(), 0)) {
+			op_t tmp4, tmp5, tmp6, tmp7, tmp8;
+
+			prefetch = ((const char *)src8) +
+				CHIP_L2_LINE_SIZE() * PREFETCH_LINES_AHEAD;
+
+			for (tmp0 = LD8(src8++); n >= CHIP_L2_LINE_SIZE();
+			     n -= CHIP_L2_LINE_SIZE()) {
+				/* Prefetch and advance to next line to
+				   prefetch, but don't go past the end.  */
+				__insn_prefetch(prefetch);
+
+				/* Make sure prefetch got scheduled
+				   earlier.  */
+				__asm__ ("" : : : "memory");
+
+				prefetch += CHIP_L2_LINE_SIZE();
+				prefetch = (prefetch < src1_end) ? prefetch :
+					(const char *) src8;
+
+				tmp1 = LD8(src8++);
+				tmp2 = LD8(src8++);
+				tmp3 = LD8(src8++);
+				tmp4 = LD8(src8++);
+				tmp5 = LD8(src8++);
+				tmp6 = LD8(src8++);
+				tmp7 = LD8(src8++);
+				tmp8 = LD8(src8++);
+
+				tmp0 = __insn_dblalign(tmp0, tmp1, srci);
+				tmp1 = __insn_dblalign(tmp1, tmp2, srci);
+				tmp2 = __insn_dblalign(tmp2, tmp3, srci);
+				tmp3 = __insn_dblalign(tmp3, tmp4, srci);
+				tmp4 = __insn_dblalign(tmp4, tmp5, srci);
+				tmp5 = __insn_dblalign(tmp5, tmp6, srci);
+				tmp6 = __insn_dblalign(tmp6, tmp7, srci);
+				tmp7 = __insn_dblalign(tmp7, tmp8, srci);
+
+				__insn_wh64(dst8);
+
+				ST8(dst8++, tmp0);
+				ST8(dst8++, tmp1);
+				ST8(dst8++, tmp2);
+				ST8(dst8++, tmp3);
+				ST8(dst8++, tmp4);
+				ST8(dst8++, tmp5);
+				ST8(dst8++, tmp6);
+				ST8(dst8++, tmp7);
+
+				tmp0 = tmp8;
+			}
+			src8--;
+		}
+
+		/* Copy the rest 8-byte chunks. */
+		if (n >= sizeof(op_t)) {
+			tmp0 = LD8(src8++);
+			for (; n >= sizeof(op_t); n -= sizeof(op_t)) {
+				tmp1 = LD8(src8++);
+				tmp0 = __insn_dblalign(tmp0, tmp1, srci);
+				ST8(dst8++, tmp0);
+				tmp0 = tmp1;
+			}
+			src8--;
 		}
 
 		if (n == 0)
 			return RETVAL;
 
-		b = ((const char *)src8 <= src1_end) ? *src8 : 0;
+		tmp0 = LD8(src8++);
+		tmp1 = ((const char *)src8 <= src1_end)
+			? LD8((op_t *)src8) : 0;
+		final = __insn_dblalign(tmp0, tmp1, srci);
 
-		/*
-		 * Final source bytes to write to trailing partial
-		 * word, if any.
-		 */
-		final = __insn_dblalign(a, b, src1);
 	} else {
 		/* Aligned copy. */
 
-		const word_t* __restrict src8 = (const word_t *)src1;
+		const op_t *__restrict src8 = (const op_t *)src1;
 
 		/* src8 and dst8 are both word-aligned. */
 		if (n >= CHIP_L2_LINE_SIZE()) {
 			/* Copy until 'dst' is cache-line-aligned. */
 			for (; (uintptr_t)dst8 & (CHIP_L2_LINE_SIZE() - 1);
-			     n -= sizeof(word_t))
+			     n -= sizeof(op_t))
 				ST8(dst8++, LD8(src8++));
 
 			for (; n >= CHIP_L2_LINE_SIZE(); ) {
-				__insn_wh64(dst8);
+				op_t tmp0, tmp1, tmp2, tmp3;
+				op_t tmp4, tmp5, tmp6, tmp7;
 
 				/*
 				 * Prefetch and advance to next line
-				 * to prefetch, but don't go past the end
+				 * to prefetch, but don't go past the
+				 * end.
 				 */
 				__insn_prefetch(prefetch);
+
+				/* Make sure prefetch got scheduled
+				   earlier.  */
+				__asm__ ("" : : : "memory");
+
 				prefetch += CHIP_L2_LINE_SIZE();
-				prefetch = (prefetch > src1_end) ? prefetch :
+				prefetch = (prefetch < src1_end) ? prefetch :
 					(const char *)src8;
 
 				/*
-				 * Copy an entire cache line.  Manually
-				 * unrolled to avoid idiosyncracies of
-				 * compiler unrolling.
+				 * Do all the loads before wh64.  This
+				 * is necessary if [src8, src8+7] and
+				 * [dst8, dst8+7] share the same cache
+				 * line and dst8 <= src8, as can be
+				 * the case when called from memmove,
+				 * or with code tested on x86 whose
+				 * memcpy always works with forward
+				 * copies.
 				 */
-#define COPY_WORD(offset) ({ ST8(dst8+offset, LD8(src8+offset)); n -= 8; })
-				COPY_WORD(0);
-				COPY_WORD(1);
-				COPY_WORD(2);
-				COPY_WORD(3);
-				COPY_WORD(4);
-				COPY_WORD(5);
-				COPY_WORD(6);
-				COPY_WORD(7);
-#if CHIP_L2_LINE_SIZE() == 128
-				COPY_WORD(8);
-				COPY_WORD(9);
-				COPY_WORD(10);
-				COPY_WORD(11);
-				COPY_WORD(12);
-				COPY_WORD(13);
-				COPY_WORD(14);
-				COPY_WORD(15);
-#elif CHIP_L2_LINE_SIZE() != 64
-# error Fix code that assumes particular L2 cache line sizes
-#endif
+				tmp0 = LD8(src8++);
+				tmp1 = LD8(src8++);
+				tmp2 = LD8(src8++);
+				tmp3 = LD8(src8++);
+				tmp4 = LD8(src8++);
+				tmp5 = LD8(src8++);
+				tmp6 = LD8(src8++);
+				tmp7 = LD8(src8++);
+
+				/* wh64 and wait for tmp7 load completion. */
+				__asm__ ("move %0, %0; wh64 %1\n"
+					 : : "r"(tmp7), "r"(dst8));
 
-				dst8 += CHIP_L2_LINE_SIZE() / sizeof(word_t);
-				src8 += CHIP_L2_LINE_SIZE() / sizeof(word_t);
+				ST8(dst8++, tmp0);
+				ST8(dst8++, tmp1);
+				ST8(dst8++, tmp2);
+				ST8(dst8++, tmp3);
+				ST8(dst8++, tmp4);
+				ST8(dst8++, tmp5);
+				ST8(dst8++, tmp6);
+				ST8(dst8++, tmp7);
+
+				n -= CHIP_L2_LINE_SIZE();
 			}
+#if CHIP_L2_LINE_SIZE() != 64
+# error "Fix code that assumes particular L2 cache line size."
+#endif
 		}
 
-		for (; n >= sizeof(word_t); n -= sizeof(word_t))
+		for (; n >= sizeof(op_t); n -= sizeof(op_t))
 			ST8(dst8++, LD8(src8++));
 
 		if (__builtin_expect(n == 0, 1))
diff --git a/arch/tile/lib/memcpy_tile64.c b/arch/tile/lib/memcpy_tile64.c
deleted file mode 100644
index 3bc4b4e40d93..000000000000
--- a/arch/tile/lib/memcpy_tile64.c
+++ /dev/null
@@ -1,276 +0,0 @@
-/*
- * Copyright 2010 Tilera Corporation. All Rights Reserved.
- *
- *   This program is free software; you can redistribute it and/or
- *   modify it under the terms of the GNU General Public License
- *   as published by the Free Software Foundation, version 2.
- *
- *   This program is distributed in the hope that it will be useful, but
- *   WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- *   NON INFRINGEMENT.  See the GNU General Public License for
- *   more details.
- */
-
-#include <linux/string.h>
-#include <linux/smp.h>
-#include <linux/module.h>
-#include <linux/uaccess.h>
-#include <asm/fixmap.h>
-#include <asm/kmap_types.h>
-#include <asm/tlbflush.h>
-#include <hv/hypervisor.h>
-#include <arch/chip.h>
-
-
-#if !CHIP_HAS_COHERENT_LOCAL_CACHE()
-
-/* Defined in memcpy.S */
-extern unsigned long __memcpy_asm(void *to, const void *from, unsigned long n);
-extern unsigned long __copy_to_user_inatomic_asm(
-	void __user *to, const void *from, unsigned long n);
-extern unsigned long __copy_from_user_inatomic_asm(
-	void *to, const void __user *from, unsigned long n);
-extern unsigned long __copy_from_user_zeroing_asm(
-	void *to, const void __user *from, unsigned long n);
-
-typedef unsigned long (*memcpy_t)(void *, const void *, unsigned long);
-
-/* Size above which to consider TLB games for performance */
-#define LARGE_COPY_CUTOFF 2048
-
-/* Communicate to the simulator what we are trying to do. */
-#define sim_allow_multiple_caching(b) \
-  __insn_mtspr(SPR_SIM_CONTROL, \
-   SIM_CONTROL_ALLOW_MULTIPLE_CACHING | ((b) << _SIM_CONTROL_OPERATOR_BITS))
-
-/*
- * Copy memory by briefly enabling incoherent cacheline-at-a-time mode.
- *
- * We set up our own source and destination PTEs that we fully control.
- * This is the only way to guarantee that we don't race with another
- * thread that is modifying the PTE; we can't afford to try the
- * copy_{to,from}_user() technique of catching the interrupt, since
- * we must run with interrupts disabled to avoid the risk of some
- * other code seeing the incoherent data in our cache.  (Recall that
- * our cache is indexed by PA, so even if the other code doesn't use
- * our kmap_atomic virtual addresses, they'll still hit in cache using
- * the normal VAs that aren't supposed to hit in cache.)
- */
-static void memcpy_multicache(void *dest, const void *source,
-			      pte_t dst_pte, pte_t src_pte, int len)
-{
-	int idx;
-	unsigned long flags, newsrc, newdst;
-	pmd_t *pmdp;
-	pte_t *ptep;
-	int type0, type1;
-	int cpu = get_cpu();
-
-	/*
-	 * Disable interrupts so that we don't recurse into memcpy()
-	 * in an interrupt handler, nor accidentally reference
-	 * the PA of the source from an interrupt routine.  Also
-	 * notify the simulator that we're playing games so we don't
-	 * generate spurious coherency warnings.
-	 */
-	local_irq_save(flags);
-	sim_allow_multiple_caching(1);
-
-	/* Set up the new dest mapping */
-	type0 = kmap_atomic_idx_push();
-	idx = FIX_KMAP_BEGIN + (KM_TYPE_NR * cpu) + type0;
-	newdst = __fix_to_virt(idx) + ((unsigned long)dest & (PAGE_SIZE-1));
-	pmdp = pmd_offset(pud_offset(pgd_offset_k(newdst), newdst), newdst);
-	ptep = pte_offset_kernel(pmdp, newdst);
-	if (pte_val(*ptep) != pte_val(dst_pte)) {
-		set_pte(ptep, dst_pte);
-		local_flush_tlb_page(NULL, newdst, PAGE_SIZE);
-	}
-
-	/* Set up the new source mapping */
-	type1 = kmap_atomic_idx_push();
-	idx += (type0 - type1);
-	src_pte = hv_pte_set_nc(src_pte);
-	src_pte = hv_pte_clear_writable(src_pte);  /* be paranoid */
-	newsrc = __fix_to_virt(idx) + ((unsigned long)source & (PAGE_SIZE-1));
-	pmdp = pmd_offset(pud_offset(pgd_offset_k(newsrc), newsrc), newsrc);
-	ptep = pte_offset_kernel(pmdp, newsrc);
-	__set_pte(ptep, src_pte);   /* set_pte() would be confused by this */
-	local_flush_tlb_page(NULL, newsrc, PAGE_SIZE);
-
-	/* Actually move the data. */
-	__memcpy_asm((void *)newdst, (const void *)newsrc, len);
-
-	/*
-	 * Remap the source as locally-cached and not OLOC'ed so that
-	 * we can inval without also invaling the remote cpu's cache.
-	 * This also avoids known errata with inv'ing cacheable oloc data.
-	 */
-	src_pte = hv_pte_set_mode(src_pte, HV_PTE_MODE_CACHE_NO_L3);
-	src_pte = hv_pte_set_writable(src_pte); /* need write access for inv */
-	__set_pte(ptep, src_pte);   /* set_pte() would be confused by this */
-	local_flush_tlb_page(NULL, newsrc, PAGE_SIZE);
-
-	/*
-	 * Do the actual invalidation, covering the full L2 cache line
-	 * at the end since __memcpy_asm() is somewhat aggressive.
-	 */
-	__inv_buffer((void *)newsrc, len);
-
-	/*
-	 * We're done: notify the simulator that all is back to normal,
-	 * and re-enable interrupts and pre-emption.
-	 */
-	kmap_atomic_idx_pop();
-	kmap_atomic_idx_pop();
-	sim_allow_multiple_caching(0);
-	local_irq_restore(flags);
-	put_cpu();
-}
-
-/*
- * Identify large copies from remotely-cached memory, and copy them
- * via memcpy_multicache() if they look good, otherwise fall back
- * to the particular kind of copying passed as the memcpy_t function.
- */
-static unsigned long fast_copy(void *dest, const void *source, int len,
-			       memcpy_t func)
-{
-	/*
-	 * Check if it's big enough to bother with.  We may end up doing a
-	 * small copy via TLB manipulation if we're near a page boundary,
-	 * but presumably we'll make it up when we hit the second page.
-	 */
-	while (len >= LARGE_COPY_CUTOFF) {
-		int copy_size, bytes_left_on_page;
-		pte_t *src_ptep, *dst_ptep;
-		pte_t src_pte, dst_pte;
-		struct page *src_page, *dst_page;
-
-		/* Is the source page oloc'ed to a remote cpu? */
-retry_source:
-		src_ptep = virt_to_pte(current->mm, (unsigned long)source);
-		if (src_ptep == NULL)
-			break;
-		src_pte = *src_ptep;
-		if (!hv_pte_get_present(src_pte) ||
-		    !hv_pte_get_readable(src_pte) ||
-		    hv_pte_get_mode(src_pte) != HV_PTE_MODE_CACHE_TILE_L3)
-			break;
-		if (get_remote_cache_cpu(src_pte) == smp_processor_id())
-			break;
-		src_page = pfn_to_page(pte_pfn(src_pte));
-		get_page(src_page);
-		if (pte_val(src_pte) != pte_val(*src_ptep)) {
-			put_page(src_page);
-			goto retry_source;
-		}
-		if (pte_huge(src_pte)) {
-			/* Adjust the PTE to correspond to a small page */
-			int pfn = pte_pfn(src_pte);
-			pfn += (((unsigned long)source & (HPAGE_SIZE-1))
-				>> PAGE_SHIFT);
-			src_pte = pfn_pte(pfn, src_pte);
-			src_pte = pte_mksmall(src_pte);
-		}
-
-		/* Is the destination page writable? */
-retry_dest:
-		dst_ptep = virt_to_pte(current->mm, (unsigned long)dest);
-		if (dst_ptep == NULL) {
-			put_page(src_page);
-			break;
-		}
-		dst_pte = *dst_ptep;
-		if (!hv_pte_get_present(dst_pte) ||
-		    !hv_pte_get_writable(dst_pte)) {
-			put_page(src_page);
-			break;
-		}
-		dst_page = pfn_to_page(pte_pfn(dst_pte));
-		if (dst_page == src_page) {
-			/*
-			 * Source and dest are on the same page; this
-			 * potentially exposes us to incoherence if any
-			 * part of src and dest overlap on a cache line.
-			 * Just give up rather than trying to be precise.
-			 */
-			put_page(src_page);
-			break;
-		}
-		get_page(dst_page);
-		if (pte_val(dst_pte) != pte_val(*dst_ptep)) {
-			put_page(dst_page);
-			goto retry_dest;
-		}
-		if (pte_huge(dst_pte)) {
-			/* Adjust the PTE to correspond to a small page */
-			int pfn = pte_pfn(dst_pte);
-			pfn += (((unsigned long)dest & (HPAGE_SIZE-1))
-				>> PAGE_SHIFT);
-			dst_pte = pfn_pte(pfn, dst_pte);
-			dst_pte = pte_mksmall(dst_pte);
-		}
-
-		/* All looks good: create a cachable PTE and copy from it */
-		copy_size = len;
-		bytes_left_on_page =
-			PAGE_SIZE - (((int)source) & (PAGE_SIZE-1));
-		if (copy_size > bytes_left_on_page)
-			copy_size = bytes_left_on_page;
-		bytes_left_on_page =
-			PAGE_SIZE - (((int)dest) & (PAGE_SIZE-1));
-		if (copy_size > bytes_left_on_page)
-			copy_size = bytes_left_on_page;
-		memcpy_multicache(dest, source, dst_pte, src_pte, copy_size);
-
-		/* Release the pages */
-		put_page(dst_page);
-		put_page(src_page);
-
-		/* Continue on the next page */
-		dest += copy_size;
-		source += copy_size;
-		len -= copy_size;
-	}
-
-	return func(dest, source, len);
-}
-
-void *memcpy(void *to, const void *from, __kernel_size_t n)
-{
-	if (n < LARGE_COPY_CUTOFF)
-		return (void *)__memcpy_asm(to, from, n);
-	else
-		return (void *)fast_copy(to, from, n, __memcpy_asm);
-}
-
-unsigned long __copy_to_user_inatomic(void __user *to, const void *from,
-				      unsigned long n)
-{
-	if (n < LARGE_COPY_CUTOFF)
-		return __copy_to_user_inatomic_asm(to, from, n);
-	else
-		return fast_copy(to, from, n, __copy_to_user_inatomic_asm);
-}
-
-unsigned long __copy_from_user_inatomic(void *to, const void __user *from,
-					unsigned long n)
-{
-	if (n < LARGE_COPY_CUTOFF)
-		return __copy_from_user_inatomic_asm(to, from, n);
-	else
-		return fast_copy(to, from, n, __copy_from_user_inatomic_asm);
-}
-
-unsigned long __copy_from_user_zeroing(void *to, const void __user *from,
-				       unsigned long n)
-{
-	if (n < LARGE_COPY_CUTOFF)
-		return __copy_from_user_zeroing_asm(to, from, n);
-	else
-		return fast_copy(to, from, n, __copy_from_user_zeroing_asm);
-}
-
-#endif /* !CHIP_HAS_COHERENT_LOCAL_CACHE() */
diff --git a/arch/tile/lib/memcpy_user_64.c b/arch/tile/lib/memcpy_user_64.c
index 37440caa7370..88c7016492c4 100644
--- a/arch/tile/lib/memcpy_user_64.c
+++ b/arch/tile/lib/memcpy_user_64.c
@@ -31,6 +31,7 @@
 		    ".pushsection .coldtext.memcpy,\"ax\";"	\
 		    "2: { move r0, %2; jrp lr };"		\
 		    ".section __ex_table,\"a\";"		\
+		    ".align 8;"					\
 		    ".quad 1b, 2b;"				\
 		    ".popsection"				\
 		    : "=m" (*(p)) : "r" (v), "r" (n));		\
@@ -43,6 +44,7 @@
 		    ".pushsection .coldtext.memcpy,\"ax\";"	\
 		    "2: { move r0, %2; jrp lr };"		\
 		    ".section __ex_table,\"a\";"		\
+		    ".align 8;"					\
 		    ".quad 1b, 2b;"				\
 		    ".popsection"				\
 		    : "=r" (__v) : "m" (*(p)), "r" (n));	\
diff --git a/arch/tile/lib/memset_32.c b/arch/tile/lib/memset_32.c
index 57dbb3a5bff8..2042bfe6595f 100644
--- a/arch/tile/lib/memset_32.c
+++ b/arch/tile/lib/memset_32.c
@@ -12,13 +12,10 @@
  *   more details.
  */
 
-#include <arch/chip.h>
-
 #include <linux/types.h>
 #include <linux/string.h>
 #include <linux/module.h>
-
-#undef memset
+#include <arch/chip.h>
 
 void *memset(void *s, int c, size_t n)
 {
@@ -26,11 +23,7 @@ void *memset(void *s, int c, size_t n)
 	int n32;
 	uint32_t v16, v32;
 	uint8_t *out8 = s;
-#if !CHIP_HAS_WH64()
-	int ahead32;
-#else
 	int to_align32;
-#endif
 
 	/* Experimentation shows that a trivial tight loop is a win up until
 	 * around a size of 20, where writing a word at a time starts to win.
@@ -61,21 +54,6 @@ void *memset(void *s, int c, size_t n)
 		return s;
 	}
 
-#if !CHIP_HAS_WH64()
-	/* Use a spare issue slot to start prefetching the first cache
-	 * line early. This instruction is free as the store can be buried
-	 * in otherwise idle issue slots doing ALU ops.
-	 */
-	__insn_prefetch(out8);
-
-	/* We prefetch the end so that a short memset that spans two cache
-	 * lines gets some prefetching benefit. Again we believe this is free
-	 * to issue.
-	 */
-	__insn_prefetch(&out8[n - 1]);
-#endif /* !CHIP_HAS_WH64() */
-
-
 	/* Align 'out8'. We know n >= 3 so this won't write past the end. */
 	while (((uintptr_t) out8 & 3) != 0) {
 		*out8++ = c;
@@ -96,90 +74,6 @@ void *memset(void *s, int c, size_t n)
 	/* This must be at least 8 or the following loop doesn't work. */
 #define CACHE_LINE_SIZE_IN_WORDS (CHIP_L2_LINE_SIZE() / 4)
 
-#if !CHIP_HAS_WH64()
-
-	ahead32 = CACHE_LINE_SIZE_IN_WORDS;
-
-	/* We already prefetched the first and last cache lines, so
-	 * we only need to do more prefetching if we are storing
-	 * to more than two cache lines.
-	 */
-	if (n32 > CACHE_LINE_SIZE_IN_WORDS * 2) {
-		int i;
-
-		/* Prefetch the next several cache lines.
-		 * This is the setup code for the software-pipelined
-		 * loop below.
-		 */
-#define MAX_PREFETCH 5
-		ahead32 = n32 & -CACHE_LINE_SIZE_IN_WORDS;
-		if (ahead32 > MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS)
-			ahead32 = MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS;
-
-		for (i = CACHE_LINE_SIZE_IN_WORDS;
-		     i < ahead32; i += CACHE_LINE_SIZE_IN_WORDS)
-			__insn_prefetch(&out32[i]);
-	}
-
-	if (n32 > ahead32) {
-		while (1) {
-			int j;
-
-			/* Prefetch by reading one word several cache lines
-			 * ahead.  Since loads are non-blocking this will
-			 * cause the full cache line to be read while we are
-			 * finishing earlier cache lines.  Using a store
-			 * here causes microarchitectural performance
-			 * problems where a victimizing store miss goes to
-			 * the head of the retry FIFO and locks the pipe for
-			 * a few cycles.  So a few subsequent stores in this
-			 * loop go into the retry FIFO, and then later
-			 * stores see other stores to the same cache line
-			 * are already in the retry FIFO and themselves go
-			 * into the retry FIFO, filling it up and grinding
-			 * to a halt waiting for the original miss to be
-			 * satisfied.
-			 */
-			__insn_prefetch(&out32[ahead32]);
-
-#if CACHE_LINE_SIZE_IN_WORDS % 4 != 0
-#error "Unhandled CACHE_LINE_SIZE_IN_WORDS"
-#endif
-
-			n32 -= CACHE_LINE_SIZE_IN_WORDS;
-
-			/* Save icache space by only partially unrolling
-			 * this loop.
-			 */
-			for (j = CACHE_LINE_SIZE_IN_WORDS / 4; j > 0; j--) {
-				*out32++ = v32;
-				*out32++ = v32;
-				*out32++ = v32;
-				*out32++ = v32;
-			}
-
-			/* To save compiled code size, reuse this loop even
-			 * when we run out of prefetching to do by dropping
-			 * ahead32 down.
-			 */
-			if (n32 <= ahead32) {
-				/* Not even a full cache line left,
-				 * so stop now.
-				 */
-				if (n32 < CACHE_LINE_SIZE_IN_WORDS)
-					break;
-
-				/* Choose a small enough value that we don't
-				 * prefetch past the end.  There's no sense
-				 * in touching cache lines we don't have to.
-				 */
-				ahead32 = CACHE_LINE_SIZE_IN_WORDS - 1;
-			}
-		}
-	}
-
-#else /* CHIP_HAS_WH64() */
-
 	/* Determine how many words we need to emit before the 'out32'
 	 * pointer becomes aligned modulo the cache line size.
 	 */
@@ -236,8 +130,6 @@ void *memset(void *s, int c, size_t n)
 		n32 &= CACHE_LINE_SIZE_IN_WORDS - 1;
 	}
 
-#endif /* CHIP_HAS_WH64() */
-
 	/* Now handle any leftover values. */
 	if (n32 != 0) {
 		do {
diff --git a/arch/tile/lib/memset_64.c b/arch/tile/lib/memset_64.c
index 3873085711d5..03ef69cd73de 100644
--- a/arch/tile/lib/memset_64.c
+++ b/arch/tile/lib/memset_64.c
@@ -12,13 +12,11 @@
  *   more details.
  */
 
-#include <arch/chip.h>
-
 #include <linux/types.h>
 #include <linux/string.h>
 #include <linux/module.h>
-
-#undef memset
+#include <arch/chip.h>
+#include "string-endian.h"
 
 void *memset(void *s, int c, size_t n)
 {
@@ -70,8 +68,7 @@ void *memset(void *s, int c, size_t n)
 	n64 = n >> 3;
 
 	/* Tile input byte out to 64 bits. */
-	/* KLUDGE */
-	v64 = 0x0101010101010101ULL * (uint8_t)c;
+	v64 = copy_byte(c);
 
 	/* This must be at least 8 or the following loop doesn't work. */
 #define CACHE_LINE_SIZE_IN_DOUBLEWORDS (CHIP_L2_LINE_SIZE() / 8)
diff --git a/arch/tile/lib/strchr_32.c b/arch/tile/lib/strchr_32.c
index c94e6f7ae7b5..841fe6963019 100644
--- a/arch/tile/lib/strchr_32.c
+++ b/arch/tile/lib/strchr_32.c
@@ -16,8 +16,6 @@
 #include <linux/string.h>
 #include <linux/module.h>
 
-#undef strchr
-
 char *strchr(const char *s, int c)
 {
 	int z, g;
diff --git a/arch/tile/lib/strchr_64.c b/arch/tile/lib/strchr_64.c
index f39f9dc422b0..fe6e31c06f8d 100644
--- a/arch/tile/lib/strchr_64.c
+++ b/arch/tile/lib/strchr_64.c
@@ -26,7 +26,7 @@ char *strchr(const char *s, int c)
 	const uint64_t *p = (const uint64_t *)(s_int & -8);
 
 	/* Create eight copies of the byte for which we are looking. */
-	const uint64_t goal = 0x0101010101010101ULL * (uint8_t) c;
+	const uint64_t goal = copy_byte(c);
 
 	/* Read the first aligned word, but force bytes before the string to
 	 * match neither zero nor goal (we make sure the high bit of each
diff --git a/arch/tile/lib/string-endian.h b/arch/tile/lib/string-endian.h
index c0eed7ce69c3..2e49cbfe9371 100644
--- a/arch/tile/lib/string-endian.h
+++ b/arch/tile/lib/string-endian.h
@@ -1,5 +1,5 @@
 /*
- * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
  *
  *   This program is free software; you can redistribute it and/or
  *   modify it under the terms of the GNU General Public License
@@ -31,3 +31,14 @@
 #define CFZ(x) __insn_clz(x)
 #define REVCZ(x) __insn_ctz(x)
 #endif
+
+/*
+ * Create eight copies of the byte in a uint64_t.  Byte Shuffle uses
+ * the bytes of srcB as the index into the dest vector to select a
+ * byte.  With all indices of zero, the first byte is copied into all
+ * the other bytes.
+ */
+static inline uint64_t copy_byte(uint8_t byte)
+{
+	return __insn_shufflebytes(byte, 0, 0);
+}
diff --git a/arch/tile/lib/strlen_32.c b/arch/tile/lib/strlen_32.c
index 4974292a5534..f26f88e11e4a 100644
--- a/arch/tile/lib/strlen_32.c
+++ b/arch/tile/lib/strlen_32.c
@@ -16,8 +16,6 @@
 #include <linux/string.h>
 #include <linux/module.h>
 
-#undef strlen
-
 size_t strlen(const char *s)
 {
 	/* Get an aligned pointer. */
diff --git a/arch/tile/lib/strnlen_32.c b/arch/tile/lib/strnlen_32.c
new file mode 100644
index 000000000000..1434141d9e01
--- /dev/null
+++ b/arch/tile/lib/strnlen_32.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/module.h>
+
+size_t strnlen(const char *s, size_t count)
+{
+	/* Get an aligned pointer. */
+	const uintptr_t s_int = (uintptr_t) s;
+	const uint32_t *p = (const uint32_t *)(s_int & -4);
+	size_t bytes_read = sizeof(*p) - (s_int & (sizeof(*p) - 1));
+	size_t len;
+	uint32_t v, bits;
+
+	/* Avoid page fault risk by not reading any bytes when count is 0. */
+	if (count == 0)
+		return 0;
+
+	/* Read first word, but force bytes before the string to be nonzero. */
+	v = *p | ((1 << ((s_int << 3) & 31)) - 1);
+
+	while ((bits = __insn_seqb(v, 0)) == 0) {
+		if (bytes_read >= count) {
+			/* Read COUNT bytes and didn't find the terminator. */
+			return count;
+		}
+		v = *++p;
+		bytes_read += sizeof(v);
+	}
+
+	len = ((const char *) p) + (__insn_ctz(bits) >> 3) - s;
+	return (len < count ? len : count);
+}
+EXPORT_SYMBOL(strnlen);
diff --git a/arch/tile/lib/strnlen_64.c b/arch/tile/lib/strnlen_64.c
new file mode 100644
index 000000000000..2e8de6a5136f
--- /dev/null
+++ b/arch/tile/lib/strnlen_64.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include "string-endian.h"
+
+size_t strnlen(const char *s, size_t count)
+{
+	/* Get an aligned pointer. */
+	const uintptr_t s_int = (uintptr_t) s;
+	const uint64_t *p = (const uint64_t *)(s_int & -8);
+	size_t bytes_read = sizeof(*p) - (s_int & (sizeof(*p) - 1));
+	size_t len;
+	uint64_t v, bits;
+
+	/* Avoid page fault risk by not reading any bytes when count is 0. */
+	if (count == 0)
+		return 0;
+
+	/* Read and MASK the first word. */
+	v = *p | MASK(s_int);
+
+	while ((bits = __insn_v1cmpeqi(v, 0)) == 0) {
+		if (bytes_read >= count) {
+			/* Read COUNT bytes and didn't find the terminator. */
+			return count;
+		}
+		v = *++p;
+		bytes_read += sizeof(v);
+	}
+
+	len = ((const char *) p) + (CFZ(bits) >> 3) - s;
+	return (len < count ? len : count);
+}
+EXPORT_SYMBOL(strnlen);
diff --git a/arch/tile/lib/usercopy_32.S b/arch/tile/lib/usercopy_32.S
index b62d002af009..1bc162224638 100644
--- a/arch/tile/lib/usercopy_32.S
+++ b/arch/tile/lib/usercopy_32.S
@@ -36,6 +36,7 @@ strnlen_user_fault:
 	{ move r0, zero; jrp lr }
 	ENDPROC(strnlen_user_fault)
 	.section __ex_table,"a"
+	.align 4
 	.word 1b, strnlen_user_fault
 	.popsection
 
@@ -47,18 +48,20 @@ strnlen_user_fault:
  */
 STD_ENTRY(strncpy_from_user_asm)
 	{ bz r2, 2f; move r3, r0 }
-1:      { lb_u r4, r1; addi r1, r1, 1; addi r2, r2, -1 }
+1:	{ lb_u r4, r1; addi r1, r1, 1; addi r2, r2, -1 }
 	{ sb r0, r4; addi r0, r0, 1 }
-	bz r2, 2f
-	bnzt r4, 1b
-	addi r0, r0, -1   /* don't count the trailing NUL */
-2:      { sub r0, r0, r3; jrp lr }
+	bz r4, 2f
+	bnzt r2, 1b
+	{ sub r0, r0, r3; jrp lr }
+2:	addi r0, r0, -1   /* don't count the trailing NUL */
+	{ sub r0, r0, r3; jrp lr }
 	STD_ENDPROC(strncpy_from_user_asm)
 	.pushsection .fixup,"ax"
 strncpy_from_user_fault:
 	{ movei r0, -EFAULT; jrp lr }
 	ENDPROC(strncpy_from_user_fault)
 	.section __ex_table,"a"
+	.align 4
 	.word 1b, strncpy_from_user_fault
 	.popsection
 
@@ -77,6 +80,7 @@ STD_ENTRY(clear_user_asm)
 	bnzt r1, 1b
 2:      { move r0, r1; jrp lr }
 	.pushsection __ex_table,"a"
+	.align 4
 	.word 1b, 2b
 	.popsection
 
@@ -86,6 +90,7 @@ STD_ENTRY(clear_user_asm)
 2:      { move r0, r1; jrp lr }
 	STD_ENDPROC(clear_user_asm)
 	.pushsection __ex_table,"a"
+	.align 4
 	.word 1b, 2b
 	.popsection
 
@@ -105,25 +110,7 @@ STD_ENTRY(flush_user_asm)
 2:      { move r0, r1; jrp lr }
 	STD_ENDPROC(flush_user_asm)
 	.pushsection __ex_table,"a"
-	.word 1b, 2b
-	.popsection
-
-/*
- * inv_user_asm takes the user target address in r0 and the
- * number of bytes to invalidate in r1.
- * It returns the number of not inv'able bytes (hopefully zero) in r0.
- */
-STD_ENTRY(inv_user_asm)
-	bz r1, 2f
-	{ movei r2, L2_CACHE_BYTES; add r1, r0, r1 }
-	{ sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 }
-	{ and r0, r0, r2; and r1, r1, r2 }
-	{ sub r1, r1, r0 }
-1:      { inv r0; addi r1, r1, -CHIP_INV_STRIDE() }
-	{ addi r0, r0, CHIP_INV_STRIDE(); bnzt r1, 1b }
-2:      { move r0, r1; jrp lr }
-	STD_ENDPROC(inv_user_asm)
-	.pushsection __ex_table,"a"
+	.align 4
 	.word 1b, 2b
 	.popsection
 
@@ -143,5 +130,6 @@ STD_ENTRY(finv_user_asm)
 2:      { move r0, r1; jrp lr }
 	STD_ENDPROC(finv_user_asm)
 	.pushsection __ex_table,"a"
+	.align 4
 	.word 1b, 2b
 	.popsection
diff --git a/arch/tile/lib/usercopy_64.S b/arch/tile/lib/usercopy_64.S
index adb2dbbc70cd..b3b31a3306f8 100644
--- a/arch/tile/lib/usercopy_64.S
+++ b/arch/tile/lib/usercopy_64.S
@@ -36,6 +36,7 @@ strnlen_user_fault:
 	{ move r0, zero; jrp lr }
 	ENDPROC(strnlen_user_fault)
 	.section __ex_table,"a"
+	.align 8
 	.quad 1b, strnlen_user_fault
 	.popsection
 
@@ -47,18 +48,20 @@ strnlen_user_fault:
  */
 STD_ENTRY(strncpy_from_user_asm)
 	{ beqz r2, 2f; move r3, r0 }
-1:      { ld1u r4, r1; addi r1, r1, 1; addi r2, r2, -1 }
+1:	{ ld1u r4, r1; addi r1, r1, 1; addi r2, r2, -1 }
 	{ st1 r0, r4; addi r0, r0, 1 }
-	beqz r2, 2f
-	bnezt r4, 1b
-	addi r0, r0, -1   /* don't count the trailing NUL */
-2:      { sub r0, r0, r3; jrp lr }
+	beqz r4, 2f
+	bnezt r2, 1b
+	{ sub r0, r0, r3; jrp lr }
+2:	addi r0, r0, -1   /* don't count the trailing NUL */
+	{ sub r0, r0, r3; jrp lr }
 	STD_ENDPROC(strncpy_from_user_asm)
 	.pushsection .fixup,"ax"
 strncpy_from_user_fault:
 	{ movei r0, -EFAULT; jrp lr }
 	ENDPROC(strncpy_from_user_fault)
 	.section __ex_table,"a"
+	.align 8
 	.quad 1b, strncpy_from_user_fault
 	.popsection
 
@@ -77,6 +80,7 @@ STD_ENTRY(clear_user_asm)
 	bnezt r1, 1b
 2:      { move r0, r1; jrp lr }
 	.pushsection __ex_table,"a"
+	.align 8
 	.quad 1b, 2b
 	.popsection
 
@@ -86,6 +90,7 @@ STD_ENTRY(clear_user_asm)
 2:      { move r0, r1; jrp lr }
 	STD_ENDPROC(clear_user_asm)
 	.pushsection __ex_table,"a"
+	.align 8
 	.quad 1b, 2b
 	.popsection
 
@@ -105,25 +110,7 @@ STD_ENTRY(flush_user_asm)
 2:      { move r0, r1; jrp lr }
 	STD_ENDPROC(flush_user_asm)
 	.pushsection __ex_table,"a"
-	.quad 1b, 2b
-	.popsection
-
-/*
- * inv_user_asm takes the user target address in r0 and the
- * number of bytes to invalidate in r1.
- * It returns the number of not inv'able bytes (hopefully zero) in r0.
- */
-STD_ENTRY(inv_user_asm)
-	beqz r1, 2f
-	{ movei r2, L2_CACHE_BYTES; add r1, r0, r1 }
-	{ sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 }
-	{ and r0, r0, r2; and r1, r1, r2 }
-	{ sub r1, r1, r0 }
-1:      { inv r0; addi r1, r1, -CHIP_INV_STRIDE() }
-	{ addi r0, r0, CHIP_INV_STRIDE(); bnezt r1, 1b }
-2:      { move r0, r1; jrp lr }
-	STD_ENDPROC(inv_user_asm)
-	.pushsection __ex_table,"a"
+	.align 8
 	.quad 1b, 2b
 	.popsection
 
@@ -143,5 +130,6 @@ STD_ENTRY(finv_user_asm)
 2:      { move r0, r1; jrp lr }
 	STD_ENDPROC(finv_user_asm)
 	.pushsection __ex_table,"a"
+	.align 8
 	.quad 1b, 2b
 	.popsection
diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c
index 743c951c61b0..23f044e8a7ab 100644
--- a/arch/tile/mm/elf.c
+++ b/arch/tile/mm/elf.c
@@ -21,7 +21,8 @@
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/sections.h>
-#include <arch/sim_def.h>
+#include <asm/vdso.h>
+#include <arch/sim.h>
 
 /* Notify a running simulator, if any, that an exec just occurred. */
 static void sim_notify_exec(const char *binary_name)
@@ -38,21 +39,55 @@ static void sim_notify_exec(const char *binary_name)
 
 static int notify_exec(struct mm_struct *mm)
 {
-	int retval = 0;  /* failure */
-
-	if (mm->exe_file) {
-		char *buf = (char *) __get_free_page(GFP_KERNEL);
-		if (buf) {
-			char *path = d_path(&mm->exe_file->f_path,
-					    buf, PAGE_SIZE);
-			if (!IS_ERR(path)) {
-				sim_notify_exec(path);
-				retval = 1;
-			}
-			free_page((unsigned long)buf);
+	char *buf, *path;
+	struct vm_area_struct *vma;
+
+	if (!sim_is_simulator())
+		return 1;
+
+	if (mm->exe_file == NULL)
+		return 0;
+
+	for (vma = current->mm->mmap; ; vma = vma->vm_next) {
+		if (vma == NULL)
+			return 0;
+		if (vma->vm_file == mm->exe_file)
+			break;
+	}
+
+	buf = (char *) __get_free_page(GFP_KERNEL);
+	if (buf == NULL)
+		return 0;
+
+	path = d_path(&mm->exe_file->f_path, buf, PAGE_SIZE);
+	if (IS_ERR(path)) {
+		free_page((unsigned long)buf);
+		return 0;
+	}
+
+	/*
+	 * Notify simulator of an ET_DYN object so we know the load address.
+	 * The somewhat cryptic overuse of SIM_CONTROL_DLOPEN allows us
+	 * to be backward-compatible with older simulator releases.
+	 */
+	if (vma->vm_start == (ELF_ET_DYN_BASE & PAGE_MASK)) {
+		char buf[64];
+		int i;
+
+		snprintf(buf, sizeof(buf), "0x%lx:@", vma->vm_start);
+		for (i = 0; ; ++i) {
+			char c = buf[i];
+			__insn_mtspr(SPR_SIM_CONTROL,
+				     (SIM_CONTROL_DLOPEN
+				      | (c << _SIM_CONTROL_OPERATOR_BITS)));
+			if (c == '\0')
+				break;
 		}
 	}
-	return retval;
+
+	sim_notify_exec(path);
+	free_page((unsigned long)buf);
+	return 1;
 }
 
 /* Notify a running simulator, if any, that we loaded an interpreter. */
@@ -68,37 +103,10 @@ static void sim_notify_interp(unsigned long load_addr)
 }
 
 
-/* Kernel address of page used to map read-only kernel data into userspace. */
-static void *vdso_page;
-
-/* One-entry array used for install_special_mapping. */
-static struct page *vdso_pages[1];
-
-static int __init vdso_setup(void)
-{
-	vdso_page = (void *)get_zeroed_page(GFP_ATOMIC);
-	memcpy(vdso_page, __rt_sigreturn, __rt_sigreturn_end - __rt_sigreturn);
-	vdso_pages[0] = virt_to_page(vdso_page);
-	return 0;
-}
-device_initcall(vdso_setup);
-
-const char *arch_vma_name(struct vm_area_struct *vma)
-{
-	if (vma->vm_private_data == vdso_pages)
-		return "[vdso]";
-#ifndef __tilegx__
-	if (vma->vm_start == MEM_USER_INTRPT)
-		return "[intrpt]";
-#endif
-	return NULL;
-}
-
 int arch_setup_additional_pages(struct linux_binprm *bprm,
 				int executable_stack)
 {
 	struct mm_struct *mm = current->mm;
-	unsigned long vdso_base;
 	int retval = 0;
 
 	down_write(&mm->mmap_sem);
@@ -111,14 +119,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm,
 	if (!notify_exec(mm))
 		sim_notify_exec(bprm->filename);
 
-	/*
-	 * MAYWRITE to allow gdb to COW and set breakpoints
-	 */
-	vdso_base = VDSO_BASE;
-	retval = install_special_mapping(mm, vdso_base, PAGE_SIZE,
-					 VM_READ|VM_EXEC|
-					 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
-					 vdso_pages);
+	retval = setup_vdso_pages();
 
 #ifndef __tilegx__
 	/*
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
index f7f99f90cbe0..111d5a9b76f1 100644
--- a/arch/tile/mm/fault.c
+++ b/arch/tile/mm/fault.c
@@ -34,6 +34,7 @@
 #include <linux/hugetlb.h>
 #include <linux/syscalls.h>
 #include <linux/uaccess.h>
+#include <linux/kdebug.h>
 
 #include <asm/pgalloc.h>
 #include <asm/sections.h>
@@ -122,10 +123,9 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
 	pmd_k = pmd_offset(pud_k, address);
 	if (!pmd_present(*pmd_k))
 		return NULL;
-	if (!pmd_present(*pmd)) {
+	if (!pmd_present(*pmd))
 		set_pmd(pmd, *pmd_k);
-		arch_flush_lazy_mmu_mode();
-	} else
+	else
 		BUG_ON(pmd_ptfn(*pmd) != pmd_ptfn(*pmd_k));
 	return pmd_k;
 }
@@ -283,7 +283,7 @@ static int handle_page_fault(struct pt_regs *regs,
 	flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
 		 (write ? FAULT_FLAG_WRITE : 0));
 
-	is_kernel_mode = (EX1_PL(regs->ex1) != USER_PL);
+	is_kernel_mode = !user_mode(regs);
 
 	tsk = validate_current();
 
@@ -466,28 +466,15 @@ good_area:
 		}
 	}
 
-#if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC()
-	/*
-	 * If this was an asynchronous fault,
-	 * restart the appropriate engine.
-	 */
-	switch (fault_num) {
 #if CHIP_HAS_TILE_DMA()
+	/* If this was a DMA TLB fault, restart the DMA engine. */
+	switch (fault_num) {
 	case INT_DMATLB_MISS:
 	case INT_DMATLB_MISS_DWNCL:
 	case INT_DMATLB_ACCESS:
 	case INT_DMATLB_ACCESS_DWNCL:
 		__insn_mtspr(SPR_DMA_CTR, SPR_DMA_CTR__REQUEST_MASK);
 		break;
-#endif
-#if CHIP_HAS_SN_PROC()
-	case INT_SNITLB_MISS:
-	case INT_SNITLB_MISS_DWNCL:
-		__insn_mtspr(SPR_SNCTL,
-			     __insn_mfspr(SPR_SNCTL) &
-			     ~SPR_SNCTL__FRZPROC_MASK);
-		break;
-#endif
 	}
 #endif
 
@@ -722,8 +709,60 @@ void do_page_fault(struct pt_regs *regs, int fault_num,
 {
 	int is_page_fault;
 
+#ifdef CONFIG_KPROBES
+	/*
+	 * This is to notify the fault handler of the kprobes.  The
+	 * exception code is redundant as it is also carried in REGS,
+	 * but we pass it anyhow.
+	 */
+	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, -1,
+		       regs->faultnum, SIGSEGV) == NOTIFY_STOP)
+		return;
+#endif
+
+#ifdef __tilegx__
+	/*
+	 * We don't need early do_page_fault_ics() support, since unlike
+	 * Pro we don't need to worry about unlocking the atomic locks.
+	 * There is only one current case in GX where we touch any memory
+	 * under ICS other than our own kernel stack, and we handle that
+	 * here.  (If we crash due to trying to touch our own stack,
+	 * we're in too much trouble for C code to help out anyway.)
+	 */
+	if (write & ~1) {
+		unsigned long pc = write & ~1;
+		if (pc >= (unsigned long) __start_unalign_asm_code &&
+		    pc < (unsigned long) __end_unalign_asm_code) {
+			struct thread_info *ti = current_thread_info();
+			/*
+			 * Our EX_CONTEXT is still what it was from the
+			 * initial unalign exception, but now we've faulted
+			 * on the JIT page.  We would like to complete the
+			 * page fault however is appropriate, and then retry
+			 * the instruction that caused the unalign exception.
+			 * Our state has been "corrupted" by setting the low
+			 * bit in "sp", and stashing r0..r3 in the
+			 * thread_info area, so we revert all of that, then
+			 * continue as if this were a normal page fault.
+			 */
+			regs->sp &= ~1UL;
+			regs->regs[0] = ti->unalign_jit_tmp[0];
+			regs->regs[1] = ti->unalign_jit_tmp[1];
+			regs->regs[2] = ti->unalign_jit_tmp[2];
+			regs->regs[3] = ti->unalign_jit_tmp[3];
+			write &= 1;
+		} else {
+			pr_alert("%s/%d: ICS set at page fault at %#lx: %#lx\n",
+				 current->comm, current->pid, pc, address);
+			show_regs(regs);
+			do_group_exit(SIGKILL);
+			return;
+		}
+	}
+#else
 	/* This case should have been handled by do_page_fault_ics(). */
 	BUG_ON(write & ~1);
+#endif
 
 #if CHIP_HAS_TILE_DMA()
 	/*
@@ -752,10 +791,6 @@ void do_page_fault(struct pt_regs *regs, int fault_num,
 	case INT_DMATLB_MISS:
 	case INT_DMATLB_MISS_DWNCL:
 #endif
-#if CHIP_HAS_SN_PROC()
-	case INT_SNITLB_MISS:
-	case INT_SNITLB_MISS_DWNCL:
-#endif
 		is_page_fault = 1;
 		break;
 
@@ -771,8 +806,8 @@ void do_page_fault(struct pt_regs *regs, int fault_num,
 		panic("Bad fault number %d in do_page_fault", fault_num);
 	}
 
-#if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC()
-	if (EX1_PL(regs->ex1) != USER_PL) {
+#if CHIP_HAS_TILE_DMA()
+	if (!user_mode(regs)) {
 		struct async_tlb *async;
 		switch (fault_num) {
 #if CHIP_HAS_TILE_DMA()
@@ -783,12 +818,6 @@ void do_page_fault(struct pt_regs *regs, int fault_num,
 			async = &current->thread.dma_async_tlb;
 			break;
 #endif
-#if CHIP_HAS_SN_PROC()
-		case INT_SNITLB_MISS:
-		case INT_SNITLB_MISS_DWNCL:
-			async = &current->thread.sn_async_tlb;
-			break;
-#endif
 		default:
 			async = NULL;
 		}
@@ -821,14 +850,22 @@ void do_page_fault(struct pt_regs *regs, int fault_num,
 }
 
 
-#if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC()
+#if CHIP_HAS_TILE_DMA()
 /*
- * Check an async_tlb structure to see if a deferred fault is waiting,
- * and if so pass it to the page-fault code.
+ * This routine effectively re-issues asynchronous page faults
+ * when we are returning to user space.
  */
-static void handle_async_page_fault(struct pt_regs *regs,
-				    struct async_tlb *async)
+void do_async_page_fault(struct pt_regs *regs)
 {
+	struct async_tlb *async = &current->thread.dma_async_tlb;
+
+	/*
+	 * Clear thread flag early.  If we re-interrupt while processing
+	 * code here, we will reset it and recall this routine before
+	 * returning to user space.
+	 */
+	clear_thread_flag(TIF_ASYNC_TLB);
+
 	if (async->fault_num) {
 		/*
 		 * Clear async->fault_num before calling the page-fault
@@ -842,35 +879,15 @@ static void handle_async_page_fault(struct pt_regs *regs,
 				  async->address, async->is_write);
 	}
 }
-
-/*
- * This routine effectively re-issues asynchronous page faults
- * when we are returning to user space.
- */
-void do_async_page_fault(struct pt_regs *regs)
-{
-	/*
-	 * Clear thread flag early.  If we re-interrupt while processing
-	 * code here, we will reset it and recall this routine before
-	 * returning to user space.
-	 */
-	clear_thread_flag(TIF_ASYNC_TLB);
-
-#if CHIP_HAS_TILE_DMA()
-	handle_async_page_fault(regs, &current->thread.dma_async_tlb);
-#endif
-#if CHIP_HAS_SN_PROC()
-	handle_async_page_fault(regs, &current->thread.sn_async_tlb);
-#endif
-}
-#endif /* CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC() */
+#endif /* CHIP_HAS_TILE_DMA() */
 
 
 void vmalloc_sync_all(void)
 {
 #ifdef __tilegx__
 	/* Currently all L1 kernel pmd's are static and shared. */
-	BUG_ON(pgd_index(VMALLOC_END) != pgd_index(VMALLOC_START));
+	BUILD_BUG_ON(pgd_index(VMALLOC_END - PAGE_SIZE) !=
+		     pgd_index(VMALLOC_START));
 #else
 	/*
 	 * Note that races in the updates of insync and start aren't
diff --git a/arch/tile/mm/highmem.c b/arch/tile/mm/highmem.c
index 347d123b14be..0dc218294770 100644
--- a/arch/tile/mm/highmem.c
+++ b/arch/tile/mm/highmem.c
@@ -114,7 +114,6 @@ static void kmap_atomic_register(struct page *page, int type,
 
 	list_add(&amp->list, &amp_list);
 	set_pte(ptep, pteval);
-	arch_flush_lazy_mmu_mode();
 
 	spin_unlock(&amp_lock);
 	homecache_kpte_unlock(flags);
@@ -259,7 +258,6 @@ void __kunmap_atomic(void *kvaddr)
 		BUG_ON(vaddr >= (unsigned long)high_memory);
 	}
 
-	arch_flush_lazy_mmu_mode();
 	pagefault_enable();
 }
 EXPORT_SYMBOL(__kunmap_atomic);
diff --git a/arch/tile/mm/homecache.c b/arch/tile/mm/homecache.c
index 1ae911939a18..004ba568d93f 100644
--- a/arch/tile/mm/homecache.c
+++ b/arch/tile/mm/homecache.c
@@ -43,12 +43,9 @@
 #include "migrate.h"
 
 
-#if CHIP_HAS_COHERENT_LOCAL_CACHE()
-
 /*
  * The noallocl2 option suppresses all use of the L2 cache to cache
- * locally from a remote home.  There's no point in using it if we
- * don't have coherent local caching, though.
+ * locally from a remote home.
  */
 static int __write_once noallocl2;
 static int __init set_noallocl2(char *str)
@@ -58,12 +55,6 @@ static int __init set_noallocl2(char *str)
 }
 early_param("noallocl2", set_noallocl2);
 
-#else
-
-#define noallocl2 0
-
-#endif
-
 
 /*
  * Update the irq_stat for cpus that we are going to interrupt
@@ -172,7 +163,8 @@ void flush_remote(unsigned long cache_pfn, unsigned long cache_control,
 
 static void homecache_finv_page_va(void* va, int home)
 {
-	if (home == smp_processor_id()) {
+	int cpu = get_cpu();
+	if (home == cpu) {
 		finv_buffer_local(va, PAGE_SIZE);
 	} else if (home == PAGE_HOME_HASH) {
 		finv_buffer_remote(va, PAGE_SIZE, 1);
@@ -180,6 +172,7 @@ static void homecache_finv_page_va(void* va, int home)
 		BUG_ON(home < 0 || home >= NR_CPUS);
 		finv_buffer_remote(va, PAGE_SIZE, 0);
 	}
+	put_cpu();
 }
 
 void homecache_finv_map_page(struct page *page, int home)
@@ -198,7 +191,7 @@ void homecache_finv_map_page(struct page *page, int home)
 #else
 	va = __fix_to_virt(FIX_HOMECACHE_BEGIN + smp_processor_id());
 #endif
-	ptep = virt_to_pte(NULL, (unsigned long)va);
+	ptep = virt_to_kpte(va);
 	pte = pfn_pte(page_to_pfn(page), PAGE_KERNEL);
 	__set_pte(ptep, pte_set_home(pte, home));
 	homecache_finv_page_va((void *)va, home);
@@ -263,10 +256,8 @@ static int pte_to_home(pte_t pte)
 		return PAGE_HOME_INCOHERENT;
 	case HV_PTE_MODE_UNCACHED:
 		return PAGE_HOME_UNCACHED;
-#if CHIP_HAS_CBOX_HOME_MAP()
 	case HV_PTE_MODE_CACHE_HASH_L3:
 		return PAGE_HOME_HASH;
-#endif
 	}
 	panic("Bad PTE %#llx\n", pte.val);
 }
@@ -323,20 +314,16 @@ pte_t pte_set_home(pte_t pte, int home)
 						      HV_PTE_MODE_CACHE_NO_L3);
 			}
 		} else
-#if CHIP_HAS_CBOX_HOME_MAP()
 		if (hash_default)
 			pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_HASH_L3);
 		else
-#endif
 			pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_NO_L3);
 		pte = hv_pte_set_nc(pte);
 		break;
 
-#if CHIP_HAS_CBOX_HOME_MAP()
 	case PAGE_HOME_HASH:
 		pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_HASH_L3);
 		break;
-#endif
 
 	default:
 		BUG_ON(home < 0 || home >= NR_CPUS ||
@@ -346,7 +333,6 @@ pte_t pte_set_home(pte_t pte, int home)
 		break;
 	}
 
-#if CHIP_HAS_NC_AND_NOALLOC_BITS()
 	if (noallocl2)
 		pte = hv_pte_set_no_alloc_l2(pte);
 
@@ -355,7 +341,6 @@ pte_t pte_set_home(pte_t pte, int home)
 	    hv_pte_get_mode(pte) == HV_PTE_MODE_CACHE_NO_L3) {
 		pte = hv_pte_set_mode(pte, HV_PTE_MODE_UNCACHED);
 	}
-#endif
 
 	/* Checking this case here gives a better panic than from the hv. */
 	BUG_ON(hv_pte_get_mode(pte) == 0);
@@ -371,19 +356,13 @@ EXPORT_SYMBOL(pte_set_home);
  * so they're not suitable for anything but infrequent use.
  */
 
-#if CHIP_HAS_CBOX_HOME_MAP()
-static inline int initial_page_home(void) { return PAGE_HOME_HASH; }
-#else
-static inline int initial_page_home(void) { return 0; }
-#endif
-
 int page_home(struct page *page)
 {
 	if (PageHighMem(page)) {
-		return initial_page_home();
+		return PAGE_HOME_HASH;
 	} else {
 		unsigned long kva = (unsigned long)page_address(page);
-		return pte_to_home(*virt_to_pte(NULL, kva));
+		return pte_to_home(*virt_to_kpte(kva));
 	}
 }
 EXPORT_SYMBOL(page_home);
@@ -402,7 +381,7 @@ void homecache_change_page_home(struct page *page, int order, int home)
 		     NULL, 0);
 
 	for (i = 0; i < pages; ++i, kva += PAGE_SIZE) {
-		pte_t *ptep = virt_to_pte(NULL, kva);
+		pte_t *ptep = virt_to_kpte(kva);
 		pte_t pteval = *ptep;
 		BUG_ON(!pte_present(pteval) || pte_huge(pteval));
 		__set_pte(ptep, pte_set_home(pteval, home));
@@ -436,7 +415,7 @@ struct page *homecache_alloc_pages_node(int nid, gfp_t gfp_mask,
 void __homecache_free_pages(struct page *page, unsigned int order)
 {
 	if (put_page_testzero(page)) {
-		homecache_change_page_home(page, order, initial_page_home());
+		homecache_change_page_home(page, order, PAGE_HOME_HASH);
 		if (order == 0) {
 			free_hot_cold_page(page, 0);
 		} else {
diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c
index 650ccff8378c..e514899e1100 100644
--- a/arch/tile/mm/hugetlbpage.c
+++ b/arch/tile/mm/hugetlbpage.c
@@ -49,38 +49,6 @@ int huge_shift[HUGE_SHIFT_ENTRIES] = {
 #endif
 };
 
-/*
- * This routine is a hybrid of pte_alloc_map() and pte_alloc_kernel().
- * It assumes that L2 PTEs are never in HIGHMEM (we don't support that).
- * It locks the user pagetable, and bumps up the mm->nr_ptes field,
- * but otherwise allocate the page table using the kernel versions.
- */
-static pte_t *pte_alloc_hugetlb(struct mm_struct *mm, pmd_t *pmd,
-				unsigned long address)
-{
-	pte_t *new;
-
-	if (pmd_none(*pmd)) {
-		new = pte_alloc_one_kernel(mm, address);
-		if (!new)
-			return NULL;
-
-		smp_wmb(); /* See comment in __pte_alloc */
-
-		spin_lock(&mm->page_table_lock);
-		if (likely(pmd_none(*pmd))) {  /* Has another populated it ? */
-			mm->nr_ptes++;
-			pmd_populate_kernel(mm, pmd, new);
-			new = NULL;
-		} else
-			VM_BUG_ON(pmd_trans_splitting(*pmd));
-		spin_unlock(&mm->page_table_lock);
-		if (new)
-			pte_free_kernel(mm, new);
-	}
-
-	return pte_offset_kernel(pmd, address);
-}
 #endif
 
 pte_t *huge_pte_alloc(struct mm_struct *mm,
@@ -109,7 +77,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
 		else {
 			if (sz != PAGE_SIZE << huge_shift[HUGE_SHIFT_PAGE])
 				panic("Unexpected page size %#lx\n", sz);
-			return pte_alloc_hugetlb(mm, pmd, addr);
+			return pte_alloc_map(mm, NULL, pmd, addr);
 		}
 	}
 #else
@@ -144,14 +112,14 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 
 	/* Get the top-level page table entry. */
 	pgd = (pgd_t *)get_pte((pte_t *)mm->pgd, pgd_index(addr), 0);
-	if (!pgd_present(*pgd))
-		return NULL;
 
 	/* We don't have four levels. */
 	pud = pud_offset(pgd, addr);
 #ifndef __PAGETABLE_PUD_FOLDED
 # error support fourth page table level
 #endif
+	if (!pud_present(*pud))
+		return NULL;
 
 	/* Check for an L0 huge PTE, if we have three levels. */
 #ifndef __PAGETABLE_PMD_FOLDED
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
index e182958c707d..4e316deb92fd 100644
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
@@ -106,10 +106,8 @@ pte_t *get_prealloc_pte(unsigned long pfn)
  */
 static int initial_heap_home(void)
 {
-#if CHIP_HAS_CBOX_HOME_MAP()
 	if (hash_default)
 		return PAGE_HOME_HASH;
-#endif
 	return smp_processor_id();
 }
 
@@ -190,14 +188,11 @@ static void __init page_table_range_init(unsigned long start,
 }
 
 
-#if CHIP_HAS_CBOX_HOME_MAP()
-
 static int __initdata ktext_hash = 1;  /* .text pages */
 static int __initdata kdata_hash = 1;  /* .data and .bss pages */
 int __write_once hash_default = 1;     /* kernel allocator pages */
 EXPORT_SYMBOL(hash_default);
 int __write_once kstack_hash = 1;      /* if no homecaching, use h4h */
-#endif /* CHIP_HAS_CBOX_HOME_MAP */
 
 /*
  * CPUs to use to for striping the pages of kernel data.  If hash-for-home
@@ -215,14 +210,12 @@ int __write_once kdata_huge;       /* if no homecaching, small pages */
 static pgprot_t __init construct_pgprot(pgprot_t prot, int home)
 {
 	prot = pte_set_home(prot, home);
-#if CHIP_HAS_CBOX_HOME_MAP()
 	if (home == PAGE_HOME_IMMUTABLE) {
 		if (ktext_hash)
 			prot = hv_pte_set_mode(prot, HV_PTE_MODE_CACHE_HASH_L3);
 		else
 			prot = hv_pte_set_mode(prot, HV_PTE_MODE_CACHE_NO_L3);
 	}
-#endif
 	return prot;
 }
 
@@ -234,22 +227,17 @@ static pgprot_t __init init_pgprot(ulong address)
 {
 	int cpu;
 	unsigned long page;
-	enum { CODE_DELTA = MEM_SV_INTRPT - PAGE_OFFSET };
+	enum { CODE_DELTA = MEM_SV_START - PAGE_OFFSET };
 
-#if CHIP_HAS_CBOX_HOME_MAP()
 	/* For kdata=huge, everything is just hash-for-home. */
 	if (kdata_huge)
 		return construct_pgprot(PAGE_KERNEL, PAGE_HOME_HASH);
-#endif
 
 	/* We map the aliased pages of permanent text inaccessible. */
 	if (address < (ulong) _sinittext - CODE_DELTA)
 		return PAGE_NONE;
 
-	/*
-	 * We map read-only data non-coherent for performance.  We could
-	 * use neighborhood caching on TILE64, but it's not clear it's a win.
-	 */
+	/* We map read-only data non-coherent for performance. */
 	if ((address >= (ulong) __start_rodata &&
 	     address < (ulong) __end_rodata) ||
 	    address == (ulong) empty_zero_page) {
@@ -257,12 +245,10 @@ static pgprot_t __init init_pgprot(ulong address)
 	}
 
 #ifndef __tilegx__
-#if !ATOMIC_LOCKS_FOUND_VIA_TABLE()
 	/* Force the atomic_locks[] array page to be hash-for-home. */
 	if (address == (ulong) atomic_locks)
 		return construct_pgprot(PAGE_KERNEL, PAGE_HOME_HASH);
 #endif
-#endif
 
 	/*
 	 * Everything else that isn't data or bss is heap, so mark it
@@ -280,19 +266,9 @@ static pgprot_t __init init_pgprot(ulong address)
 	if (address >= (ulong) _end || address < (ulong) _einitdata)
 		return construct_pgprot(PAGE_KERNEL, initial_heap_home());
 
-#if CHIP_HAS_CBOX_HOME_MAP()
 	/* Use hash-for-home if requested for data/bss. */
 	if (kdata_hash)
 		return construct_pgprot(PAGE_KERNEL, PAGE_HOME_HASH);
-#endif
-
-	/*
-	 * Make the w1data homed like heap to start with, to avoid
-	 * making it part of the page-striped data area when we're just
-	 * going to convert it to read-only soon anyway.
-	 */
-	if (address >= (ulong)__w1data_begin && address < (ulong)__w1data_end)
-		return construct_pgprot(PAGE_KERNEL, initial_heap_home());
 
 	/*
 	 * Otherwise we just hand out consecutive cpus.  To avoid
@@ -301,7 +277,7 @@ static pgprot_t __init init_pgprot(ulong address)
 	 * the requested address, while walking cpu home around kdata_mask.
 	 * This is typically no more than a dozen or so iterations.
 	 */
-	page = (((ulong)__w1data_end) + PAGE_SIZE - 1) & PAGE_MASK;
+	page = (((ulong)__end_rodata) + PAGE_SIZE - 1) & PAGE_MASK;
 	BUG_ON(address < page || address >= (ulong)_end);
 	cpu = cpumask_first(&kdata_mask);
 	for (; page < address; page += PAGE_SIZE) {
@@ -311,11 +287,9 @@ static pgprot_t __init init_pgprot(ulong address)
 		if (page == (ulong)empty_zero_page)
 			continue;
 #ifndef __tilegx__
-#if !ATOMIC_LOCKS_FOUND_VIA_TABLE()
 		if (page == (ulong)atomic_locks)
 			continue;
 #endif
-#endif
 		cpu = cpumask_next(cpu, &kdata_mask);
 		if (cpu == NR_CPUS)
 			cpu = cpumask_first(&kdata_mask);
@@ -358,7 +332,7 @@ static int __init setup_ktext(char *str)
 
 	ktext_arg_seen = 1;
 
-	/* Default setting on Tile64: use a huge page */
+	/* Default setting: use a huge page */
 	if (strcmp(str, "huge") == 0)
 		pr_info("ktext: using one huge locally cached page\n");
 
@@ -404,10 +378,8 @@ static inline pgprot_t ktext_set_nocache(pgprot_t prot)
 {
 	if (!ktext_nocache)
 		prot = hv_pte_set_nc(prot);
-#if CHIP_HAS_NC_AND_NOALLOC_BITS()
 	else
 		prot = hv_pte_set_no_alloc_l2(prot);
-#endif
 	return prot;
 }
 
@@ -440,7 +412,6 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
 	struct cpumask kstripe_mask;
 	int rc, i;
 
-#if CHIP_HAS_CBOX_HOME_MAP()
 	if (ktext_arg_seen && ktext_hash) {
 		pr_warning("warning: \"ktext\" boot argument ignored"
 			   " if \"kcache_hash\" sets up text hash-for-home\n");
@@ -457,7 +428,6 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
 			  " kcache_hash=all or =allbutstack\n");
 		kdata_huge = 0;
 	}
-#endif
 
 	/*
 	 * Set up a mask for cpus to use for kernel striping.
@@ -538,7 +508,7 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
 		}
 	}
 
-	address = MEM_SV_INTRPT;
+	address = MEM_SV_START;
 	pmd = get_pmd(pgtables, address);
 	pfn = 0;  /* code starts at PA 0 */
 	if (ktext_small) {
@@ -585,13 +555,11 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
 	} else {
 		pte_t pteval = pfn_pte(0, PAGE_KERNEL_EXEC);
 		pteval = pte_mkhuge(pteval);
-#if CHIP_HAS_CBOX_HOME_MAP()
 		if (ktext_hash) {
 			pteval = hv_pte_set_mode(pteval,
 						 HV_PTE_MODE_CACHE_HASH_L3);
 			pteval = ktext_set_nocache(pteval);
 		} else
-#endif /* CHIP_HAS_CBOX_HOME_MAP() */
 		if (cpumask_weight(&ktext_mask) == 1) {
 			pteval = set_remote_cache_cpu(pteval,
 					      cpumask_first(&ktext_mask));
@@ -777,10 +745,7 @@ void __init paging_init(void)
 
 	kernel_physical_mapping_init(pgd_base);
 
-	/*
-	 * Fixed mappings, only the page table structure has to be
-	 * created - mappings will be set by set_fixmap():
-	 */
+	/* Fixed mappings, only the page table structure has to be created. */
 	page_table_range_init(fix_to_virt(__end_of_fixed_addresses - 1),
 			      FIXADDR_TOP, pgd_base);
 
@@ -941,26 +906,6 @@ void __init pgtable_cache_init(void)
 		panic("pgtable_cache_init(): Cannot create pgd cache");
 }
 
-#if !CHIP_HAS_COHERENT_LOCAL_CACHE()
-/*
- * The __w1data area holds data that is only written during initialization,
- * and is read-only and thus freely cacheable thereafter.  Fix the page
- * table entries that cover that region accordingly.
- */
-static void mark_w1data_ro(void)
-{
-	/* Loop over page table entries */
-	unsigned long addr = (unsigned long)__w1data_begin;
-	BUG_ON((addr & (PAGE_SIZE-1)) != 0);
-	for (; addr <= (unsigned long)__w1data_end - 1; addr += PAGE_SIZE) {
-		unsigned long pfn = kaddr_to_pfn((void *)addr);
-		pte_t *ptep = virt_to_pte(NULL, addr);
-		BUG_ON(pte_huge(*ptep));   /* not relevant for kdata_huge */
-		set_pte_at(&init_mm, addr, ptep, pfn_pte(pfn, PAGE_KERNEL_RO));
-	}
-}
-#endif
-
 #ifdef CONFIG_DEBUG_PAGEALLOC
 static long __write_once initfree;
 #else
@@ -1000,7 +945,7 @@ static void free_init_pages(char *what, unsigned long begin, unsigned long end)
 		 */
 		int pfn = kaddr_to_pfn((void *)addr);
 		struct page *page = pfn_to_page(pfn);
-		pte_t *ptep = virt_to_pte(NULL, addr);
+		pte_t *ptep = virt_to_kpte(addr);
 		if (!initfree) {
 			/*
 			 * If debugging page accesses then do not free
@@ -1024,15 +969,11 @@ static void free_init_pages(char *what, unsigned long begin, unsigned long end)
 
 void free_initmem(void)
 {
-	const unsigned long text_delta = MEM_SV_INTRPT - PAGE_OFFSET;
+	const unsigned long text_delta = MEM_SV_START - PAGE_OFFSET;
 
 	/*
-	 * Evict the dirty initdata on the boot cpu, evict the w1data
-	 * wherever it's homed, and evict all the init code everywhere.
-	 * We are guaranteed that no one will touch the init pages any
-	 * more, and although other cpus may be touching the w1data,
-	 * we only actually change the caching on tile64, which won't
-	 * be keeping local copies in the other tiles' caches anyway.
+	 * Evict the cache on all cores to avoid incoherence.
+	 * We are guaranteed that no one will touch the init pages any more.
 	 */
 	homecache_evict(&cpu_cacheable_map);
 
@@ -1043,26 +984,11 @@ void free_initmem(void)
 
 	/*
 	 * Free the pages mapped from 0xc0000000 that correspond to code
-	 * pages from MEM_SV_INTRPT that we won't use again after init.
+	 * pages from MEM_SV_START that we won't use again after init.
 	 */
 	free_init_pages("unused kernel text",
 			(unsigned long)_sinittext - text_delta,
 			(unsigned long)_einittext - text_delta);
-
-#if !CHIP_HAS_COHERENT_LOCAL_CACHE()
-	/*
-	 * Upgrade the .w1data section to globally cached.
-	 * We don't do this on tilepro, since the cache architecture
-	 * pretty much makes it irrelevant, and in any case we end
-	 * up having racing issues with other tiles that may touch
-	 * the data after we flush the cache but before we update
-	 * the PTEs and flush the TLBs, causing sharer shootdowns
-	 * later.  Even though this is to clean data, it seems like
-	 * an unnecessary complication.
-	 */
-	mark_w1data_ro();
-#endif
-
 	/* Do a global TLB flush so everyone sees the changes. */
 	flush_tlb_all();
 }
diff --git a/arch/tile/mm/migrate_32.S b/arch/tile/mm/migrate_32.S
index 5305814bf187..772085491bf9 100644
--- a/arch/tile/mm/migrate_32.S
+++ b/arch/tile/mm/migrate_32.S
@@ -136,7 +136,7 @@ STD_ENTRY(flush_and_install_context)
 	 move r8, zero  /* asids */
 	 move r9, zero  /* asidcount */
 	}
-	jal hv_flush_remote
+	jal _hv_flush_remote
 	bnz r0, .Ldone
 
 	/* Now install the new page table. */
@@ -152,7 +152,7 @@ STD_ENTRY(flush_and_install_context)
 	 move r4, r_asid
 	 moveli r5, HV_CTX_DIRECTIO | CTX_PAGE_FLAG
 	}
-	jal hv_install_context
+	jal _hv_install_context
 	bnz r0, .Ldone
 
 	/* Finally, flush the TLB. */
diff --git a/arch/tile/mm/migrate_64.S b/arch/tile/mm/migrate_64.S
index 1d15b10833d1..a49eee38f872 100644
--- a/arch/tile/mm/migrate_64.S
+++ b/arch/tile/mm/migrate_64.S
@@ -123,7 +123,7 @@ STD_ENTRY(flush_and_install_context)
 	}
 	{
 	 move r8, zero  /* asidcount */
-	 jal hv_flush_remote
+	 jal _hv_flush_remote
 	}
 	bnez r0, 1f
 
@@ -136,7 +136,7 @@ STD_ENTRY(flush_and_install_context)
 	 move r2, r_asid
 	 moveli r3, HV_CTX_DIRECTIO | CTX_PAGE_FLAG
 	}
-	jal hv_install_context
+	jal _hv_install_context
 	bnez r0, 1f
 
 	/* Finally, flush the TLB. */
diff --git a/arch/tile/mm/mmap.c b/arch/tile/mm/mmap.c
index d67d91ebf63e..851a94e6ae58 100644
--- a/arch/tile/mm/mmap.c
+++ b/arch/tile/mm/mmap.c
@@ -58,16 +58,36 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 #else
 	int is_32bit = 0;
 #endif
+	unsigned long random_factor = 0UL;
+
+	/*
+	 *  8 bits of randomness in 32bit mmaps, 24 address space bits
+	 * 12 bits of randomness in 64bit mmaps, 28 address space bits
+	 */
+	if (current->flags & PF_RANDOMIZE) {
+		if (is_32bit)
+			random_factor = get_random_int() % (1<<8);
+		else
+			random_factor = get_random_int() % (1<<12);
+
+		random_factor <<= PAGE_SHIFT;
+	}
 
 	/*
 	 * Use standard layout if the expected stack growth is unlimited
 	 * or we are running native 64 bits.
 	 */
-	if (!is_32bit || rlimit(RLIMIT_STACK) == RLIM_INFINITY) {
-		mm->mmap_base = TASK_UNMAPPED_BASE;
+	if (rlimit(RLIMIT_STACK) == RLIM_INFINITY) {
+		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
 		mm->get_unmapped_area = arch_get_unmapped_area;
 	} else {
 		mm->mmap_base = mmap_base(mm);
 		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
 	}
 }
+
+unsigned long arch_randomize_brk(struct mm_struct *mm)
+{
+	unsigned long range_end = mm->brk + 0x02000000;
+	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
+}
diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c
index dfd63ce87327..2deaddf3e01f 100644
--- a/arch/tile/mm/pgtable.c
+++ b/arch/tile/mm/pgtable.c
@@ -83,55 +83,6 @@ void show_mem(unsigned int filter)
 	}
 }
 
-/*
- * Associate a virtual page frame with a given physical page frame
- * and protection flags for that frame.
- */
-static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
-{
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-
-	pgd = swapper_pg_dir + pgd_index(vaddr);
-	if (pgd_none(*pgd)) {
-		BUG();
-		return;
-	}
-	pud = pud_offset(pgd, vaddr);
-	if (pud_none(*pud)) {
-		BUG();
-		return;
-	}
-	pmd = pmd_offset(pud, vaddr);
-	if (pmd_none(*pmd)) {
-		BUG();
-		return;
-	}
-	pte = pte_offset_kernel(pmd, vaddr);
-	/* <pfn,flags> stored as-is, to permit clearing entries */
-	set_pte(pte, pfn_pte(pfn, flags));
-
-	/*
-	 * It's enough to flush this one mapping.
-	 * This appears conservative since it is only called
-	 * from __set_fixmap.
-	 */
-	local_flush_tlb_page(NULL, vaddr, PAGE_SIZE);
-}
-
-void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
-{
-	unsigned long address = __fix_to_virt(idx);
-
-	if (idx >= __end_of_fixed_addresses) {
-		BUG();
-		return;
-	}
-	set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
-}
-
 /**
  * shatter_huge_page() - ensure a given address is mapped by a small page.
  *
@@ -374,6 +325,17 @@ void ptep_set_wrprotect(struct mm_struct *mm,
 
 #endif
 
+/*
+ * Return a pointer to the PTE that corresponds to the given
+ * address in the given page table.  A NULL page table just uses
+ * the standard kernel page table; the preferred API in this case
+ * is virt_to_kpte().
+ *
+ * The returned pointer can point to a huge page in other levels
+ * of the page table than the bottom, if the huge page is present
+ * in the page table.  For bottom-level PTEs, the returned pointer
+ * can point to a PTE that is either present or not.
+ */
 pte_t *virt_to_pte(struct mm_struct* mm, unsigned long addr)
 {
 	pgd_t *pgd;
@@ -387,13 +349,23 @@ pte_t *virt_to_pte(struct mm_struct* mm, unsigned long addr)
 	pud = pud_offset(pgd, addr);
 	if (!pud_present(*pud))
 		return NULL;
+	if (pud_huge_page(*pud))
+		return (pte_t *)pud;
 	pmd = pmd_offset(pud, addr);
-	if (pmd_huge_page(*pmd))
-		return (pte_t *)pmd;
 	if (!pmd_present(*pmd))
 		return NULL;
+	if (pmd_huge_page(*pmd))
+		return (pte_t *)pmd;
 	return pte_offset_kernel(pmd, addr);
 }
+EXPORT_SYMBOL(virt_to_pte);
+
+pte_t *virt_to_kpte(unsigned long kaddr)
+{
+	BUG_ON(kaddr < PAGE_OFFSET);
+	return virt_to_pte(NULL, kaddr);
+}
+EXPORT_SYMBOL(virt_to_kpte);
 
 pgprot_t set_remote_cache_cpu(pgprot_t prot, int cpu)
 {
@@ -568,7 +540,7 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
 	addr = area->addr;
 	if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
 			       phys_addr, pgprot)) {
-		remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr));
+		free_vm_area(area);
 		return NULL;
 	}
 	return (__force void __iomem *) (offset + (char *)addr);
author	Linus Torvalds <torvalds@linux-foundation.org>	2013-09-06 11:14:33 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-09-06 11:14:33 -0700
commit	4de9ad9bc08b4953fc03336ad38908496e2f8826 (patch)
tree	bd44add223061a58317034a0d6c9686d95d12fba /arch/tile
parent	576c25eb5954035b64112188d9a2683144600f3d (diff)
parent	06da6629e68ddc8ffe2933d33b3681f09104b3f1 (diff)
download	blackbird-op-linux-4de9ad9bc08b4953fc03336ad38908496e2f8826.tar.gz blackbird-op-linux-4de9ad9bc08b4953fc03336ad38908496e2f8826.zip