12 files changed, 1409 insertions, 0 deletions
diff --git a/arch/sh/lib64/.gitignore b/arch/sh/lib64/.gitignore
new file mode 100644
index 000000000000..3508c2cb23c4
--- /dev/null
+++ b/arch/sh/lib64/.gitignore
@@ -0,0 +1 @@
+syscalltab.h
diff --git a/arch/sh/lib64/Makefile b/arch/sh/lib64/Makefile
new file mode 100644
index 000000000000..6a4cc3f9c0b1
--- /dev/null
+++ b/arch/sh/lib64/Makefile
@@ -0,0 +1,19 @@
+#
+# This file is subject to the terms and conditions of the GNU General Public
+# License.  See the file "COPYING" in the main directory of this archive
+# for more details.
+#
+# Copyright (C) 2000, 2001  Paolo Alberelli
+# Coprygith (C) 2003  Paul Mundt
+#
+# Makefile for the SH-5 specific library files..
+#
+# Note! Dependencies are done automagically by 'make dep', which also
+# removes any old dependencies. DON'T put your own dependencies here
+# unless it's something special (ie not a .c file).
+#
+
+# Panic should really be compiled as PIC
+lib-y  := udelay.o c-checksum.o dbg.o io.o panic.o memcpy.o copy_user_memcpy.o \
+		page_copy.o page_clear.o iomap.o
+
diff --git a/arch/sh/lib64/c-checksum.c b/arch/sh/lib64/c-checksum.c
new file mode 100644
index 000000000000..053137abd8a0
--- /dev/null
+++ b/arch/sh/lib64/c-checksum.c
@@ -0,0 +1,217 @@
+/*
+ * arch/sh64/lib/c-checksum.c
+ *
+ * This file contains network checksum routines that are better done
+ * in an architecture-specific manner due to speed..
+ */
+
+#undef DEBUG
+
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <asm/byteorder.h>
+#include <asm/uaccess.h>
+
+static inline unsigned short from64to16(unsigned long long x)
+{
+	/* add up 32-bit words for 33 bits */
+	x = (x & 0xffffffff) + (x >> 32);
+	/* add up 16-bit and 17-bit words for 17+c bits */
+	x = (x & 0xffff) + (x >> 16);
+	/* add up 16-bit and 2-bit for 16+c bit */
+	x = (x & 0xffff) + (x >> 16);
+	/* add up carry.. */
+	x = (x & 0xffff) + (x >> 16);
+	return x;
+}
+
+static inline unsigned short foldto16(unsigned long x)
+{
+	/* add up 16-bit for 17 bits */
+	x = (x & 0xffff) + (x >> 16);
+	/* add up carry.. */
+	x = (x & 0xffff) + (x >> 16);
+	return x;
+}
+
+static inline unsigned short myfoldto16(unsigned long long x)
+{
+	/* Fold down to 32-bits so we don't loose in the typedef-less
+	   network stack.  */
+	/* 64 to 33 */
+	x = (x & 0xffffffff) + (x >> 32);
+	/* 33 to 32 */
+	x = (x & 0xffffffff) + (x >> 32);
+
+	/* add up 16-bit for 17 bits */
+	x = (x & 0xffff) + (x >> 16);
+	/* add up carry.. */
+	x = (x & 0xffff) + (x >> 16);
+	return x;
+}
+
+#define odd(x) ((x)&1)
+#define U16(x) ntohs(x)
+
+static unsigned long do_csum(const unsigned char *buff, int len)
+{
+	int odd, count;
+	unsigned long result = 0;
+
+	pr_debug("do_csum buff %p, len %d (0x%x)\n", buff, len, len);
+#ifdef DEBUG
+	for (i = 0; i < len; i++) {
+		if ((i % 26) == 0)
+			printk("\n");
+		printk("%02X ", buff[i]);
+	}
+#endif
+
+	if (len <= 0)
+		goto out;
+
+	odd = 1 & (unsigned long) buff;
+	if (odd) {
+		result = *buff << 8;
+		len--;
+		buff++;
+	}
+	count = len >> 1;	/* nr of 16-bit words.. */
+	if (count) {
+		if (2 & (unsigned long) buff) {
+			result += *(unsigned short *) buff;
+			count--;
+			len -= 2;
+			buff += 2;
+		}
+		count >>= 1;	/* nr of 32-bit words.. */
+		if (count) {
+			unsigned long carry = 0;
+			do {
+				unsigned long w = *(unsigned long *) buff;
+				buff += 4;
+				count--;
+				result += carry;
+				result += w;
+				carry = (w > result);
+			} while (count);
+			result += carry;
+			result = (result & 0xffff) + (result >> 16);
+		}
+		if (len & 2) {
+			result += *(unsigned short *) buff;
+			buff += 2;
+		}
+	}
+	if (len & 1)
+		result += *buff;
+	result = foldto16(result);
+	if (odd)
+		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
+
+	pr_debug("\nCHECKSUM is 0x%lx\n", result);
+
+      out:
+	return result;
+}
+
+/* computes the checksum of a memory block at buff, length len,
+   and adds in "sum" (32-bit)  */
+__wsum csum_partial(const void *buff, int len, __wsum sum)
+{
+	unsigned long long result = do_csum(buff, len);
+
+	/* add in old sum, and carry.. */
+	result += (__force u32)sum;
+	/* 32+c bits -> 32 bits */
+	result = (result & 0xffffffff) + (result >> 32);
+
+	pr_debug("csum_partial, buff %p len %d sum 0x%x result=0x%016Lx\n",
+		buff, len, sum, result);
+
+	return (__force __wsum)result;
+}
+
+/* Copy while checksumming, otherwise like csum_partial.  */
+__wsum
+csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum)
+{
+	sum = csum_partial(src, len, sum);
+	memcpy(dst, src, len);
+
+	return sum;
+}
+
+/* Copy from userspace and compute checksum.  If we catch an exception
+   then zero the rest of the buffer.  */
+__wsum
+csum_partial_copy_from_user(const void __user *src, void *dst, int len,
+			    __wsum sum, int *err_ptr)
+{
+	int missing;
+
+	pr_debug
+	    ("csum_partial_copy_from_user src %p, dest %p, len %d, sum %08x, err_ptr %p\n",
+	     src, dst, len, sum, err_ptr);
+	missing = copy_from_user(dst, src, len);
+	pr_debug("  access_ok %d\n", __access_ok((unsigned long) src, len));
+	pr_debug("  missing %d\n", missing);
+	if (missing) {
+		memset(dst + len - missing, 0, missing);
+		*err_ptr = -EFAULT;
+	}
+
+	return csum_partial(dst, len, sum);
+}
+
+/* Copy to userspace and compute checksum.  */
+__wsum
+csum_partial_copy_to_user(const unsigned char *src, unsigned char *dst, int len,
+			  __wsum sum, int *err_ptr)
+{
+	sum = csum_partial(src, len, sum);
+
+	if (copy_to_user(dst, src, len))
+		*err_ptr = -EFAULT;
+
+	return sum;
+}
+
+/*
+ *	This is a version of ip_compute_csum() optimized for IP headers,
+ *	which always checksum on 4 octet boundaries.
+ */
+__sum16 ip_fast_csum(const void *iph, unsigned int ihl)
+{
+	pr_debug("ip_fast_csum %p,%d\n", iph, ihl);
+
+	return (__force __sum16)~do_csum(iph, ihl * 4);
+}
+
+__wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
+				unsigned short len,
+				unsigned short proto, __wsum sum)
+{
+	unsigned long long result;
+
+	pr_debug("ntohs(0x%x)=0x%x\n", 0xdead, ntohs(0xdead));
+	pr_debug("htons(0x%x)=0x%x\n", 0xdead, htons(0xdead));
+
+	result = (__force u64) saddr + (__force u64) daddr +
+		 (__force u64) sum + ((len + proto) << 8);
+
+	/* Fold down to 32-bits so we don't loose in the typedef-less
+	   network stack.  */
+	/* 64 to 33 */
+	result = (result & 0xffffffff) + (result >> 32);
+	/* 33 to 32 */
+	result = (result & 0xffffffff) + (result >> 32);
+
+	pr_debug("%s saddr %x daddr %x len %x proto %x sum %x result %08Lx\n",
+		__FUNCTION__, saddr, daddr, len, proto, sum, result);
+
+	return (__wsum)result;
+}
+EXPORT_SYMBOL(csum_tcpudp_nofold);
diff --git a/arch/sh/lib64/copy_user_memcpy.S b/arch/sh/lib64/copy_user_memcpy.S
new file mode 100644
index 000000000000..2a62816d2ddd
--- /dev/null
+++ b/arch/sh/lib64/copy_user_memcpy.S
@@ -0,0 +1,217 @@
+!
+! Fast SH memcpy
+!
+! by Toshiyasu Morita (tm@netcom.com)
+! hacked by J"orn Rernnecke (joern.rennecke@superh.com) ("o for o-umlaut)
+! SH5 code Copyright 2002 SuperH Ltd.
+!
+! Entry: ARG0: destination pointer
+!        ARG1: source pointer
+!        ARG2: byte count
+!
+! Exit:  RESULT: destination pointer
+!        any other registers in the range r0-r7: trashed
+!
+! Notes: Usually one wants to do small reads and write a longword, but
+!        unfortunately it is difficult in some cases to concatanate bytes
+!        into a longword on the SH, so this does a longword read and small
+!        writes.
+!
+! This implementation makes two assumptions about how it is called:
+!
+! 1.: If the byte count is nonzero, the address of the last byte to be
+!     copied is unsigned greater than the address of the first byte to
+!     be copied.  This could be easily swapped for a signed comparison,
+!     but the algorithm used needs some comparison.
+!
+! 2.: When there are two or three bytes in the last word of an 11-or-more
+!     bytes memory chunk to b copied, the rest of the word can be read
+!     without side effects.
+!     This could be easily changed by increasing the minumum size of
+!     a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
+!     however, this would cost a few extra cyles on average.
+!     For SHmedia, the assumption is that any quadword can be read in its
+!     enirety if at least one byte is included in the copy.
+
+/* Imported into Linux kernel by Richard Curnow.  This is used to implement the
+   __copy_user function in the general case, so it has to be a distinct
+   function from intra-kernel memcpy to allow for exception fix-ups in the
+   event that the user pointer is bad somewhere in the copy (e.g. due to
+   running off the end of the vma).
+
+   Note, this algorithm will be slightly wasteful in the case where the source
+   and destination pointers are equally aligned, because the stlo/sthi pairs
+   could then be merged back into single stores.  If there are a lot of cache
+   misses, this is probably offset by the stall lengths on the preloads.
+
+*/
+
+/* NOTE : Prefetches removed and allocos guarded by synco to avoid TAKum03020
+ * erratum.  The first two prefetches are nop-ed out to avoid upsetting the
+ * instruction counts used in the jump address calculation.
+ * */
+
+	.section .text..SHmedia32,"ax"
+	.little
+	.balign 32
+	.global copy_user_memcpy
+	.global copy_user_memcpy_end
+copy_user_memcpy:
+
+#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
+#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
+#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
+#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
+
+	nop ! ld.b r3,0,r63 ! TAKum03020
+	pta/l Large,tr0
+	movi 25,r0
+	bgeu/u r4,r0,tr0
+	nsb r4,r0
+	shlli r0,5,r0
+	movi (L1-L0+63*32 + 1) & 0xffff,r1
+	sub r1, r0, r0
+L0:	ptrel r0,tr0
+	add r2,r4,r5
+	ptabs r18,tr1
+	add r3,r4,r6
+	blink tr0,r63
+
+/* Rearranged to make cut2 safe */
+	.balign 8
+L4_7:	/* 4..7 byte memcpy cntd. */
+	stlo.l r2, 0, r0
+	or r6, r7, r6
+	sthi.l r5, -1, r6
+	stlo.l r5, -4, r6
+	blink tr1,r63
+
+	.balign 8
+L1:	/* 0 byte memcpy */
+	nop
+	blink tr1,r63
+	nop
+	nop
+	nop
+	nop
+
+L2_3:	/* 2 or 3 byte memcpy cntd. */
+	st.b r5,-1,r6
+	blink tr1,r63
+
+	/* 1 byte memcpy */
+	ld.b r3,0,r0
+	st.b r2,0,r0
+	blink tr1,r63
+
+L8_15:	/* 8..15 byte memcpy cntd. */
+	stlo.q r2, 0, r0
+	or r6, r7, r6
+	sthi.q r5, -1, r6
+	stlo.q r5, -8, r6
+	blink tr1,r63
+
+	/* 2 or 3 byte memcpy */
+	ld.b r3,0,r0
+	nop ! ld.b r2,0,r63 ! TAKum03020
+	ld.b r3,1,r1
+	st.b r2,0,r0
+	pta/l L2_3,tr0
+	ld.b r6,-1,r6
+	st.b r2,1,r1
+	blink tr0, r63
+
+	/* 4 .. 7 byte memcpy */
+	LDUAL (r3, 0, r0, r1)
+	pta L4_7, tr0
+	ldlo.l r6, -4, r7
+	or r0, r1, r0
+	sthi.l r2, 3, r0
+	ldhi.l r6, -1, r6
+	blink tr0, r63
+
+	/* 8 .. 15 byte memcpy */
+	LDUAQ (r3, 0, r0, r1)
+	pta L8_15, tr0
+	ldlo.q r6, -8, r7
+	or r0, r1, r0
+	sthi.q r2, 7, r0
+	ldhi.q r6, -1, r6
+	blink tr0, r63
+
+	/* 16 .. 24 byte memcpy */
+	LDUAQ (r3, 0, r0, r1)
+	LDUAQ (r3, 8, r8, r9)
+	or r0, r1, r0
+	sthi.q r2, 7, r0
+	or r8, r9, r8
+	sthi.q r2, 15, r8
+	ldlo.q r6, -8, r7
+	ldhi.q r6, -1, r6
+	stlo.q r2, 8, r8
+	stlo.q r2, 0, r0
+	or r6, r7, r6
+	sthi.q r5, -1, r6
+	stlo.q r5, -8, r6
+	blink tr1,r63
+
+Large:
+	! ld.b r2, 0, r63 ! TAKum03020
+	pta/l  Loop_ua, tr1
+	ori r3, -8, r7
+	sub r2, r7, r22
+	sub r3, r2, r6
+	add r2, r4, r5
+	ldlo.q r3, 0, r0
+	addi r5, -16, r5
+	movi 64+8, r27 ! could subtract r7 from that.
+	stlo.q r2, 0, r0
+	sthi.q r2, 7, r0
+	ldx.q r22, r6, r0
+	bgtu/l r27, r4, tr1
+
+	addi r5, -48, r27
+	pta/l Loop_line, tr0
+	addi r6, 64, r36
+	addi r6, -24, r19
+	addi r6, -16, r20
+	addi r6, -8, r21
+
+Loop_line:
+	! ldx.q r22, r36, r63 ! TAKum03020
+	alloco r22, 32
+	synco
+	addi r22, 32, r22
+	ldx.q r22, r19, r23
+	sthi.q r22, -25, r0
+	ldx.q r22, r20, r24
+	ldx.q r22, r21, r25
+	stlo.q r22, -32, r0
+	ldx.q r22, r6,  r0
+	sthi.q r22, -17, r23
+	sthi.q r22,  -9, r24
+	sthi.q r22,  -1, r25
+	stlo.q r22, -24, r23
+	stlo.q r22, -16, r24
+	stlo.q r22,  -8, r25
+	bgeu r27, r22, tr0
+
+Loop_ua:
+	addi r22, 8, r22
+	sthi.q r22, -1, r0
+	stlo.q r22, -8, r0
+	ldx.q r22, r6, r0
+	bgtu/l r5, r22, tr1
+
+	add r3, r4, r7
+	ldlo.q r7, -8, r1
+	sthi.q r22, 7, r0
+	ldhi.q r7, -1, r7
+	ptabs r18,tr1
+	stlo.q r22, 0, r0
+	or r1, r7, r1
+	sthi.q r5, 15, r1
+	stlo.q r5, 8, r1
+	blink tr1, r63
+copy_user_memcpy_end:
+	nop
diff --git a/arch/sh/lib64/dbg.c b/arch/sh/lib64/dbg.c
new file mode 100644
index 000000000000..97816e0baf19
--- /dev/null
+++ b/arch/sh/lib64/dbg.c
@@ -0,0 +1,430 @@
+/*--------------------------------------------------------------------------
+--
+-- Identity : Linux50 Debug Funcions
+--
+-- File     : arch/sh64/lib/dbg.C
+--
+-- Copyright 2000, 2001 STMicroelectronics Limited.
+-- Copyright 2004 Richard Curnow (evt_debug etc)
+--
+--------------------------------------------------------------------------*/
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <asm/mmu_context.h>
+
+typedef u64 regType_t;
+
+static regType_t getConfigReg(u64 id)
+{
+	register u64 reg __asm__("r2");
+	asm volatile ("getcfg   %1, 0, %0":"=r" (reg):"r"(id));
+	return (reg);
+}
+
+/* ======================================================================= */
+
+static char *szTab[] = { "4k", "64k", "1M", "512M" };
+static char *protTab[] = { "----",
+	"---R",
+	"--X-",
+	"--XR",
+	"-W--",
+	"-W-R",
+	"-WX-",
+	"-WXR",
+	"U---",
+	"U--R",
+	"U-X-",
+	"U-XR",
+	"UW--",
+	"UW-R",
+	"UWX-",
+	"UWXR"
+};
+#define  ITLB_BASE	0x00000000
+#define  DTLB_BASE	0x00800000
+#define  MAX_TLBs		64
+/* PTE High */
+#define  GET_VALID(pte)        ((pte) & 0x1)
+#define  GET_SHARED(pte)       ((pte) & 0x2)
+#define  GET_ASID(pte)         ((pte >> 2) & 0x0ff)
+#define  GET_EPN(pte)          ((pte) & 0xfffff000)
+
+/* PTE Low */
+#define  GET_CBEHAVIOR(pte)    ((pte) & 0x3)
+#define  GET_PAGE_SIZE(pte)    szTab[((pte >> 3) & 0x3)]
+#define  GET_PROTECTION(pte)   protTab[((pte >> 6) & 0xf)]
+#define  GET_PPN(pte)          ((pte) & 0xfffff000)
+
+#define PAGE_1K_MASK           0x00000000
+#define PAGE_4K_MASK           0x00000010
+#define PAGE_64K_MASK          0x00000080
+#define MMU_PAGESIZE_MASK      (PAGE_64K_MASK | PAGE_4K_MASK)
+#define PAGE_1MB_MASK          MMU_PAGESIZE_MASK
+#define PAGE_1K                (1024)
+#define PAGE_4K                (1024 * 4)
+#define PAGE_64K               (1024 * 64)
+#define PAGE_1MB               (1024 * 1024)
+
+#define HOW_TO_READ_TLB_CONTENT  \
+       "[ ID]  PPN         EPN        ASID  Share  CB  P.Size   PROT.\n"
+
+void print_single_tlb(unsigned long tlb, int single_print)
+{
+	regType_t pteH;
+	regType_t pteL;
+	unsigned int valid, shared, asid, epn, cb, ppn;
+	char *pSize;
+	char *pProt;
+
+	/*
+	   ** in case of single print <single_print> is true, this implies:
+	   **   1) print the TLB in any case also if NOT VALID
+	   **   2) print out the header
+	 */
+
+	pteH = getConfigReg(tlb);
+	valid = GET_VALID(pteH);
+	if (single_print)
+		printk(HOW_TO_READ_TLB_CONTENT);
+	else if (!valid)
+		return;
+
+	pteL = getConfigReg(tlb + 1);
+
+	shared = GET_SHARED(pteH);
+	asid = GET_ASID(pteH);
+	epn = GET_EPN(pteH);
+	cb = GET_CBEHAVIOR(pteL);
+	pSize = GET_PAGE_SIZE(pteL);
+	pProt = GET_PROTECTION(pteL);
+	ppn = GET_PPN(pteL);
+	printk("[%c%2ld]  0x%08x  0x%08x  %03d   %02x    %02x   %4s    %s\n",
+	       ((valid) ? ' ' : 'u'), ((tlb & 0x0ffff) / TLB_STEP),
+	       ppn, epn, asid, shared, cb, pSize, pProt);
+}
+
+void print_dtlb(void)
+{
+	int count;
+	unsigned long tlb;
+
+	printk(" ================= SH-5 D-TLBs Status ===================\n");
+	printk(HOW_TO_READ_TLB_CONTENT);
+	tlb = DTLB_BASE;
+	for (count = 0; count < MAX_TLBs; count++, tlb += TLB_STEP)
+		print_single_tlb(tlb, 0);
+	printk
+	    (" =============================================================\n");
+}
+
+void print_itlb(void)
+{
+	int count;
+	unsigned long tlb;
+
+	printk(" ================= SH-5 I-TLBs Status ===================\n");
+	printk(HOW_TO_READ_TLB_CONTENT);
+	tlb = ITLB_BASE;
+	for (count = 0; count < MAX_TLBs; count++, tlb += TLB_STEP)
+		print_single_tlb(tlb, 0);
+	printk
+	    (" =============================================================\n");
+}
+
+/* ======================================================================= */
+
+#ifdef CONFIG_POOR_MANS_STRACE
+
+#include "syscalltab.h"
+
+struct ring_node {
+	int evt;
+	int ret_addr;
+	int event;
+	int tra;
+	int pid;
+	unsigned long sp;
+	unsigned long pc;
+};
+
+static struct ring_node event_ring[16];
+static int event_ptr = 0;
+
+struct stored_syscall_data {
+	int pid;
+	int syscall_number;
+};
+
+#define N_STORED_SYSCALLS 16
+
+static struct stored_syscall_data stored_syscalls[N_STORED_SYSCALLS];
+static int syscall_next=0;
+static int syscall_next_print=0;
+
+void evt_debug(int evt, int ret_addr, int event, int tra, struct pt_regs *regs)
+{
+	int syscallno = tra & 0xff;
+	unsigned long sp;
+	unsigned long stack_bottom;
+	int pid;
+	struct ring_node *rr;
+
+	pid = current->pid;
+	stack_bottom = (unsigned long) task_stack_page(current);
+	asm volatile("ori r15, 0, %0" : "=r" (sp));
+	rr = event_ring + event_ptr;
+	rr->evt = evt;
+	rr->ret_addr = ret_addr;
+	rr->event = event;
+	rr->tra = tra;
+	rr->pid = pid;
+	rr->sp = sp;
+	rr->pc = regs->pc;
+
+	if (sp < stack_bottom + 3092) {
+		printk("evt_debug : stack underflow report\n");
+		int i, j;
+		for (j=0, i = event_ptr; j<16; j++) {
+			rr = event_ring + i;
+			printk("evt=%08x event=%08x tra=%08x pid=%5d sp=%08lx pc=%08lx\n",
+				rr->evt, rr->event, rr->tra, rr->pid, rr->sp, rr->pc);
+			i--;
+			i &= 15;
+		}
+		panic("STACK UNDERFLOW\n");
+	}
+
+	event_ptr = (event_ptr + 1) & 15;
+
+	if ((event == 2) && (evt == 0x160)) {
+		if (syscallno < NUM_SYSCALL_INFO_ENTRIES) {
+			/* Store the syscall information to print later.  We
+			 * can't print this now - currently we're running with
+			 * SR.BL=1, so we can't take a tlbmiss (which could occur
+			 * in the console drivers under printk).
+			 *
+			 * Just overwrite old entries on ring overflow - this
+			 * is only for last-hope debugging. */
+			stored_syscalls[syscall_next].pid = current->pid;
+			stored_syscalls[syscall_next].syscall_number = syscallno;
+			syscall_next++;
+			syscall_next &= (N_STORED_SYSCALLS - 1);
+		}
+	}
+}
+
+static void drain_syscalls(void) {
+	while (syscall_next_print != syscall_next) {
+		printk("Task %d: %s()\n",
+			stored_syscalls[syscall_next_print].pid,
+			syscall_info_table[stored_syscalls[syscall_next_print].syscall_number].name);
+			syscall_next_print++;
+			syscall_next_print &= (N_STORED_SYSCALLS - 1);
+	}
+}
+
+void evt_debug2(unsigned int ret)
+{
+	drain_syscalls();
+	printk("Task %d: syscall returns %08x\n", current->pid, ret);
+}
+
+void evt_debug_ret_from_irq(struct pt_regs *regs)
+{
+	int pid;
+	struct ring_node *rr;
+
+	pid = current->pid;
+	rr = event_ring + event_ptr;
+	rr->evt = 0xffff;
+	rr->ret_addr = 0;
+	rr->event = 0;
+	rr->tra = 0;
+	rr->pid = pid;
+	rr->pc = regs->pc;
+	event_ptr = (event_ptr + 1) & 15;
+}
+
+void evt_debug_ret_from_exc(struct pt_regs *regs)
+{
+	int pid;
+	struct ring_node *rr;
+
+	pid = current->pid;
+	rr = event_ring + event_ptr;
+	rr->evt = 0xfffe;
+	rr->ret_addr = 0;
+	rr->event = 0;
+	rr->tra = 0;
+	rr->pid = pid;
+	rr->pc = regs->pc;
+	event_ptr = (event_ptr + 1) & 15;
+}
+
+#endif /* CONFIG_POOR_MANS_STRACE */
+
+/* ======================================================================= */
+
+void show_excp_regs(char *from, int trapnr, int signr, struct pt_regs *regs)
+{
+
+	unsigned long long ah, al, bh, bl, ch, cl;
+
+	printk("\n");
+	printk("EXCEPTION - %s: task %d; Linux trap # %d; signal = %d\n",
+	       ((from) ? from : "???"), current->pid, trapnr, signr);
+
+	asm volatile ("getcon   " __EXPEVT ", %0":"=r"(ah));
+	asm volatile ("getcon   " __EXPEVT ", %0":"=r"(al));
+	ah = (ah) >> 32;
+	al = (al) & 0xffffffff;
+	asm volatile ("getcon   " __KCR1 ", %0":"=r"(bh));
+	asm volatile ("getcon   " __KCR1 ", %0":"=r"(bl));
+	bh = (bh) >> 32;
+	bl = (bl) & 0xffffffff;
+	asm volatile ("getcon   " __INTEVT ", %0":"=r"(ch));
+	asm volatile ("getcon   " __INTEVT ", %0":"=r"(cl));
+	ch = (ch) >> 32;
+	cl = (cl) & 0xffffffff;
+	printk("EXPE: %08Lx%08Lx KCR1: %08Lx%08Lx INTE: %08Lx%08Lx\n",
+	       ah, al, bh, bl, ch, cl);
+
+	asm volatile ("getcon   " __PEXPEVT ", %0":"=r"(ah));
+	asm volatile ("getcon   " __PEXPEVT ", %0":"=r"(al));
+	ah = (ah) >> 32;
+	al = (al) & 0xffffffff;
+	asm volatile ("getcon   " __PSPC ", %0":"=r"(bh));
+	asm volatile ("getcon   " __PSPC ", %0":"=r"(bl));
+	bh = (bh) >> 32;
+	bl = (bl) & 0xffffffff;
+	asm volatile ("getcon   " __PSSR ", %0":"=r"(ch));
+	asm volatile ("getcon   " __PSSR ", %0":"=r"(cl));
+	ch = (ch) >> 32;
+	cl = (cl) & 0xffffffff;
+	printk("PEXP: %08Lx%08Lx PSPC: %08Lx%08Lx PSSR: %08Lx%08Lx\n",
+	       ah, al, bh, bl, ch, cl);
+
+	ah = (regs->pc) >> 32;
+	al = (regs->pc) & 0xffffffff;
+	bh = (regs->regs[18]) >> 32;
+	bl = (regs->regs[18]) & 0xffffffff;
+	ch = (regs->regs[15]) >> 32;
+	cl = (regs->regs[15]) & 0xffffffff;
+	printk("PC  : %08Lx%08Lx LINK: %08Lx%08Lx SP  : %08Lx%08Lx\n",
+	       ah, al, bh, bl, ch, cl);
+
+	ah = (regs->sr) >> 32;
+	al = (regs->sr) & 0xffffffff;
+	asm volatile ("getcon   " __TEA ", %0":"=r"(bh));
+	asm volatile ("getcon   " __TEA ", %0":"=r"(bl));
+	bh = (bh) >> 32;
+	bl = (bl) & 0xffffffff;
+	asm volatile ("getcon   " __KCR0 ", %0":"=r"(ch));
+	asm volatile ("getcon   " __KCR0 ", %0":"=r"(cl));
+	ch = (ch) >> 32;
+	cl = (cl) & 0xffffffff;
+	printk("SR  : %08Lx%08Lx TEA : %08Lx%08Lx KCR0: %08Lx%08Lx\n",
+	       ah, al, bh, bl, ch, cl);
+
+	ah = (regs->regs[0]) >> 32;
+	al = (regs->regs[0]) & 0xffffffff;
+	bh = (regs->regs[1]) >> 32;
+	bl = (regs->regs[1]) & 0xffffffff;
+	ch = (regs->regs[2]) >> 32;
+	cl = (regs->regs[2]) & 0xffffffff;
+	printk("R0  : %08Lx%08Lx R1  : %08Lx%08Lx R2  : %08Lx%08Lx\n",
+	       ah, al, bh, bl, ch, cl);
+
+	ah = (regs->regs[3]) >> 32;
+	al = (regs->regs[3]) & 0xffffffff;
+	bh = (regs->regs[4]) >> 32;
+	bl = (regs->regs[4]) & 0xffffffff;
+	ch = (regs->regs[5]) >> 32;
+	cl = (regs->regs[5]) & 0xffffffff;
+	printk("R3  : %08Lx%08Lx R4  : %08Lx%08Lx R5  : %08Lx%08Lx\n",
+	       ah, al, bh, bl, ch, cl);
+
+	ah = (regs->regs[6]) >> 32;
+	al = (regs->regs[6]) & 0xffffffff;
+	bh = (regs->regs[7]) >> 32;
+	bl = (regs->regs[7]) & 0xffffffff;
+	ch = (regs->regs[8]) >> 32;
+	cl = (regs->regs[8]) & 0xffffffff;
+	printk("R6  : %08Lx%08Lx R7  : %08Lx%08Lx R8  : %08Lx%08Lx\n",
+	       ah, al, bh, bl, ch, cl);
+
+	ah = (regs->regs[9]) >> 32;
+	al = (regs->regs[9]) & 0xffffffff;
+	bh = (regs->regs[10]) >> 32;
+	bl = (regs->regs[10]) & 0xffffffff;
+	ch = (regs->regs[11]) >> 32;
+	cl = (regs->regs[11]) & 0xffffffff;
+	printk("R9  : %08Lx%08Lx R10 : %08Lx%08Lx R11 : %08Lx%08Lx\n",
+	       ah, al, bh, bl, ch, cl);
+	printk("....\n");
+
+	ah = (regs->tregs[0]) >> 32;
+	al = (regs->tregs[0]) & 0xffffffff;
+	bh = (regs->tregs[1]) >> 32;
+	bl = (regs->tregs[1]) & 0xffffffff;
+	ch = (regs->tregs[2]) >> 32;
+	cl = (regs->tregs[2]) & 0xffffffff;
+	printk("T0  : %08Lx%08Lx T1  : %08Lx%08Lx T2  : %08Lx%08Lx\n",
+	       ah, al, bh, bl, ch, cl);
+	printk("....\n");
+
+	print_dtlb();
+	print_itlb();
+}
+
+/* ======================================================================= */
+
+/*
+** Depending on <base> scan the MMU, Data or Instruction side
+** looking for a valid mapping matching Eaddr & asid.
+** Return -1 if not found or the TLB id entry otherwise.
+** Note: it works only for 4k pages!
+*/
+static unsigned long
+lookup_mmu_side(unsigned long base, unsigned long Eaddr, unsigned long asid)
+{
+	regType_t pteH;
+	unsigned long epn;
+	int count;
+
+	epn = Eaddr & 0xfffff000;
+
+	for (count = 0; count < MAX_TLBs; count++, base += TLB_STEP) {
+		pteH = getConfigReg(base);
+		if (GET_VALID(pteH))
+			if ((unsigned long) GET_EPN(pteH) == epn)
+				if ((unsigned long) GET_ASID(pteH) == asid)
+					break;
+	}
+	return ((unsigned long) ((count < MAX_TLBs) ? base : -1));
+}
+
+unsigned long lookup_dtlb(unsigned long Eaddr)
+{
+	unsigned long asid = get_asid();
+	return (lookup_mmu_side((u64) DTLB_BASE, Eaddr, asid));
+}
+
+unsigned long lookup_itlb(unsigned long Eaddr)
+{
+	unsigned long asid = get_asid();
+	return (lookup_mmu_side((u64) ITLB_BASE, Eaddr, asid));
+}
+
+void print_page(struct page *page)
+{
+	printk("  page[%p] -> index 0x%lx,  count 0x%x,  flags 0x%lx\n",
+	       page, page->index, page_count(page), page->flags);
+	printk("       address_space = %p, pages =%ld\n", page->mapping,
+	       page->mapping->nrpages);
+
+}
diff --git a/arch/sh/lib64/io.c b/arch/sh/lib64/io.c
new file mode 100644
index 000000000000..a3f3a2b8e25b
--- /dev/null
+++ b/arch/sh/lib64/io.c
@@ -0,0 +1,128 @@
+/*
+ * Copyright (C) 2000 David J. Mckay (david.mckay@st.com)
+ *
+ * May be copied or modified under the terms of the GNU General Public
+ * License.  See linux/COPYING for more information.
+ *
+ * This file contains the I/O routines for use on the overdrive board
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <asm/system.h>
+#include <asm/processor.h>
+#include <asm/io.h>
+
+/*  Now for the string version of these functions */
+void outsb(unsigned long port, const void *addr, unsigned long count)
+{
+	int i;
+	unsigned char *p = (unsigned char *) addr;
+
+	for (i = 0; i < count; i++, p++) {
+		outb(*p, port);
+	}
+}
+EXPORT_SYMBOL(outsb);
+
+void insb(unsigned long port, void *addr, unsigned long count)
+{
+	int i;
+	unsigned char *p = (unsigned char *) addr;
+
+	for (i = 0; i < count; i++, p++) {
+		*p = inb(port);
+	}
+}
+EXPORT_SYMBOL(insb);
+
+/* For the 16 and 32 bit string functions, we have to worry about alignment.
+ * The SH does not do unaligned accesses, so we have to read as bytes and
+ * then write as a word or dword.
+ * This can be optimised a lot more, especially in the case where the data
+ * is aligned
+ */
+
+void outsw(unsigned long port, const void *addr, unsigned long count)
+{
+	int i;
+	unsigned short tmp;
+	unsigned char *p = (unsigned char *) addr;
+
+	for (i = 0; i < count; i++, p += 2) {
+		tmp = (*p) | ((*(p + 1)) << 8);
+		outw(tmp, port);
+	}
+}
+EXPORT_SYMBOL(outsw);
+
+void insw(unsigned long port, void *addr, unsigned long count)
+{
+	int i;
+	unsigned short tmp;
+	unsigned char *p = (unsigned char *) addr;
+
+	for (i = 0; i < count; i++, p += 2) {
+		tmp = inw(port);
+		p[0] = tmp & 0xff;
+		p[1] = (tmp >> 8) & 0xff;
+	}
+}
+EXPORT_SYMBOL(insw);
+
+void outsl(unsigned long port, const void *addr, unsigned long count)
+{
+	int i;
+	unsigned tmp;
+	unsigned char *p = (unsigned char *) addr;
+
+	for (i = 0; i < count; i++, p += 4) {
+		tmp = (*p) | ((*(p + 1)) << 8) | ((*(p + 2)) << 16) |
+		    ((*(p + 3)) << 24);
+		outl(tmp, port);
+	}
+}
+EXPORT_SYMBOL(outsl);
+
+void insl(unsigned long port, void *addr, unsigned long count)
+{
+	int i;
+	unsigned tmp;
+	unsigned char *p = (unsigned char *) addr;
+
+	for (i = 0; i < count; i++, p += 4) {
+		tmp = inl(port);
+		p[0] = tmp & 0xff;
+		p[1] = (tmp >> 8) & 0xff;
+		p[2] = (tmp >> 16) & 0xff;
+		p[3] = (tmp >> 24) & 0xff;
+
+	}
+}
+EXPORT_SYMBOL(insl);
+
+void memcpy_toio(void __iomem *to, const void *from, long count)
+{
+	unsigned char *p = (unsigned char *) from;
+
+	while (count) {
+		count--;
+		writeb(*p++, to++);
+	}
+}
+EXPORT_SYMBOL(memcpy_toio);
+
+void memcpy_fromio(void *to, void __iomem *from, long count)
+{
+	int i;
+	unsigned char *p = (unsigned char *) to;
+
+	for (i = 0; i < count; i++) {
+		p[i] = readb(from);
+		from++;
+	}
+}
+EXPORT_SYMBOL(memcpy_fromio);
diff --git a/arch/sh/lib64/iomap.c b/arch/sh/lib64/iomap.c
new file mode 100644
index 000000000000..253d1e351d49
--- /dev/null
+++ b/arch/sh/lib64/iomap.c
@@ -0,0 +1,54 @@
+/*
+ * arch/sh64/lib/iomap.c
+ *
+ * Generic sh64 iomap interface
+ *
+ * Copyright (C) 2004  Paul Mundt
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+#include <linux/pci.h>
+#include <asm/io.h>
+
+void __iomem *__attribute__ ((weak))
+ioport_map(unsigned long port, unsigned int len)
+{
+	return (void __iomem *)port;
+}
+EXPORT_SYMBOL(ioport_map);
+
+void ioport_unmap(void __iomem *addr)
+{
+	/* Nothing .. */
+}
+EXPORT_SYMBOL(ioport_unmap);
+
+#ifdef CONFIG_PCI
+void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max)
+{
+	unsigned long start = pci_resource_start(dev, bar);
+	unsigned long len = pci_resource_len(dev, bar);
+	unsigned long flags = pci_resource_flags(dev, bar);
+
+	if (!len)
+		return NULL;
+	if (max && len > max)
+		len = max;
+	if (flags & IORESOURCE_IO)
+		return ioport_map(start + pciio_virt, len);
+	if (flags & IORESOURCE_MEM)
+		return (void __iomem *)start;
+
+	/* What? */
+	return NULL;
+}
+EXPORT_SYMBOL(pci_iomap);
+
+void pci_iounmap(struct pci_dev *dev, void __iomem *addr)
+{
+	/* Nothing .. */
+}
+EXPORT_SYMBOL(pci_iounmap);
+#endif
diff --git a/arch/sh/lib64/memcpy.c b/arch/sh/lib64/memcpy.c
new file mode 100644
index 000000000000..fba436a92bfa
--- /dev/null
+++ b/arch/sh/lib64/memcpy.c
@@ -0,0 +1,81 @@
+/*
+ * Copyright (C) 2002 Mark Debbage (Mark.Debbage@superh.com)
+ *
+ * May be copied or modified under the terms of the GNU General Public
+ * License.  See linux/COPYING for more information.
+ *
+ */
+
+#include <linux/types.h>
+#include <asm/string.h>
+
+// This is a simplistic optimization of memcpy to increase the
+// granularity of access beyond one byte using aligned
+// loads and stores. This is not an optimal implementation
+// for SH-5 (especially with regard to prefetching and the cache),
+// and a better version should be provided later ...
+
+void *memcpy(void *dest, const void *src, size_t count)
+{
+	char *d = (char *) dest, *s = (char *) src;
+
+	if (count >= 32) {
+		int i = 8 - (((unsigned long) d) & 0x7);
+
+		if (i != 8)
+			while (i-- && count--) {
+				*d++ = *s++;
+			}
+
+		if (((((unsigned long) d) & 0x7) == 0) &&
+		    ((((unsigned long) s) & 0x7) == 0)) {
+			while (count >= 32) {
+				unsigned long long t1, t2, t3, t4;
+				t1 = *(unsigned long long *) (s);
+				t2 = *(unsigned long long *) (s + 8);
+				t3 = *(unsigned long long *) (s + 16);
+				t4 = *(unsigned long long *) (s + 24);
+				*(unsigned long long *) (d) = t1;
+				*(unsigned long long *) (d + 8) = t2;
+				*(unsigned long long *) (d + 16) = t3;
+				*(unsigned long long *) (d + 24) = t4;
+				d += 32;
+				s += 32;
+				count -= 32;
+			}
+			while (count >= 8) {
+				*(unsigned long long *) d =
+				    *(unsigned long long *) s;
+				d += 8;
+				s += 8;
+				count -= 8;
+			}
+		}
+
+		if (((((unsigned long) d) & 0x3) == 0) &&
+		    ((((unsigned long) s) & 0x3) == 0)) {
+			while (count >= 4) {
+				*(unsigned long *) d = *(unsigned long *) s;
+				d += 4;
+				s += 4;
+				count -= 4;
+			}
+		}
+
+		if (((((unsigned long) d) & 0x1) == 0) &&
+		    ((((unsigned long) s) & 0x1) == 0)) {
+			while (count >= 2) {
+				*(unsigned short *) d = *(unsigned short *) s;
+				d += 2;
+				s += 2;
+				count -= 2;
+			}
+		}
+	}
+
+	while (count--) {
+		*d++ = *s++;
+	}
+
+	return d;
+}
diff --git a/arch/sh/lib64/page_clear.S b/arch/sh/lib64/page_clear.S
new file mode 100644
index 000000000000..ac0111d669a3
--- /dev/null
+++ b/arch/sh/lib64/page_clear.S
@@ -0,0 +1,54 @@
+/*
+   Copyright 2003 Richard Curnow, SuperH (UK) Ltd.
+
+   This file is subject to the terms and conditions of the GNU General Public
+   License.  See the file "COPYING" in the main directory of this archive
+   for more details.
+
+   Tight version of memset for the case of just clearing a page.  It turns out
+   that having the alloco's spaced out slightly due to the increment/branch
+   pair causes them to contend less for access to the cache.  Similarly,
+   keeping the stores apart from the allocos causes less contention.  => Do two
+   separate loops.  Do multiple stores per loop to amortise the
+   increment/branch cost a little.
+
+   Parameters:
+   r2 : source effective address (start of page)
+
+   Always clears 4096 bytes.
+
+   Note : alloco guarded by synco to avoid TAKum03020 erratum
+
+*/
+
+	.section .text..SHmedia32,"ax"
+	.little
+
+	.balign 8
+	.global sh64_page_clear
+sh64_page_clear:
+	pta/l 1f, tr1
+	pta/l 2f, tr2
+	ptabs/l r18, tr0
+
+	movi 4096, r7
+	add  r2, r7, r7
+	add  r2, r63, r6
+1:
+	alloco r6, 0
+	synco	! TAKum03020
+	addi	r6, 32, r6
+	bgt/l	r7, r6, tr1
+
+	add  r2, r63, r6
+2:
+	st.q  r6,   0, r63
+	st.q  r6,   8, r63
+	st.q  r6,  16, r63
+	st.q  r6,  24, r63
+	addi r6, 32, r6
+	bgt/l r7, r6, tr2
+
+	blink tr0, r63
+
+
diff --git a/arch/sh/lib64/page_copy.S b/arch/sh/lib64/page_copy.S
new file mode 100644
index 000000000000..e159c3cd2582
--- /dev/null
+++ b/arch/sh/lib64/page_copy.S
@@ -0,0 +1,91 @@
+/*
+   Copyright 2003 Richard Curnow, SuperH (UK) Ltd.
+
+   This file is subject to the terms and conditions of the GNU General Public
+   License.  See the file "COPYING" in the main directory of this archive
+   for more details.
+
+   Tight version of mempy for the case of just copying a page.
+   Prefetch strategy empirically optimised against RTL simulations
+   of SH5-101 cut2 eval chip with Cayman board DDR memory.
+
+   Parameters:
+   r2 : source effective address (start of page)
+   r3 : destination effective address (start of page)
+
+   Always copies 4096 bytes.
+
+   Points to review.
+   * Currently the prefetch is 4 lines ahead and the alloco is 2 lines ahead.
+     It seems like the prefetch needs to be at at least 4 lines ahead to get
+     the data into the cache in time, and the allocos contend with outstanding
+     prefetches for the same cache set, so it's better to have the numbers
+     different.
+   */
+
+	.section .text..SHmedia32,"ax"
+	.little
+
+	.balign 8
+	.global sh64_page_copy
+sh64_page_copy:
+
+	/* Copy 4096 bytes worth of data from r2 to r3.
+	   Do prefetches 4 lines ahead.
+	   Do alloco 2 lines ahead */
+
+	pta 1f, tr1
+	pta 2f, tr2
+	pta 3f, tr3
+	ptabs r18, tr0
+
+#if 0
+	/* TAKum03020 */
+	ld.q r2, 0x00, r63
+	ld.q r2, 0x20, r63
+	ld.q r2, 0x40, r63
+	ld.q r2, 0x60, r63
+#endif
+	alloco r3, 0x00
+	synco		! TAKum03020
+	alloco r3, 0x20
+	synco		! TAKum03020
+
+	movi 3968, r6
+	add  r3, r6, r6
+	addi r6, 64, r7
+	addi r7, 64, r8
+	sub r2, r3, r60
+	addi r60, 8, r61
+	addi r61, 8, r62
+	addi r62, 8, r23
+	addi r60, 0x80, r22
+
+/* Minimal code size.  The extra branches inside the loop don't cost much
+   because they overlap with the time spent waiting for prefetches to
+   complete. */
+1:
+#if 0
+	/* TAKum03020 */
+	bge/u r3, r6, tr2  ! skip prefetch for last 4 lines
+	ldx.q r3, r22, r63 ! prefetch 4 lines hence
+#endif
+2:
+	bge/u r3, r7, tr3  ! skip alloco for last 2 lines
+	alloco r3, 0x40    ! alloc destination line 2 lines ahead
+	synco		! TAKum03020
+3:
+	ldx.q r3, r60, r36
+	ldx.q r3, r61, r37
+	ldx.q r3, r62, r38
+	ldx.q r3, r23, r39
+	st.q  r3,   0, r36
+	st.q  r3,   8, r37
+	st.q  r3,  16, r38
+	st.q  r3,  24, r39
+	addi r3, 32, r3
+	bgt/l r8, r3, tr1
+
+	blink tr0, r63	   ! return
+
+
diff --git a/arch/sh/lib64/panic.c b/arch/sh/lib64/panic.c
new file mode 100644
index 000000000000..c9eb1cb50d97
--- /dev/null
+++ b/arch/sh/lib64/panic.c
@@ -0,0 +1,58 @@
+/*
+ * Copyright (C) 2003  Richard Curnow, SuperH UK Limited
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+
+#include <linux/kernel.h>
+#include <asm/io.h>
+#include <asm/registers.h>
+
+/* THIS IS A PHYSICAL ADDRESS */
+#define HDSP2534_ADDR (0x04002100)
+
+#ifdef CONFIG_SH_CAYMAN
+
+static void poor_mans_delay(void)
+{
+	int i;
+	for (i = 0; i < 2500000; i++) {
+	}		/* poor man's delay */
+}
+
+static void show_value(unsigned long x)
+{
+	int i;
+	unsigned nibble;
+	for (i = 0; i < 8; i++) {
+		nibble = ((x >> (i * 4)) & 0xf);
+
+		ctrl_outb(nibble + ((nibble > 9) ? 55 : 48),
+			  HDSP2534_ADDR + 0xe0 + ((7 - i) << 2));
+	}
+}
+
+#endif
+
+void
+panic_handler(unsigned long panicPC, unsigned long panicSSR,
+	      unsigned long panicEXPEVT)
+{
+#ifdef CONFIG_SH_CAYMAN
+	while (1) {
+		/* This piece of code displays the PC on the LED display */
+		show_value(panicPC);
+		poor_mans_delay();
+		show_value(panicSSR);
+		poor_mans_delay();
+		show_value(panicEXPEVT);
+		poor_mans_delay();
+	}
+#endif
+
+	/* Never return from the panic handler */
+	for (;;) ;
+
+}
diff --git a/arch/sh/lib64/udelay.c b/arch/sh/lib64/udelay.c
new file mode 100644
index 000000000000..327653914007
--- /dev/null
+++ b/arch/sh/lib64/udelay.c
@@ -0,0 +1,59 @@
+/*
+ * arch/sh64/lib/udelay.c
+ *
+ * Delay routines, using a pre-computed "loops_per_jiffy" value.
+ *
+ * Copyright (C) 2000, 2001  Paolo Alberelli
+ * Copyright (C) 2003, 2004  Paul Mundt
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+#include <linux/sched.h>
+#include <asm/param.h>
+
+extern unsigned long loops_per_jiffy;
+
+/*
+ * Use only for very small delays (< 1 msec).
+ *
+ * The active part of our cycle counter is only 32-bits wide, and
+ * we're treating the difference between two marks as signed.  On
+ * a 1GHz box, that's about 2 seconds.
+ */
+
+void __delay(int loops)
+{
+	long long dummy;
+	__asm__ __volatile__("gettr	tr0, %1\n\t"
+			     "pta	$+4, tr0\n\t"
+			     "addi	%0, -1, %0\n\t"
+			     "bne	%0, r63, tr0\n\t"
+			     "ptabs	%1, tr0\n\t":"=r"(loops),
+			     "=r"(dummy)
+			     :"0"(loops));
+}
+
+void __udelay(unsigned long long usecs, unsigned long lpj)
+{
+	usecs *= (((unsigned long long) HZ << 32) / 1000000) * lpj;
+	__delay((long long) usecs >> 32);
+}
+
+void __ndelay(unsigned long long nsecs, unsigned long lpj)
+{
+	nsecs *= (((unsigned long long) HZ << 32) / 1000000000) * lpj;
+	__delay((long long) nsecs >> 32);
+}
+
+void udelay(unsigned long usecs)
+{
+	__udelay(usecs, loops_per_jiffy);
+}
+
+void ndelay(unsigned long nsecs)
+{
+	__ndelay(nsecs, loops_per_jiffy);
+}
+