diff options
Diffstat (limited to 'arch/tile/lib')
-rw-r--r-- | arch/tile/lib/Makefile | 16 | ||||
-rw-r--r-- | arch/tile/lib/atomic_32.c | 133 | ||||
-rw-r--r-- | arch/tile/lib/atomic_asm_32.S | 1 | ||||
-rw-r--r-- | arch/tile/lib/cacheflush.c | 16 | ||||
-rw-r--r-- | arch/tile/lib/exports.c | 7 | ||||
-rw-r--r-- | arch/tile/lib/memchr_64.c | 2 | ||||
-rw-r--r-- | arch/tile/lib/memcpy_32.S | 63 | ||||
-rw-r--r-- | arch/tile/lib/memcpy_64.c | 264 | ||||
-rw-r--r-- | arch/tile/lib/memcpy_tile64.c | 276 | ||||
-rw-r--r-- | arch/tile/lib/memcpy_user_64.c | 2 | ||||
-rw-r--r-- | arch/tile/lib/memset_32.c | 110 | ||||
-rw-r--r-- | arch/tile/lib/memset_64.c | 9 | ||||
-rw-r--r-- | arch/tile/lib/strchr_32.c | 2 | ||||
-rw-r--r-- | arch/tile/lib/strchr_64.c | 2 | ||||
-rw-r--r-- | arch/tile/lib/string-endian.h | 13 | ||||
-rw-r--r-- | arch/tile/lib/strlen_32.c | 2 | ||||
-rw-r--r-- | arch/tile/lib/strnlen_32.c | 47 | ||||
-rw-r--r-- | arch/tile/lib/strnlen_64.c | 48 | ||||
-rw-r--r-- | arch/tile/lib/usercopy_32.S | 36 | ||||
-rw-r--r-- | arch/tile/lib/usercopy_64.S | 36 |
20 files changed, 378 insertions, 707 deletions
diff --git a/arch/tile/lib/Makefile b/arch/tile/lib/Makefile index 985f59858234..c4211cbb2021 100644 --- a/arch/tile/lib/Makefile +++ b/arch/tile/lib/Makefile @@ -4,15 +4,15 @@ lib-y = cacheflush.o checksum.o cpumask.o delay.o uaccess.o \ memmove.o memcpy_$(BITS).o memchr_$(BITS).o memset_$(BITS).o \ - strchr_$(BITS).o strlen_$(BITS).o - -ifeq ($(CONFIG_TILEGX),y) -CFLAGS_REMOVE_memcpy_user_64.o = -fno-omit-frame-pointer -lib-y += memcpy_user_64.o -else -lib-y += atomic_32.o atomic_asm_32.o memcpy_tile64.o -endif + strchr_$(BITS).o strlen_$(BITS).o strnlen_$(BITS).o +lib-$(CONFIG_TILEGX) += memcpy_user_64.o +lib-$(CONFIG_TILEPRO) += atomic_32.o atomic_asm_32.o lib-$(CONFIG_SMP) += spinlock_$(BITS).o usercopy_$(BITS).o obj-$(CONFIG_MODULES) += exports.o + +# The finv_buffer_remote() and copy_{to,from}_user() routines can't +# have -pg added, since they both rely on being leaf functions. +CFLAGS_REMOVE_cacheflush.o = -pg +CFLAGS_REMOVE_memcpy_user_64.o = -pg diff --git a/arch/tile/lib/atomic_32.c b/arch/tile/lib/atomic_32.c index f5cada70c3c8..759efa337be8 100644 --- a/arch/tile/lib/atomic_32.c +++ b/arch/tile/lib/atomic_32.c @@ -20,50 +20,12 @@ #include <linux/atomic.h> #include <arch/chip.h> -/* See <asm/atomic_32.h> */ -#if ATOMIC_LOCKS_FOUND_VIA_TABLE() - -/* - * A block of memory containing locks for atomic ops. Each instance of this - * struct will be homed on a different CPU. - */ -struct atomic_locks_on_cpu { - int lock[ATOMIC_HASH_L2_SIZE]; -} __attribute__((aligned(ATOMIC_HASH_L2_SIZE * 4))); - -static DEFINE_PER_CPU(struct atomic_locks_on_cpu, atomic_lock_pool); - -/* The locks we'll use until __init_atomic_per_cpu is called. */ -static struct atomic_locks_on_cpu __initdata initial_atomic_locks; - -/* Hash into this vector to get a pointer to lock for the given atomic. */ -struct atomic_locks_on_cpu *atomic_lock_ptr[ATOMIC_HASH_L1_SIZE] - __write_once = { - [0 ... ATOMIC_HASH_L1_SIZE-1] (&initial_atomic_locks) -}; - -#else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ - /* This page is remapped on startup to be hash-for-home. */ int atomic_locks[PAGE_SIZE / sizeof(int)] __page_aligned_bss; -#endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ - int *__atomic_hashed_lock(volatile void *v) { /* NOTE: this code must match "sys_cmpxchg" in kernel/intvec_32.S */ -#if ATOMIC_LOCKS_FOUND_VIA_TABLE() - unsigned long i = - (unsigned long) v & ((PAGE_SIZE-1) & -sizeof(long long)); - unsigned long n = __insn_crc32_32(0, i); - - /* Grab high bits for L1 index. */ - unsigned long l1_index = n >> ((sizeof(n) * 8) - ATOMIC_HASH_L1_SHIFT); - /* Grab low bits for L2 index. */ - unsigned long l2_index = n & (ATOMIC_HASH_L2_SIZE - 1); - - return &atomic_lock_ptr[l1_index]->lock[l2_index]; -#else /* * Use bits [3, 3 + ATOMIC_HASH_SHIFT) as the lock index. * Using mm works here because atomic_locks is page aligned. @@ -72,26 +34,13 @@ int *__atomic_hashed_lock(volatile void *v) (unsigned long)atomic_locks, 2, (ATOMIC_HASH_SHIFT + 2) - 1); return (int *)ptr; -#endif } #ifdef CONFIG_SMP /* Return whether the passed pointer is a valid atomic lock pointer. */ static int is_atomic_lock(int *p) { -#if ATOMIC_LOCKS_FOUND_VIA_TABLE() - int i; - for (i = 0; i < ATOMIC_HASH_L1_SIZE; ++i) { - - if (p >= &atomic_lock_ptr[i]->lock[0] && - p < &atomic_lock_ptr[i]->lock[ATOMIC_HASH_L2_SIZE]) { - return 1; - } - } - return 0; -#else return p >= &atomic_locks[0] && p < &atomic_locks[ATOMIC_HASH_SIZE]; -#endif } void __atomic_fault_unlock(int *irqlock_word) @@ -110,33 +59,32 @@ static inline int *__atomic_setup(volatile void *v) return __atomic_hashed_lock(v); } -int _atomic_xchg(atomic_t *v, int n) +int _atomic_xchg(int *v, int n) { - return __atomic_xchg(&v->counter, __atomic_setup(v), n).val; + return __atomic_xchg(v, __atomic_setup(v), n).val; } EXPORT_SYMBOL(_atomic_xchg); -int _atomic_xchg_add(atomic_t *v, int i) +int _atomic_xchg_add(int *v, int i) { - return __atomic_xchg_add(&v->counter, __atomic_setup(v), i).val; + return __atomic_xchg_add(v, __atomic_setup(v), i).val; } EXPORT_SYMBOL(_atomic_xchg_add); -int _atomic_xchg_add_unless(atomic_t *v, int a, int u) +int _atomic_xchg_add_unless(int *v, int a, int u) { /* * Note: argument order is switched here since it is easier * to use the first argument consistently as the "old value" * in the assembly, as is done for _atomic_cmpxchg(). */ - return __atomic_xchg_add_unless(&v->counter, __atomic_setup(v), u, a) - .val; + return __atomic_xchg_add_unless(v, __atomic_setup(v), u, a).val; } EXPORT_SYMBOL(_atomic_xchg_add_unless); -int _atomic_cmpxchg(atomic_t *v, int o, int n) +int _atomic_cmpxchg(int *v, int o, int n) { - return __atomic_cmpxchg(&v->counter, __atomic_setup(v), o, n).val; + return __atomic_cmpxchg(v, __atomic_setup(v), o, n).val; } EXPORT_SYMBOL(_atomic_cmpxchg); @@ -159,33 +107,32 @@ unsigned long _atomic_xor(volatile unsigned long *p, unsigned long mask) EXPORT_SYMBOL(_atomic_xor); -u64 _atomic64_xchg(atomic64_t *v, u64 n) +u64 _atomic64_xchg(u64 *v, u64 n) { - return __atomic64_xchg(&v->counter, __atomic_setup(v), n); + return __atomic64_xchg(v, __atomic_setup(v), n); } EXPORT_SYMBOL(_atomic64_xchg); -u64 _atomic64_xchg_add(atomic64_t *v, u64 i) +u64 _atomic64_xchg_add(u64 *v, u64 i) { - return __atomic64_xchg_add(&v->counter, __atomic_setup(v), i); + return __atomic64_xchg_add(v, __atomic_setup(v), i); } EXPORT_SYMBOL(_atomic64_xchg_add); -u64 _atomic64_xchg_add_unless(atomic64_t *v, u64 a, u64 u) +u64 _atomic64_xchg_add_unless(u64 *v, u64 a, u64 u) { /* * Note: argument order is switched here since it is easier * to use the first argument consistently as the "old value" * in the assembly, as is done for _atomic_cmpxchg(). */ - return __atomic64_xchg_add_unless(&v->counter, __atomic_setup(v), - u, a); + return __atomic64_xchg_add_unless(v, __atomic_setup(v), u, a); } EXPORT_SYMBOL(_atomic64_xchg_add_unless); -u64 _atomic64_cmpxchg(atomic64_t *v, u64 o, u64 n) +u64 _atomic64_cmpxchg(u64 *v, u64 o, u64 n) { - return __atomic64_cmpxchg(&v->counter, __atomic_setup(v), o, n); + return __atomic64_cmpxchg(v, __atomic_setup(v), o, n); } EXPORT_SYMBOL(_atomic64_cmpxchg); @@ -208,54 +155,8 @@ struct __get_user __atomic_bad_address(int __user *addr) } -#if CHIP_HAS_CBOX_HOME_MAP() -static int __init noatomichash(char *str) -{ - pr_warning("noatomichash is deprecated.\n"); - return 1; -} -__setup("noatomichash", noatomichash); -#endif - void __init __init_atomic_per_cpu(void) { -#if ATOMIC_LOCKS_FOUND_VIA_TABLE() - - unsigned int i; - int actual_cpu; - - /* - * Before this is called from setup, we just have one lock for - * all atomic objects/operations. Here we replace the - * elements of atomic_lock_ptr so that they point at per_cpu - * integers. This seemingly over-complex approach stems from - * the fact that DEFINE_PER_CPU defines an entry for each cpu - * in the grid, not each cpu from 0..ATOMIC_HASH_SIZE-1. But - * for efficient hashing of atomics to their locks we want a - * compile time constant power of 2 for the size of this - * table, so we use ATOMIC_HASH_SIZE. - * - * Here we populate atomic_lock_ptr from the per cpu - * atomic_lock_pool, interspersing by actual cpu so that - * subsequent elements are homed on consecutive cpus. - */ - - actual_cpu = cpumask_first(cpu_possible_mask); - - for (i = 0; i < ATOMIC_HASH_L1_SIZE; ++i) { - /* - * Preincrement to slightly bias against using cpu 0, - * which has plenty of stuff homed on it already. - */ - actual_cpu = cpumask_next(actual_cpu, cpu_possible_mask); - if (actual_cpu >= nr_cpu_ids) - actual_cpu = cpumask_first(cpu_possible_mask); - - atomic_lock_ptr[i] = &per_cpu(atomic_lock_pool, actual_cpu); - } - -#else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ - /* Validate power-of-two and "bigger than cpus" assumption */ BUILD_BUG_ON(ATOMIC_HASH_SIZE & (ATOMIC_HASH_SIZE-1)); BUG_ON(ATOMIC_HASH_SIZE < nr_cpu_ids); @@ -279,6 +180,4 @@ void __init __init_atomic_per_cpu(void) * That should not produce more indices than ATOMIC_HASH_SIZE. */ BUILD_BUG_ON((PAGE_SIZE >> 3) > ATOMIC_HASH_SIZE); - -#endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ } diff --git a/arch/tile/lib/atomic_asm_32.S b/arch/tile/lib/atomic_asm_32.S index 30638042691d..6bda3132cd61 100644 --- a/arch/tile/lib/atomic_asm_32.S +++ b/arch/tile/lib/atomic_asm_32.S @@ -164,6 +164,7 @@ STD_ENTRY_SECTION(__atomic\name, .text.atomic) STD_ENDPROC(__atomic\name) .ifc \bitwidth,32 .pushsection __ex_table,"a" + .align 4 .word 1b, __atomic\name .word 2b, __atomic\name .word __atomic\name, __atomic_bad_address diff --git a/arch/tile/lib/cacheflush.c b/arch/tile/lib/cacheflush.c index 8f8ad814b139..9c0ec22009a5 100644 --- a/arch/tile/lib/cacheflush.c +++ b/arch/tile/lib/cacheflush.c @@ -36,7 +36,8 @@ static inline void force_load(char *p) * core (if "!hfh") or homed via hash-for-home (if "hfh"), waiting * until the memory controller holds the flushed values. */ -void finv_buffer_remote(void *buffer, size_t size, int hfh) +void __attribute__((optimize("omit-frame-pointer"))) +finv_buffer_remote(void *buffer, size_t size, int hfh) { char *p, *base; size_t step_size, load_count; @@ -147,18 +148,21 @@ void finv_buffer_remote(void *buffer, size_t size, int hfh) force_load(p); /* - * Repeat, but with inv's instead of loads, to get rid of the + * Repeat, but with finv's instead of loads, to get rid of the * data we just loaded into our own cache and the old home L3. - * No need to unroll since inv's don't target a register. + * No need to unroll since finv's don't target a register. + * The finv's are guaranteed not to actually flush the data in + * the buffer back to their home, since we just read it, so the + * lines are clean in cache; we will only invalidate those lines. */ p = (char *)buffer + size - 1; - __insn_inv(p); + __insn_finv(p); p -= step_size; p = (char *)((unsigned long)p | (step_size - 1)); for (; p >= base; p -= step_size) - __insn_inv(p); + __insn_finv(p); - /* Wait for the load+inv's (and thus finvs) to have completed. */ + /* Wait for these finv's (and thus the first finvs) to be done. */ __insn_mf(); #ifdef __tilegx__ diff --git a/arch/tile/lib/exports.c b/arch/tile/lib/exports.c index a93b02a25222..82733c87d67e 100644 --- a/arch/tile/lib/exports.c +++ b/arch/tile/lib/exports.c @@ -22,7 +22,6 @@ EXPORT_SYMBOL(strnlen_user_asm); EXPORT_SYMBOL(strncpy_from_user_asm); EXPORT_SYMBOL(clear_user_asm); EXPORT_SYMBOL(flush_user_asm); -EXPORT_SYMBOL(inv_user_asm); EXPORT_SYMBOL(finv_user_asm); /* arch/tile/kernel/entry.S */ @@ -34,6 +33,12 @@ EXPORT_SYMBOL(dump_stack); /* arch/tile/kernel/head.S */ EXPORT_SYMBOL(empty_zero_page); +#ifdef CONFIG_FUNCTION_TRACER +/* arch/tile/kernel/mcount_64.S */ +#include <asm/ftrace.h> +EXPORT_SYMBOL(__mcount); +#endif /* CONFIG_FUNCTION_TRACER */ + /* arch/tile/lib/, various memcpy files */ EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(__copy_to_user_inatomic); diff --git a/arch/tile/lib/memchr_64.c b/arch/tile/lib/memchr_64.c index 6f867dbf7c56..f8196b3a950e 100644 --- a/arch/tile/lib/memchr_64.c +++ b/arch/tile/lib/memchr_64.c @@ -36,7 +36,7 @@ void *memchr(const void *s, int c, size_t n) p = (const uint64_t *)(s_int & -8); /* Create eight copies of the byte for which we are looking. */ - goal = 0x0101010101010101ULL * (uint8_t) c; + goal = copy_byte(c); /* Read the first word, but munge it so that bytes before the array * will not match goal. diff --git a/arch/tile/lib/memcpy_32.S b/arch/tile/lib/memcpy_32.S index 2a419a6122db..a2771ae5da53 100644 --- a/arch/tile/lib/memcpy_32.S +++ b/arch/tile/lib/memcpy_32.S @@ -22,14 +22,6 @@ #include <linux/linkage.h> -/* On TILE64, we wrap these functions via arch/tile/lib/memcpy_tile64.c */ -#if !CHIP_HAS_COHERENT_LOCAL_CACHE() -#define memcpy __memcpy_asm -#define __copy_to_user_inatomic __copy_to_user_inatomic_asm -#define __copy_from_user_inatomic __copy_from_user_inatomic_asm -#define __copy_from_user_zeroing __copy_from_user_zeroing_asm -#endif - #define IS_MEMCPY 0 #define IS_COPY_FROM_USER 1 #define IS_COPY_FROM_USER_ZEROING 2 @@ -44,6 +36,7 @@ */ #define EX \ .pushsection __ex_table, "a"; \ + .align 4; \ .word 9f, memcpy_common_fixup; \ .popsection; \ 9 @@ -158,12 +151,9 @@ EX: { sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 } { addi r3, r1, 60; andi r9, r9, -64 } -#if CHIP_HAS_WH64() /* No need to prefetch dst, we'll just do the wh64 * right before we copy a line. */ -#endif - EX: { lw r5, r3; addi r3, r3, 64; movei r4, 1 } /* Intentionally stall for a few cycles to leave L2 cache alone. */ { bnzt zero, .; move r27, lr } @@ -171,21 +161,6 @@ EX: { lw r6, r3; addi r3, r3, 64 } /* Intentionally stall for a few cycles to leave L2 cache alone. */ { bnzt zero, . } EX: { lw r7, r3; addi r3, r3, 64 } -#if !CHIP_HAS_WH64() - /* Prefetch the dest */ - /* Intentionally stall for a few cycles to leave L2 cache alone. */ - { bnzt zero, . } - /* Use a real load to cause a TLB miss if necessary. We aren't using - * r28, so this should be fine. - */ -EX: { lw r28, r9; addi r9, r9, 64 } - /* Intentionally stall for a few cycles to leave L2 cache alone. */ - { bnzt zero, . } - { prefetch r9; addi r9, r9, 64 } - /* Intentionally stall for a few cycles to leave L2 cache alone. */ - { bnzt zero, . } - { prefetch r9; addi r9, r9, 64 } -#endif /* Intentionally stall for a few cycles to leave L2 cache alone. */ { bz zero, .Lbig_loop2 } @@ -286,13 +261,8 @@ EX: { lw r7, r3; addi r3, r3, 64 } /* Fill second L1D line. */ EX: { lw r17, r17; addi r1, r1, 48; mvz r3, r13, r1 } /* r17 = WORD_4 */ -#if CHIP_HAS_WH64() /* Prepare destination line for writing. */ EX: { wh64 r9; addi r9, r9, 64 } -#else - /* Prefetch dest line */ - { prefetch r9; addi r9, r9, 64 } -#endif /* Load seven words that are L1D hits to cover wh64 L2 usage. */ /* Load the three remaining words from the last L1D line, which @@ -330,16 +300,7 @@ EX: { lw r18, r1; addi r1, r1, 4 } /* r18 = WORD_8 */ EX: { sw r0, r16; addi r0, r0, 4; add r16, r0, r2 } /* store(WORD_0) */ EX: { sw r0, r13; addi r0, r0, 4; andi r16, r16, -64 } /* store(WORD_1) */ EX: { sw r0, r14; addi r0, r0, 4; slt_u r16, r9, r16 } /* store(WORD_2) */ -#if CHIP_HAS_WH64() EX: { sw r0, r15; addi r0, r0, 4; addi r13, sp, -64 } /* store(WORD_3) */ -#else - /* Back up the r9 to a cache line we are already storing to - * if it gets past the end of the dest vector. Strictly speaking, - * we don't need to back up to the start of a cache line, but it's free - * and tidy, so why not? - */ -EX: { sw r0, r15; addi r0, r0, 4; andi r13, r0, -64 } /* store(WORD_3) */ -#endif /* Store second L1D line. */ EX: { sw r0, r17; addi r0, r0, 4; mvz r9, r16, r13 }/* store(WORD_4) */ EX: { sw r0, r19; addi r0, r0, 4 } /* store(WORD_5) */ @@ -403,7 +364,6 @@ EX: { sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 } .Ldest_is_word_aligned: -#if CHIP_HAS_DWORD_ALIGN() EX: { andi r8, r0, 63; lwadd_na r6, r1, 4} { slti_u r9, r2, 64; bz r8, .Ldest_is_L2_line_aligned } @@ -511,26 +471,6 @@ EX: { swadd r0, r13, 4; addi r2, r2, -32 } /* Move r1 back to the point where it corresponds to r0. */ { addi r1, r1, -4 } -#else /* !CHIP_HAS_DWORD_ALIGN() */ - - /* Compute right/left shift counts and load initial source words. */ - { andi r5, r1, -4; andi r3, r1, 3 } -EX: { lw r6, r5; addi r5, r5, 4; shli r3, r3, 3 } -EX: { lw r7, r5; addi r5, r5, 4; sub r4, zero, r3 } - - /* Load and store one word at a time, using shifts and ORs - * to correct for the misaligned src. - */ -.Lcopy_unaligned_src_loop: - { shr r6, r6, r3; shl r8, r7, r4 } -EX: { lw r7, r5; or r8, r8, r6; move r6, r7 } -EX: { sw r0, r8; addi r0, r0, 4; addi r2, r2, -4 } - { addi r5, r5, 4; slti_u r8, r2, 8 } - { bzt r8, .Lcopy_unaligned_src_loop; addi r1, r1, 4 } - - { bz r2, .Lcopy_unaligned_done } -#endif /* !CHIP_HAS_DWORD_ALIGN() */ - /* Fall through */ /* @@ -614,5 +554,6 @@ memcpy_fixup_loop: .size memcpy_common_fixup, . - memcpy_common_fixup .section __ex_table,"a" + .align 4 .word .Lcfu, .Lcopy_from_user_fixup_zero_remainder .word .Lctu, .Lcopy_to_user_fixup_done diff --git a/arch/tile/lib/memcpy_64.c b/arch/tile/lib/memcpy_64.c index c79b8e7c6828..4815354b8cd2 100644 --- a/arch/tile/lib/memcpy_64.c +++ b/arch/tile/lib/memcpy_64.c @@ -18,14 +18,17 @@ /* EXPORT_SYMBOL() is in arch/tile/lib/exports.c since this should be asm. */ /* Must be 8 bytes in size. */ -#define word_t uint64_t +#define op_t uint64_t -#if CHIP_L2_LINE_SIZE() != 64 && CHIP_L2_LINE_SIZE() != 128 -#error "Assumes 64 or 128 byte line size" +/* Threshold value for when to enter the unrolled loops. */ +#define OP_T_THRES 16 + +#if CHIP_L2_LINE_SIZE() != 64 +#error "Assumes 64 byte line size" #endif /* How many cache lines ahead should we prefetch? */ -#define PREFETCH_LINES_AHEAD 3 +#define PREFETCH_LINES_AHEAD 4 /* * Provide "base versions" of load and store for the normal code path. @@ -51,15 +54,16 @@ void *memcpy(void *__restrict dstv, const void *__restrict srcv, size_t n) * macros to return a count of uncopied bytes due to mm fault. */ #define RETVAL 0 -int USERCOPY_FUNC(void *__restrict dstv, const void *__restrict srcv, size_t n) +int __attribute__((optimize("omit-frame-pointer"))) +USERCOPY_FUNC(void *__restrict dstv, const void *__restrict srcv, size_t n) #endif { char *__restrict dst1 = (char *)dstv; const char *__restrict src1 = (const char *)srcv; const char *__restrict src1_end; const char *__restrict prefetch; - word_t *__restrict dst8; /* 8-byte pointer to destination memory. */ - word_t final; /* Final bytes to write to trailing word, if any */ + op_t *__restrict dst8; /* 8-byte pointer to destination memory. */ + op_t final; /* Final bytes to write to trailing word, if any */ long i; if (n < 16) { @@ -79,104 +83,228 @@ int USERCOPY_FUNC(void *__restrict dstv, const void *__restrict srcv, size_t n) for (i = 0; i < PREFETCH_LINES_AHEAD; i++) { __insn_prefetch(prefetch); prefetch += CHIP_L2_LINE_SIZE(); - prefetch = (prefetch > src1_end) ? prefetch : src1; + prefetch = (prefetch < src1_end) ? prefetch : src1; } /* Copy bytes until dst is word-aligned. */ - for (; (uintptr_t)dst1 & (sizeof(word_t) - 1); n--) + for (; (uintptr_t)dst1 & (sizeof(op_t) - 1); n--) ST1(dst1++, LD1(src1++)); /* 8-byte pointer to destination memory. */ - dst8 = (word_t *)dst1; - - if (__builtin_expect((uintptr_t)src1 & (sizeof(word_t) - 1), 0)) { - /* - * Misaligned copy. Copy 8 bytes at a time, but don't - * bother with other fanciness. - * - * TODO: Consider prefetching and using wh64 as well. - */ - - /* Create an aligned src8. */ - const word_t *__restrict src8 = - (const word_t *)((uintptr_t)src1 & -sizeof(word_t)); - word_t b; - - word_t a = LD8(src8++); - for (; n >= sizeof(word_t); n -= sizeof(word_t)) { - b = LD8(src8++); - a = __insn_dblalign(a, b, src1); - ST8(dst8++, a); - a = b; + dst8 = (op_t *)dst1; + + if (__builtin_expect((uintptr_t)src1 & (sizeof(op_t) - 1), 0)) { + /* Unaligned copy. */ + + op_t tmp0 = 0, tmp1 = 0, tmp2, tmp3; + const op_t *src8 = (const op_t *) ((uintptr_t)src1 & + -sizeof(op_t)); + const void *srci = (void *)src1; + int m; + + m = (CHIP_L2_LINE_SIZE() << 2) - + (((uintptr_t)dst8) & ((CHIP_L2_LINE_SIZE() << 2) - 1)); + m = (n < m) ? n : m; + m /= sizeof(op_t); + + /* Copy until 'dst' is cache-line-aligned. */ + n -= (sizeof(op_t) * m); + + switch (m % 4) { + case 0: + if (__builtin_expect(!m, 0)) + goto _M0; + tmp1 = LD8(src8++); + tmp2 = LD8(src8++); + goto _8B3; + case 2: + m += 2; + tmp3 = LD8(src8++); + tmp0 = LD8(src8++); + goto _8B1; + case 3: + m += 1; + tmp2 = LD8(src8++); + tmp3 = LD8(src8++); + goto _8B2; + case 1: + m--; + tmp0 = LD8(src8++); + tmp1 = LD8(src8++); + if (__builtin_expect(!m, 0)) + goto _8B0; + } + + do { + tmp2 = LD8(src8++); + tmp0 = __insn_dblalign(tmp0, tmp1, srci); + ST8(dst8++, tmp0); +_8B3: + tmp3 = LD8(src8++); + tmp1 = __insn_dblalign(tmp1, tmp2, srci); + ST8(dst8++, tmp1); +_8B2: + tmp0 = LD8(src8++); + tmp2 = __insn_dblalign(tmp2, tmp3, srci); + ST8(dst8++, tmp2); +_8B1: + tmp1 = LD8(src8++); + tmp3 = __insn_dblalign(tmp3, tmp0, srci); + ST8(dst8++, tmp3); + m -= 4; + } while (m); + +_8B0: + tmp0 = __insn_dblalign(tmp0, tmp1, srci); + ST8(dst8++, tmp0); + src8--; + +_M0: + if (__builtin_expect(n >= CHIP_L2_LINE_SIZE(), 0)) { + op_t tmp4, tmp5, tmp6, tmp7, tmp8; + + prefetch = ((const char *)src8) + + CHIP_L2_LINE_SIZE() * PREFETCH_LINES_AHEAD; + + for (tmp0 = LD8(src8++); n >= CHIP_L2_LINE_SIZE(); + n -= CHIP_L2_LINE_SIZE()) { + /* Prefetch and advance to next line to + prefetch, but don't go past the end. */ + __insn_prefetch(prefetch); + + /* Make sure prefetch got scheduled + earlier. */ + __asm__ ("" : : : "memory"); + + prefetch += CHIP_L2_LINE_SIZE(); + prefetch = (prefetch < src1_end) ? prefetch : + (const char *) src8; + + tmp1 = LD8(src8++); + tmp2 = LD8(src8++); + tmp3 = LD8(src8++); + tmp4 = LD8(src8++); + tmp5 = LD8(src8++); + tmp6 = LD8(src8++); + tmp7 = LD8(src8++); + tmp8 = LD8(src8++); + + tmp0 = __insn_dblalign(tmp0, tmp1, srci); + tmp1 = __insn_dblalign(tmp1, tmp2, srci); + tmp2 = __insn_dblalign(tmp2, tmp3, srci); + tmp3 = __insn_dblalign(tmp3, tmp4, srci); + tmp4 = __insn_dblalign(tmp4, tmp5, srci); + tmp5 = __insn_dblalign(tmp5, tmp6, srci); + tmp6 = __insn_dblalign(tmp6, tmp7, srci); + tmp7 = __insn_dblalign(tmp7, tmp8, srci); + + __insn_wh64(dst8); + + ST8(dst8++, tmp0); + ST8(dst8++, tmp1); + ST8(dst8++, tmp2); + ST8(dst8++, tmp3); + ST8(dst8++, tmp4); + ST8(dst8++, tmp5); + ST8(dst8++, tmp6); + ST8(dst8++, tmp7); + + tmp0 = tmp8; + } + src8--; + } + + /* Copy the rest 8-byte chunks. */ + if (n >= sizeof(op_t)) { + tmp0 = LD8(src8++); + for (; n >= sizeof(op_t); n -= sizeof(op_t)) { + tmp1 = LD8(src8++); + tmp0 = __insn_dblalign(tmp0, tmp1, srci); + ST8(dst8++, tmp0); + tmp0 = tmp1; + } + src8--; } if (n == 0) return RETVAL; - b = ((const char *)src8 <= src1_end) ? *src8 : 0; + tmp0 = LD8(src8++); + tmp1 = ((const char *)src8 <= src1_end) + ? LD8((op_t *)src8) : 0; + final = __insn_dblalign(tmp0, tmp1, srci); - /* - * Final source bytes to write to trailing partial - * word, if any. - */ - final = __insn_dblalign(a, b, src1); } else { /* Aligned copy. */ - const word_t* __restrict src8 = (const word_t *)src1; + const op_t *__restrict src8 = (const op_t *)src1; /* src8 and dst8 are both word-aligned. */ if (n >= CHIP_L2_LINE_SIZE()) { /* Copy until 'dst' is cache-line-aligned. */ for (; (uintptr_t)dst8 & (CHIP_L2_LINE_SIZE() - 1); - n -= sizeof(word_t)) + n -= sizeof(op_t)) ST8(dst8++, LD8(src8++)); for (; n >= CHIP_L2_LINE_SIZE(); ) { - __insn_wh64(dst8); + op_t tmp0, tmp1, tmp2, tmp3; + op_t tmp4, tmp5, tmp6, tmp7; /* * Prefetch and advance to next line - * to prefetch, but don't go past the end + * to prefetch, but don't go past the + * end. */ __insn_prefetch(prefetch); + + /* Make sure prefetch got scheduled + earlier. */ + __asm__ ("" : : : "memory"); + prefetch += CHIP_L2_LINE_SIZE(); - prefetch = (prefetch > src1_end) ? prefetch : + prefetch = (prefetch < src1_end) ? prefetch : (const char *)src8; /* - * Copy an entire cache line. Manually - * unrolled to avoid idiosyncracies of - * compiler unrolling. + * Do all the loads before wh64. This + * is necessary if [src8, src8+7] and + * [dst8, dst8+7] share the same cache + * line and dst8 <= src8, as can be + * the case when called from memmove, + * or with code tested on x86 whose + * memcpy always works with forward + * copies. */ -#define COPY_WORD(offset) ({ ST8(dst8+offset, LD8(src8+offset)); n -= 8; }) - COPY_WORD(0); - COPY_WORD(1); - COPY_WORD(2); - COPY_WORD(3); - COPY_WORD(4); - COPY_WORD(5); - COPY_WORD(6); - COPY_WORD(7); -#if CHIP_L2_LINE_SIZE() == 128 - COPY_WORD(8); - COPY_WORD(9); - COPY_WORD(10); - COPY_WORD(11); - COPY_WORD(12); - COPY_WORD(13); - COPY_WORD(14); - COPY_WORD(15); -#elif CHIP_L2_LINE_SIZE() != 64 -# error Fix code that assumes particular L2 cache line sizes -#endif + tmp0 = LD8(src8++); + tmp1 = LD8(src8++); + tmp2 = LD8(src8++); + tmp3 = LD8(src8++); + tmp4 = LD8(src8++); + tmp5 = LD8(src8++); + tmp6 = LD8(src8++); + tmp7 = LD8(src8++); + + /* wh64 and wait for tmp7 load completion. */ + __asm__ ("move %0, %0; wh64 %1\n" + : : "r"(tmp7), "r"(dst8)); - dst8 += CHIP_L2_LINE_SIZE() / sizeof(word_t); - src8 += CHIP_L2_LINE_SIZE() / sizeof(word_t); + ST8(dst8++, tmp0); + ST8(dst8++, tmp1); + ST8(dst8++, tmp2); + ST8(dst8++, tmp3); + ST8(dst8++, tmp4); + ST8(dst8++, tmp5); + ST8(dst8++, tmp6); + ST8(dst8++, tmp7); + + n -= CHIP_L2_LINE_SIZE(); } +#if CHIP_L2_LINE_SIZE() != 64 +# error "Fix code that assumes particular L2 cache line size." +#endif } - for (; n >= sizeof(word_t); n -= sizeof(word_t)) + for (; n >= sizeof(op_t); n -= sizeof(op_t)) ST8(dst8++, LD8(src8++)); if (__builtin_expect(n == 0, 1)) diff --git a/arch/tile/lib/memcpy_tile64.c b/arch/tile/lib/memcpy_tile64.c deleted file mode 100644 index 3bc4b4e40d93..000000000000 --- a/arch/tile/lib/memcpy_tile64.c +++ /dev/null @@ -1,276 +0,0 @@ -/* - * Copyright 2010 Tilera Corporation. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, version 2. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for - * more details. - */ - -#include <linux/string.h> -#include <linux/smp.h> -#include <linux/module.h> -#include <linux/uaccess.h> -#include <asm/fixmap.h> -#include <asm/kmap_types.h> -#include <asm/tlbflush.h> -#include <hv/hypervisor.h> -#include <arch/chip.h> - - -#if !CHIP_HAS_COHERENT_LOCAL_CACHE() - -/* Defined in memcpy.S */ -extern unsigned long __memcpy_asm(void *to, const void *from, unsigned long n); -extern unsigned long __copy_to_user_inatomic_asm( - void __user *to, const void *from, unsigned long n); -extern unsigned long __copy_from_user_inatomic_asm( - void *to, const void __user *from, unsigned long n); -extern unsigned long __copy_from_user_zeroing_asm( - void *to, const void __user *from, unsigned long n); - -typedef unsigned long (*memcpy_t)(void *, const void *, unsigned long); - -/* Size above which to consider TLB games for performance */ -#define LARGE_COPY_CUTOFF 2048 - -/* Communicate to the simulator what we are trying to do. */ -#define sim_allow_multiple_caching(b) \ - __insn_mtspr(SPR_SIM_CONTROL, \ - SIM_CONTROL_ALLOW_MULTIPLE_CACHING | ((b) << _SIM_CONTROL_OPERATOR_BITS)) - -/* - * Copy memory by briefly enabling incoherent cacheline-at-a-time mode. - * - * We set up our own source and destination PTEs that we fully control. - * This is the only way to guarantee that we don't race with another - * thread that is modifying the PTE; we can't afford to try the - * copy_{to,from}_user() technique of catching the interrupt, since - * we must run with interrupts disabled to avoid the risk of some - * other code seeing the incoherent data in our cache. (Recall that - * our cache is indexed by PA, so even if the other code doesn't use - * our kmap_atomic virtual addresses, they'll still hit in cache using - * the normal VAs that aren't supposed to hit in cache.) - */ -static void memcpy_multicache(void *dest, const void *source, - pte_t dst_pte, pte_t src_pte, int len) -{ - int idx; - unsigned long flags, newsrc, newdst; - pmd_t *pmdp; - pte_t *ptep; - int type0, type1; - int cpu = get_cpu(); - - /* - * Disable interrupts so that we don't recurse into memcpy() - * in an interrupt handler, nor accidentally reference - * the PA of the source from an interrupt routine. Also - * notify the simulator that we're playing games so we don't - * generate spurious coherency warnings. - */ - local_irq_save(flags); - sim_allow_multiple_caching(1); - - /* Set up the new dest mapping */ - type0 = kmap_atomic_idx_push(); - idx = FIX_KMAP_BEGIN + (KM_TYPE_NR * cpu) + type0; - newdst = __fix_to_virt(idx) + ((unsigned long)dest & (PAGE_SIZE-1)); - pmdp = pmd_offset(pud_offset(pgd_offset_k(newdst), newdst), newdst); - ptep = pte_offset_kernel(pmdp, newdst); - if (pte_val(*ptep) != pte_val(dst_pte)) { - set_pte(ptep, dst_pte); - local_flush_tlb_page(NULL, newdst, PAGE_SIZE); - } - - /* Set up the new source mapping */ - type1 = kmap_atomic_idx_push(); - idx += (type0 - type1); - src_pte = hv_pte_set_nc(src_pte); - src_pte = hv_pte_clear_writable(src_pte); /* be paranoid */ - newsrc = __fix_to_virt(idx) + ((unsigned long)source & (PAGE_SIZE-1)); - pmdp = pmd_offset(pud_offset(pgd_offset_k(newsrc), newsrc), newsrc); - ptep = pte_offset_kernel(pmdp, newsrc); - __set_pte(ptep, src_pte); /* set_pte() would be confused by this */ - local_flush_tlb_page(NULL, newsrc, PAGE_SIZE); - - /* Actually move the data. */ - __memcpy_asm((void *)newdst, (const void *)newsrc, len); - - /* - * Remap the source as locally-cached and not OLOC'ed so that - * we can inval without also invaling the remote cpu's cache. - * This also avoids known errata with inv'ing cacheable oloc data. - */ - src_pte = hv_pte_set_mode(src_pte, HV_PTE_MODE_CACHE_NO_L3); - src_pte = hv_pte_set_writable(src_pte); /* need write access for inv */ - __set_pte(ptep, src_pte); /* set_pte() would be confused by this */ - local_flush_tlb_page(NULL, newsrc, PAGE_SIZE); - - /* - * Do the actual invalidation, covering the full L2 cache line - * at the end since __memcpy_asm() is somewhat aggressive. - */ - __inv_buffer((void *)newsrc, len); - - /* - * We're done: notify the simulator that all is back to normal, - * and re-enable interrupts and pre-emption. - */ - kmap_atomic_idx_pop(); - kmap_atomic_idx_pop(); - sim_allow_multiple_caching(0); - local_irq_restore(flags); - put_cpu(); -} - -/* - * Identify large copies from remotely-cached memory, and copy them - * via memcpy_multicache() if they look good, otherwise fall back - * to the particular kind of copying passed as the memcpy_t function. - */ -static unsigned long fast_copy(void *dest, const void *source, int len, - memcpy_t func) -{ - /* - * Check if it's big enough to bother with. We may end up doing a - * small copy via TLB manipulation if we're near a page boundary, - * but presumably we'll make it up when we hit the second page. - */ - while (len >= LARGE_COPY_CUTOFF) { - int copy_size, bytes_left_on_page; - pte_t *src_ptep, *dst_ptep; - pte_t src_pte, dst_pte; - struct page *src_page, *dst_page; - - /* Is the source page oloc'ed to a remote cpu? */ -retry_source: - src_ptep = virt_to_pte(current->mm, (unsigned long)source); - if (src_ptep == NULL) - break; - src_pte = *src_ptep; - if (!hv_pte_get_present(src_pte) || - !hv_pte_get_readable(src_pte) || - hv_pte_get_mode(src_pte) != HV_PTE_MODE_CACHE_TILE_L3) - break; - if (get_remote_cache_cpu(src_pte) == smp_processor_id()) - break; - src_page = pfn_to_page(pte_pfn(src_pte)); - get_page(src_page); - if (pte_val(src_pte) != pte_val(*src_ptep)) { - put_page(src_page); - goto retry_source; - } - if (pte_huge(src_pte)) { - /* Adjust the PTE to correspond to a small page */ - int pfn = pte_pfn(src_pte); - pfn += (((unsigned long)source & (HPAGE_SIZE-1)) - >> PAGE_SHIFT); - src_pte = pfn_pte(pfn, src_pte); - src_pte = pte_mksmall(src_pte); - } - - /* Is the destination page writable? */ -retry_dest: - dst_ptep = virt_to_pte(current->mm, (unsigned long)dest); - if (dst_ptep == NULL) { - put_page(src_page); - break; - } - dst_pte = *dst_ptep; - if (!hv_pte_get_present(dst_pte) || - !hv_pte_get_writable(dst_pte)) { - put_page(src_page); - break; - } - dst_page = pfn_to_page(pte_pfn(dst_pte)); - if (dst_page == src_page) { - /* - * Source and dest are on the same page; this - * potentially exposes us to incoherence if any - * part of src and dest overlap on a cache line. - * Just give up rather than trying to be precise. - */ - put_page(src_page); - break; - } - get_page(dst_page); - if (pte_val(dst_pte) != pte_val(*dst_ptep)) { - put_page(dst_page); - goto retry_dest; - } - if (pte_huge(dst_pte)) { - /* Adjust the PTE to correspond to a small page */ - int pfn = pte_pfn(dst_pte); - pfn += (((unsigned long)dest & (HPAGE_SIZE-1)) - >> PAGE_SHIFT); - dst_pte = pfn_pte(pfn, dst_pte); - dst_pte = pte_mksmall(dst_pte); - } - - /* All looks good: create a cachable PTE and copy from it */ - copy_size = len; - bytes_left_on_page = - PAGE_SIZE - (((int)source) & (PAGE_SIZE-1)); - if (copy_size > bytes_left_on_page) - copy_size = bytes_left_on_page; - bytes_left_on_page = - PAGE_SIZE - (((int)dest) & (PAGE_SIZE-1)); - if (copy_size > bytes_left_on_page) - copy_size = bytes_left_on_page; - memcpy_multicache(dest, source, dst_pte, src_pte, copy_size); - - /* Release the pages */ - put_page(dst_page); - put_page(src_page); - - /* Continue on the next page */ - dest += copy_size; - source += copy_size; - len -= copy_size; - } - - return func(dest, source, len); -} - -void *memcpy(void *to, const void *from, __kernel_size_t n) -{ - if (n < LARGE_COPY_CUTOFF) - return (void *)__memcpy_asm(to, from, n); - else - return (void *)fast_copy(to, from, n, __memcpy_asm); -} - -unsigned long __copy_to_user_inatomic(void __user *to, const void *from, - unsigned long n) -{ - if (n < LARGE_COPY_CUTOFF) - return __copy_to_user_inatomic_asm(to, from, n); - else - return fast_copy(to, from, n, __copy_to_user_inatomic_asm); -} - -unsigned long __copy_from_user_inatomic(void *to, const void __user *from, - unsigned long n) -{ - if (n < LARGE_COPY_CUTOFF) - return __copy_from_user_inatomic_asm(to, from, n); - else - return fast_copy(to, from, n, __copy_from_user_inatomic_asm); -} - -unsigned long __copy_from_user_zeroing(void *to, const void __user *from, - unsigned long n) -{ - if (n < LARGE_COPY_CUTOFF) - return __copy_from_user_zeroing_asm(to, from, n); - else - return fast_copy(to, from, n, __copy_from_user_zeroing_asm); -} - -#endif /* !CHIP_HAS_COHERENT_LOCAL_CACHE() */ diff --git a/arch/tile/lib/memcpy_user_64.c b/arch/tile/lib/memcpy_user_64.c index 37440caa7370..88c7016492c4 100644 --- a/arch/tile/lib/memcpy_user_64.c +++ b/arch/tile/lib/memcpy_user_64.c @@ -31,6 +31,7 @@ ".pushsection .coldtext.memcpy,\"ax\";" \ "2: { move r0, %2; jrp lr };" \ ".section __ex_table,\"a\";" \ + ".align 8;" \ ".quad 1b, 2b;" \ ".popsection" \ : "=m" (*(p)) : "r" (v), "r" (n)); \ @@ -43,6 +44,7 @@ ".pushsection .coldtext.memcpy,\"ax\";" \ "2: { move r0, %2; jrp lr };" \ ".section __ex_table,\"a\";" \ + ".align 8;" \ ".quad 1b, 2b;" \ ".popsection" \ : "=r" (__v) : "m" (*(p)), "r" (n)); \ diff --git a/arch/tile/lib/memset_32.c b/arch/tile/lib/memset_32.c index 57dbb3a5bff8..2042bfe6595f 100644 --- a/arch/tile/lib/memset_32.c +++ b/arch/tile/lib/memset_32.c @@ -12,13 +12,10 @@ * more details. */ -#include <arch/chip.h> - #include <linux/types.h> #include <linux/string.h> #include <linux/module.h> - -#undef memset +#include <arch/chip.h> void *memset(void *s, int c, size_t n) { @@ -26,11 +23,7 @@ void *memset(void *s, int c, size_t n) int n32; uint32_t v16, v32; uint8_t *out8 = s; -#if !CHIP_HAS_WH64() - int ahead32; -#else int to_align32; -#endif /* Experimentation shows that a trivial tight loop is a win up until * around a size of 20, where writing a word at a time starts to win. @@ -61,21 +54,6 @@ void *memset(void *s, int c, size_t n) return s; } -#if !CHIP_HAS_WH64() - /* Use a spare issue slot to start prefetching the first cache - * line early. This instruction is free as the store can be buried - * in otherwise idle issue slots doing ALU ops. - */ - __insn_prefetch(out8); - - /* We prefetch the end so that a short memset that spans two cache - * lines gets some prefetching benefit. Again we believe this is free - * to issue. - */ - __insn_prefetch(&out8[n - 1]); -#endif /* !CHIP_HAS_WH64() */ - - /* Align 'out8'. We know n >= 3 so this won't write past the end. */ while (((uintptr_t) out8 & 3) != 0) { *out8++ = c; @@ -96,90 +74,6 @@ void *memset(void *s, int c, size_t n) /* This must be at least 8 or the following loop doesn't work. */ #define CACHE_LINE_SIZE_IN_WORDS (CHIP_L2_LINE_SIZE() / 4) -#if !CHIP_HAS_WH64() - - ahead32 = CACHE_LINE_SIZE_IN_WORDS; - - /* We already prefetched the first and last cache lines, so - * we only need to do more prefetching if we are storing - * to more than two cache lines. - */ - if (n32 > CACHE_LINE_SIZE_IN_WORDS * 2) { - int i; - - /* Prefetch the next several cache lines. - * This is the setup code for the software-pipelined - * loop below. - */ -#define MAX_PREFETCH 5 - ahead32 = n32 & -CACHE_LINE_SIZE_IN_WORDS; - if (ahead32 > MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS) - ahead32 = MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS; - - for (i = CACHE_LINE_SIZE_IN_WORDS; - i < ahead32; i += CACHE_LINE_SIZE_IN_WORDS) - __insn_prefetch(&out32[i]); - } - - if (n32 > ahead32) { - while (1) { - int j; - - /* Prefetch by reading one word several cache lines - * ahead. Since loads are non-blocking this will - * cause the full cache line to be read while we are - * finishing earlier cache lines. Using a store - * here causes microarchitectural performance - * problems where a victimizing store miss goes to - * the head of the retry FIFO and locks the pipe for - * a few cycles. So a few subsequent stores in this - * loop go into the retry FIFO, and then later - * stores see other stores to the same cache line - * are already in the retry FIFO and themselves go - * into the retry FIFO, filling it up and grinding - * to a halt waiting for the original miss to be - * satisfied. - */ - __insn_prefetch(&out32[ahead32]); - -#if CACHE_LINE_SIZE_IN_WORDS % 4 != 0 -#error "Unhandled CACHE_LINE_SIZE_IN_WORDS" -#endif - - n32 -= CACHE_LINE_SIZE_IN_WORDS; - - /* Save icache space by only partially unrolling - * this loop. - */ - for (j = CACHE_LINE_SIZE_IN_WORDS / 4; j > 0; j--) { - *out32++ = v32; - *out32++ = v32; - *out32++ = v32; - *out32++ = v32; - } - - /* To save compiled code size, reuse this loop even - * when we run out of prefetching to do by dropping - * ahead32 down. - */ - if (n32 <= ahead32) { - /* Not even a full cache line left, - * so stop now. - */ - if (n32 < CACHE_LINE_SIZE_IN_WORDS) - break; - - /* Choose a small enough value that we don't - * prefetch past the end. There's no sense - * in touching cache lines we don't have to. - */ - ahead32 = CACHE_LINE_SIZE_IN_WORDS - 1; - } - } - } - -#else /* CHIP_HAS_WH64() */ - /* Determine how many words we need to emit before the 'out32' * pointer becomes aligned modulo the cache line size. */ @@ -236,8 +130,6 @@ void *memset(void *s, int c, size_t n) n32 &= CACHE_LINE_SIZE_IN_WORDS - 1; } -#endif /* CHIP_HAS_WH64() */ - /* Now handle any leftover values. */ if (n32 != 0) { do { diff --git a/arch/tile/lib/memset_64.c b/arch/tile/lib/memset_64.c index 3873085711d5..03ef69cd73de 100644 --- a/arch/tile/lib/memset_64.c +++ b/arch/tile/lib/memset_64.c @@ -12,13 +12,11 @@ * more details. */ -#include <arch/chip.h> - #include <linux/types.h> #include <linux/string.h> #include <linux/module.h> - -#undef memset +#include <arch/chip.h> +#include "string-endian.h" void *memset(void *s, int c, size_t n) { @@ -70,8 +68,7 @@ void *memset(void *s, int c, size_t n) n64 = n >> 3; /* Tile input byte out to 64 bits. */ - /* KLUDGE */ - v64 = 0x0101010101010101ULL * (uint8_t)c; + v64 = copy_byte(c); /* This must be at least 8 or the following loop doesn't work. */ #define CACHE_LINE_SIZE_IN_DOUBLEWORDS (CHIP_L2_LINE_SIZE() / 8) diff --git a/arch/tile/lib/strchr_32.c b/arch/tile/lib/strchr_32.c index c94e6f7ae7b5..841fe6963019 100644 --- a/arch/tile/lib/strchr_32.c +++ b/arch/tile/lib/strchr_32.c @@ -16,8 +16,6 @@ #include <linux/string.h> #include <linux/module.h> -#undef strchr - char *strchr(const char *s, int c) { int z, g; diff --git a/arch/tile/lib/strchr_64.c b/arch/tile/lib/strchr_64.c index f39f9dc422b0..fe6e31c06f8d 100644 --- a/arch/tile/lib/strchr_64.c +++ b/arch/tile/lib/strchr_64.c @@ -26,7 +26,7 @@ char *strchr(const char *s, int c) const uint64_t *p = (const uint64_t *)(s_int & -8); /* Create eight copies of the byte for which we are looking. */ - const uint64_t goal = 0x0101010101010101ULL * (uint8_t) c; + const uint64_t goal = copy_byte(c); /* Read the first aligned word, but force bytes before the string to * match neither zero nor goal (we make sure the high bit of each diff --git a/arch/tile/lib/string-endian.h b/arch/tile/lib/string-endian.h index c0eed7ce69c3..2e49cbfe9371 100644 --- a/arch/tile/lib/string-endian.h +++ b/arch/tile/lib/string-endian.h @@ -1,5 +1,5 @@ /* - * Copyright 2011 Tilera Corporation. All Rights Reserved. + * Copyright 2013 Tilera Corporation. All Rights Reserved. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -31,3 +31,14 @@ #define CFZ(x) __insn_clz(x) #define REVCZ(x) __insn_ctz(x) #endif + +/* + * Create eight copies of the byte in a uint64_t. Byte Shuffle uses + * the bytes of srcB as the index into the dest vector to select a + * byte. With all indices of zero, the first byte is copied into all + * the other bytes. + */ +static inline uint64_t copy_byte(uint8_t byte) +{ + return __insn_shufflebytes(byte, 0, 0); +} diff --git a/arch/tile/lib/strlen_32.c b/arch/tile/lib/strlen_32.c index 4974292a5534..f26f88e11e4a 100644 --- a/arch/tile/lib/strlen_32.c +++ b/arch/tile/lib/strlen_32.c @@ -16,8 +16,6 @@ #include <linux/string.h> #include <linux/module.h> -#undef strlen - size_t strlen(const char *s) { /* Get an aligned pointer. */ diff --git a/arch/tile/lib/strnlen_32.c b/arch/tile/lib/strnlen_32.c new file mode 100644 index 000000000000..1434141d9e01 --- /dev/null +++ b/arch/tile/lib/strnlen_32.c @@ -0,0 +1,47 @@ +/* + * Copyright 2013 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/module.h> + +size_t strnlen(const char *s, size_t count) +{ + /* Get an aligned pointer. */ + const uintptr_t s_int = (uintptr_t) s; + const uint32_t *p = (const uint32_t *)(s_int & -4); + size_t bytes_read = sizeof(*p) - (s_int & (sizeof(*p) - 1)); + size_t len; + uint32_t v, bits; + + /* Avoid page fault risk by not reading any bytes when count is 0. */ + if (count == 0) + return 0; + + /* Read first word, but force bytes before the string to be nonzero. */ + v = *p | ((1 << ((s_int << 3) & 31)) - 1); + + while ((bits = __insn_seqb(v, 0)) == 0) { + if (bytes_read >= count) { + /* Read COUNT bytes and didn't find the terminator. */ + return count; + } + v = *++p; + bytes_read += sizeof(v); + } + + len = ((const char *) p) + (__insn_ctz(bits) >> 3) - s; + return (len < count ? len : count); +} +EXPORT_SYMBOL(strnlen); diff --git a/arch/tile/lib/strnlen_64.c b/arch/tile/lib/strnlen_64.c new file mode 100644 index 000000000000..2e8de6a5136f --- /dev/null +++ b/arch/tile/lib/strnlen_64.c @@ -0,0 +1,48 @@ +/* + * Copyright 2013 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/module.h> +#include "string-endian.h" + +size_t strnlen(const char *s, size_t count) +{ + /* Get an aligned pointer. */ + const uintptr_t s_int = (uintptr_t) s; + const uint64_t *p = (const uint64_t *)(s_int & -8); + size_t bytes_read = sizeof(*p) - (s_int & (sizeof(*p) - 1)); + size_t len; + uint64_t v, bits; + + /* Avoid page fault risk by not reading any bytes when count is 0. */ + if (count == 0) + return 0; + + /* Read and MASK the first word. */ + v = *p | MASK(s_int); + + while ((bits = __insn_v1cmpeqi(v, 0)) == 0) { + if (bytes_read >= count) { + /* Read COUNT bytes and didn't find the terminator. */ + return count; + } + v = *++p; + bytes_read += sizeof(v); + } + + len = ((const char *) p) + (CFZ(bits) >> 3) - s; + return (len < count ? len : count); +} +EXPORT_SYMBOL(strnlen); diff --git a/arch/tile/lib/usercopy_32.S b/arch/tile/lib/usercopy_32.S index b62d002af009..1bc162224638 100644 --- a/arch/tile/lib/usercopy_32.S +++ b/arch/tile/lib/usercopy_32.S @@ -36,6 +36,7 @@ strnlen_user_fault: { move r0, zero; jrp lr } ENDPROC(strnlen_user_fault) .section __ex_table,"a" + .align 4 .word 1b, strnlen_user_fault .popsection @@ -47,18 +48,20 @@ strnlen_user_fault: */ STD_ENTRY(strncpy_from_user_asm) { bz r2, 2f; move r3, r0 } -1: { lb_u r4, r1; addi r1, r1, 1; addi r2, r2, -1 } +1: { lb_u r4, r1; addi r1, r1, 1; addi r2, r2, -1 } { sb r0, r4; addi r0, r0, 1 } - bz r2, 2f - bnzt r4, 1b - addi r0, r0, -1 /* don't count the trailing NUL */ -2: { sub r0, r0, r3; jrp lr } + bz r4, 2f + bnzt r2, 1b + { sub r0, r0, r3; jrp lr } +2: addi r0, r0, -1 /* don't count the trailing NUL */ + { sub r0, r0, r3; jrp lr } STD_ENDPROC(strncpy_from_user_asm) .pushsection .fixup,"ax" strncpy_from_user_fault: { movei r0, -EFAULT; jrp lr } ENDPROC(strncpy_from_user_fault) .section __ex_table,"a" + .align 4 .word 1b, strncpy_from_user_fault .popsection @@ -77,6 +80,7 @@ STD_ENTRY(clear_user_asm) bnzt r1, 1b 2: { move r0, r1; jrp lr } .pushsection __ex_table,"a" + .align 4 .word 1b, 2b .popsection @@ -86,6 +90,7 @@ STD_ENTRY(clear_user_asm) 2: { move r0, r1; jrp lr } STD_ENDPROC(clear_user_asm) .pushsection __ex_table,"a" + .align 4 .word 1b, 2b .popsection @@ -105,25 +110,7 @@ STD_ENTRY(flush_user_asm) 2: { move r0, r1; jrp lr } STD_ENDPROC(flush_user_asm) .pushsection __ex_table,"a" - .word 1b, 2b - .popsection - -/* - * inv_user_asm takes the user target address in r0 and the - * number of bytes to invalidate in r1. - * It returns the number of not inv'able bytes (hopefully zero) in r0. - */ -STD_ENTRY(inv_user_asm) - bz r1, 2f - { movei r2, L2_CACHE_BYTES; add r1, r0, r1 } - { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 } - { and r0, r0, r2; and r1, r1, r2 } - { sub r1, r1, r0 } -1: { inv r0; addi r1, r1, -CHIP_INV_STRIDE() } - { addi r0, r0, CHIP_INV_STRIDE(); bnzt r1, 1b } -2: { move r0, r1; jrp lr } - STD_ENDPROC(inv_user_asm) - .pushsection __ex_table,"a" + .align 4 .word 1b, 2b .popsection @@ -143,5 +130,6 @@ STD_ENTRY(finv_user_asm) 2: { move r0, r1; jrp lr } STD_ENDPROC(finv_user_asm) .pushsection __ex_table,"a" + .align 4 .word 1b, 2b .popsection diff --git a/arch/tile/lib/usercopy_64.S b/arch/tile/lib/usercopy_64.S index adb2dbbc70cd..b3b31a3306f8 100644 --- a/arch/tile/lib/usercopy_64.S +++ b/arch/tile/lib/usercopy_64.S @@ -36,6 +36,7 @@ strnlen_user_fault: { move r0, zero; jrp lr } ENDPROC(strnlen_user_fault) .section __ex_table,"a" + .align 8 .quad 1b, strnlen_user_fault .popsection @@ -47,18 +48,20 @@ strnlen_user_fault: */ STD_ENTRY(strncpy_from_user_asm) { beqz r2, 2f; move r3, r0 } -1: { ld1u r4, r1; addi r1, r1, 1; addi r2, r2, -1 } +1: { ld1u r4, r1; addi r1, r1, 1; addi r2, r2, -1 } { st1 r0, r4; addi r0, r0, 1 } - beqz r2, 2f - bnezt r4, 1b - addi r0, r0, -1 /* don't count the trailing NUL */ -2: { sub r0, r0, r3; jrp lr } + beqz r4, 2f + bnezt r2, 1b + { sub r0, r0, r3; jrp lr } +2: addi r0, r0, -1 /* don't count the trailing NUL */ + { sub r0, r0, r3; jrp lr } STD_ENDPROC(strncpy_from_user_asm) .pushsection .fixup,"ax" strncpy_from_user_fault: { movei r0, -EFAULT; jrp lr } ENDPROC(strncpy_from_user_fault) .section __ex_table,"a" + .align 8 .quad 1b, strncpy_from_user_fault .popsection @@ -77,6 +80,7 @@ STD_ENTRY(clear_user_asm) bnezt r1, 1b 2: { move r0, r1; jrp lr } .pushsection __ex_table,"a" + .align 8 .quad 1b, 2b .popsection @@ -86,6 +90,7 @@ STD_ENTRY(clear_user_asm) 2: { move r0, r1; jrp lr } STD_ENDPROC(clear_user_asm) .pushsection __ex_table,"a" + .align 8 .quad 1b, 2b .popsection @@ -105,25 +110,7 @@ STD_ENTRY(flush_user_asm) 2: { move r0, r1; jrp lr } STD_ENDPROC(flush_user_asm) .pushsection __ex_table,"a" - .quad 1b, 2b - .popsection - -/* - * inv_user_asm takes the user target address in r0 and the - * number of bytes to invalidate in r1. - * It returns the number of not inv'able bytes (hopefully zero) in r0. - */ -STD_ENTRY(inv_user_asm) - beqz r1, 2f - { movei r2, L2_CACHE_BYTES; add r1, r0, r1 } - { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 } - { and r0, r0, r2; and r1, r1, r2 } - { sub r1, r1, r0 } -1: { inv r0; addi r1, r1, -CHIP_INV_STRIDE() } - { addi r0, r0, CHIP_INV_STRIDE(); bnezt r1, 1b } -2: { move r0, r1; jrp lr } - STD_ENDPROC(inv_user_asm) - .pushsection __ex_table,"a" + .align 8 .quad 1b, 2b .popsection @@ -143,5 +130,6 @@ STD_ENTRY(finv_user_asm) 2: { move r0, r1; jrp lr } STD_ENDPROC(finv_user_asm) .pushsection __ex_table,"a" + .align 8 .quad 1b, 2b .popsection |