/* arch/alpha/lib/memset.S */

/*
 * linux/arch/alpha/lib/memset.S
 *
 * This is an efficient (and small) implementation of the C library "memset()"
 * function for the alpha.
 *
 *	(C) Copyright 1996 Linus Torvalds
 *
 * This routine is "moral-ware": you are free to use it any way you wish, and
 * the only obligation I put on you is a moral one: if you make any improvements
 * to the routine, please send me your improvements for me to use similarly.
 *
 * The scheduling comments follow the EV5 documentation. They were done by
 * hand, so they may well be incorrect; please do tell me if they are.
 */
#include <asm/export.h>
	.set noat
	.set noreorder
.text
	.globl memset
	.globl __memset
	.globl ___memset
	.globl __memset16
	.globl __constant_c_memset

	.ent ___memset
.align 5
/*
 * ___memset / __constant_c_memset: fill a run of bytes with one value.
 *
 * Registers as the code below uses them:
 *	$16	destination address (moved up to a quad boundary when the
 *		unaligned-head path runs)
 *	$17	fill value.  ___memset looks only at the low byte and
 *		replicates it; __constant_c_memset expects the pattern to be
 *		replicated across all eight bytes already.
 *	$18	byte count.  It is tested with a signed compare, so a count
 *		that is zero or negative stores nothing.
 *	$0	loaded with the original destination before any store and
 *		not written again, so it holds that value on return
 *	$6	end address (destination + count), kept for the tail masks
 *	$26	return address used by both "ret" instructions
 *	$1-$5, $7 are scratch.
 *
 * Shape of the routine:
 *	1. start and end inside the same aligned quad -> within_one_quad
 *	2. otherwise, if the start is unaligned, merge the fill into the
 *	   first quad with a read-modify-write
 *	3. store whole quads in "loop"
 *	4. merge the fill into the last, partial quad (no_quad)
 *
 * The E0/E1 notes on each instruction are the original author's EV5
 * issue-slot annotations.  Instructions whose destination is $31 are
 * no-ops that only pad an issue slot.
 */
___memset:
	.frame $30,0,$26,0
	.prologue 0

	/* Replicate the low byte of $17: 1 byte -> 2 -> 4 -> 8 bytes. */
	and $17,255,$1		/* E1 */
	insbl $17,1,$17		/* .. E0 */
	bis $17,$1,$17		/* E0 (p-c latency, next cycle) */
	sll $17,16,$1		/* E1 (p-c latency, next cycle) */

	bis $17,$1,$17		/* E0 (p-c latency, next cycle) */
	sll $17,32,$1		/* E1 (p-c latency, next cycle) */
	bis $17,$1,$17		/* E0 (p-c latency, next cycle) */
	ldq_u $31,0($30)	/* .. E1 */

.align 5
/*
 * Second entry point: $17 already holds the full 8-byte pattern.
 * ___memset falls through to here; __memset16 branches here.
 */
__constant_c_memset:
	/*
	 * $6 = end address, $0 = original destination.  $1 gets the bits in
	 * which start and end differ; a count <= 0 returns immediately.
	 */
	addq $18,$16,$6		/* E0 */
	bis $16,$16,$0		/* .. E1 */
	xor $16,$6,$1		/* E0 */
	ble $18,end		/* .. E1 */

	/*
	 * Ignoring the low three bits, start and end agree only when both lie
	 * in the same aligned quad.  Otherwise $3 = start offset inside its
	 * quad, and an offset of zero skips the head fix-up.
	 */
	bic $1,7,$1		/* E0 */
	beq $1,within_one_quad	/* .. E1 (note EV5 zero-latency forwarding) */
	and $16,7,$3		/* E0 */
	beq $3,aligned		/* .. E1 (note EV5 zero-latency forwarding) */

	/*
	 * Unaligned head.  Load the quad that contains the start, remember
	 * the start address in $5 for the store, and build in $2 the pattern
	 * shifted up so it begins at the start offset.  $3 becomes
	 * offset - 8, i.e. minus the number of head bytes.
	 */
	ldq_u $4,0($16)		/* E0 */
	bis $16,$16,$5		/* .. E1 */
	insql $17,$16,$2	/* E0 */
	subq $3,8,$3		/* .. E1 */

	/*
	 * Take the head bytes off the count, keep only the old bytes below
	 * the start offset, advance $16 to the next quad boundary and merge
	 * old bytes with the pattern.
	 */
	addq $18,$3,$18		/* E0	$18 is new count ($3 is negative) */
	mskql $4,$16,$4		/* .. E1 (and possible load stall) */
	subq $16,$3,$16		/* E0 	$16 is new aligned destination */
	bis $2,$4,$1		/* .. E1 */

	/* Write the merged first quad back; the rest are padding no-ops. */
	bis $31,$31,$31		/* E0 */
	ldq_u $31,0($30)	/* .. E1 */
	stq_u $1,0($5)		/* E0 */
	bis $31,$31,$31		/* .. E1 */

.align 4
/*
 * $16 is quad aligned here.  $3 = number of whole quads, $18 = bytes left
 * over for the tail, $5 = running store pointer.
 */
aligned:
	sra $18,3,$3		/* E0 */
	and $18,7,$18		/* .. E1 */
	bis $16,$16,$5		/* E0 */
	beq $3,no_quad		/* .. E1 */

.align 3
/* Store one full quad per iteration until the quad count reaches zero. */
loop:
	stq $17,0($5)		/* E0 */
	subq $3,1,$3		/* .. E1 */
	addq $5,8,$5		/* E0 */
	bne $3,loop		/* .. E1 */

/*
 * Tail.  Nothing to do when no bytes are left.  Otherwise load the quad
 * at $5 and keep its bytes from the end offset upward.  $5 is aligned, so
 * the low three bits of $6 equal the number of tail bytes.
 */
no_quad:
	bis $31,$31,$31		/* E0 */
	beq $18,end		/* .. E1 */
	ldq $7,0($5)		/* E0 */
	mskqh $7,$6,$2		/* .. E1 (and load stall) */

	/*
	 * Place the pattern in the bytes below the end offset, merge it with
	 * the preserved upper bytes, store the quad and return from here.
	 */
	insqh $17,$6,$4		/* E0 */
	bis $2,$4,$1		/* .. E1 */
	stq $1,0($5)		/* E0 */
	ret $31,($26),1		/* .. E1 */

.align 3
/*
 * The whole range lies inside one aligned quad.  $1 keeps the original
 * quad; $2 becomes the original bytes below the start offset combined
 * with the pattern from the start offset upward.
 */
within_one_quad:
	ldq_u $1,0($16)		/* E0 */
	insql $17,$16,$2	/* E1 */
	mskql $1,$16,$4		/* E0 (after load stall) */
	bis $2,$4,$2		/* E0 */

	/*
	 * Cut that off at the end offset, restore the original bytes from
	 * the end offset upward, and write the quad back.  Falls through to
	 * the shared return.
	 */
	mskql $2,$6,$4		/* E0 */
	mskqh $1,$6,$2		/* .. E1 */
	bis $2,$4,$1		/* E0 */
	stq_u $1,0($16)		/* E0 */

/* Shared return point; also the target for the "nothing to do" branches. */
end:
	ret $31,($26),1		/* E1 */
	.end ___memset
/* EXPORT_SYMBOL is provided by <asm/export.h>, which is not shown here. */
EXPORT_SYMBOL(___memset)
EXPORT_SYMBOL(__constant_c_memset)

	.align 5
	.ent __memset16
/*
 * __memset16: fill memory with a 16-bit pattern.
 *
 * The low 16 bits of $17 are copied into all four 16-bit lanes of $17,
 * and control then branches to __constant_c_memset, which does the
 * stores.  $16 and $18 are passed through untouched, so they carry the
 * meaning __constant_c_memset gives them: destination address and byte
 * count.  The branch does not link, so the "ret" in that routine goes
 * back through the $26 this routine was entered with.
 *
 * NOTE(review): the pattern is stored at quad-aligned lane positions, so
 * this presumably expects a destination that is at least 2-byte aligned.
 * Confirm against the callers.
 *
 * Scratch registers written here: $1-$4.
 */
__memset16:
	.prologue 0

	/*
	 * $1..$4 = the low word of $17 placed at byte offsets 0, 2, 4 and 6.
	 * The ORs are interleaved with the inserts and leave the combined
	 * 64-bit pattern in $17.
	 */
	inswl $17,0,$1		/* E0 */
	inswl $17,2,$2		/* E0 */
	inswl $17,4,$3		/* E0 */
	or $1,$2,$1		/* .. E1 */
	inswl $17,6,$4		/* E0 */
	or $1,$3,$1		/* .. E1 */
	or $1,$4,$17		/* E0 */
	br __constant_c_memset	/* .. E1 */

	.end __memset16
/* EXPORT_SYMBOL is provided by <asm/export.h>, which is not shown here. */
EXPORT_SYMBOL(__memset16)

/*
 * memset and __memset are assigned the value of ___memset, so all three
 * names refer to the same entry point.  Both aliases are declared .globl
 * near the top of the file and are passed to EXPORT_SYMBOL here.
 */
memset = ___memset
__memset = ___memset
	EXPORT_SYMBOL(memset)
	EXPORT_SYMBOL(__memset)