| xf.li | bdd93d5 | 2023-05-12 07:10:14 -0700 | [diff] [blame] | 1 | /* Copyright (C) 1996-2016 Free Software Foundation, Inc. | 
|  | 2 | Contributed by Richard Henderson (rth@tamu.edu) | 
|  | 3 | This file is part of the GNU C Library. | 
|  | 4 |  | 
|  | 5 | The GNU C Library is free software; you can redistribute it and/or | 
|  | 6 | modify it under the terms of the GNU Lesser General Public | 
|  | 7 | License as published by the Free Software Foundation; either | 
|  | 8 | version 2.1 of the License, or (at your option) any later version. | 
|  | 9 |  | 
|  | 10 | The GNU C Library is distributed in the hope that it will be useful, | 
|  | 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | 
|  | 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | 
|  | 13 | Lesser General Public License for more details. | 
|  | 14 |  | 
|  | 15 | You should have received a copy of the GNU Lesser General Public | 
|  | 16 | License along with the GNU C Library.  If not, see | 
|  | 17 | <http://www.gnu.org/licenses/>.  */ | 
|  | 18 |  | 
|  | 19 | /* Fill a block of memory with a character.  Optimized for the Alpha | 
|  | 20 | architecture: | 
|  | 21 |  | 
|  | 22 | - memory accessed as aligned quadwords only | 
|  | 23 | - destination memory not read unless needed for good cache behaviour | 
|  | 24 | - basic blocks arranged to optimize branch prediction for full-quadword | 
|  | 25 | aligned memory blocks. | 
|  | 26 | - partial head and tail quadwords constructed with byte-mask instructions | 
|  | 27 |  | 
|  | 28 | This is generally scheduled for the EV5 (got to look out for my own | 
|  | 29 | interests :-), but with EV4 needs in mind.  There *should* be no more | 
|  | 30 | stalls for the EV4 than there are for the EV5. | 
|  | 31 | */ | 
|  | 32 |  | 
|  | 33 |  | 
|  | 34 | #include <sysdep.h> | 
|  | 35 |  | 
|  | 36 | .set noat | 
|  | 37 | .set noreorder | 
|  | 38 |  | 
|  | 39 | .text | 
|  | 40 | .type	memset, @function | 
|  | 41 | .globl	memset | 
|  | 42 | .usepv	memset, USEPV_PROF | 
|  | 43 |  | 
|  | 44 | cfi_startproc | 
|  | 45 |  | 
|  | 46 | /* Store loop and tail fixup reached from memset.  On entry to this basic block: | 
|  | 47 | t3 == loop counter (whole quadwords still to store) | 
|  | 48 | t4 == bytes in partial final word (0 means there is no tail) | 
|  | 49 | a0 == possibly misaligned destination pointer (only ever used with ldq_u/stq_u) | 
|  | 50 | a1 == source character replicated into all eight bytes  */ | 
|  | 51 |  | 
|  | 52 | .align 3 | 
|  | 53 | memset_loop: | 
|  | 54 | beq	t3, $tail | 
|  | 55 | blbc	t3, 0f		# low bit clear: count is even, skip the single store | 
|  | 56 |  | 
|  | 57 | stq_u	a1, 0(a0)	# e0    : odd count: store one quadword so the rest is even | 
|  | 58 | subq	t3, 1, t3	# .. e1 : one fewer quadword to go | 
|  | 59 | addq	a0, 8, a0	# e0    : advance dest past it | 
|  | 60 | beq	t3, $tail	# .. e1 : that was the only whole quadword, go do the tail | 
|  | 61 |  | 
|  | 62 | 0:	stq_u	a1, 0(a0)	# e0    : unrolled loop body: store two quadwords per pass | 
|  | 63 | subq	t3, 2, t3	# .. e1 : count -= 2 | 
|  | 64 | stq_u	a1, 8(a0)	# e0    : second quadword of the pair | 
|  | 65 | addq	a0, 16, a0	# .. e1 : dest += 16 | 
|  | 66 | bne	t3, 0b		# e1    : repeat until no whole quadwords remain | 
|  | 67 |  | 
|  | 68 | $tail:	bne	t4, 1f		# is there a partial tail quadword to do? | 
|  | 69 | ret			# no, finished (v0 was set on entry to memset) | 
|  | 70 |  | 
|  | 71 | .align 3 | 
|  | 72 | 1:	ldq_u	t0, 0(a0)	# e1    : yes, load original data of the final quadword | 
|  | 73 | mskql	a1, t4, t1	# .. e0 : fill pattern, kept only in the low t4 bytes | 
|  | 74 | mskqh	t0, t4, t0	# e0    : original data, kept only from byte t4 upward | 
|  | 75 | or	t0, t1, t0	# e1 (stall) : merge fill bytes with the preserved bytes | 
|  | 76 | stq_u	t0, 0(a0)	# e0    : write the merged quadword back | 
|  | 77 | ret			# .. e1 : done | 
|  | 78 |  | 
/* memset entry point.  On entry: a0 = destination, a1 = fill character,
   a2 = byte count.  Returns the original destination in v0.  Uses t0-t4 as
   scratch and modifies a0-a2.  The character is replicated across a
   quadword; a misaligned head quadword is patched by read-modify-write with
   the byte-mask instructions, then control passes to memset_loop for the
   whole quadwords and any partial tail.  A fill that starts misaligned and
   ends inside the same quadword is handled entirely at $oneq.  */
|  | 79 | memset: | 
|  | 80 | #ifdef PROF | 
|  | 81 | ldgp	gp, 0(pv) | 
|  | 82 | lda	AT, _mcount | 
|  | 83 | jsr	AT, (AT), _mcount | 
|  | 84 | #endif | 
|  | 85 |  | 
|  | 86 | and	a1, 0xff, a1	# e0    : zero extend input character | 
|  | 87 | mov	a0, v0		# .. e1 : move return value in place (done before any exit path) | 
|  | 88 | sll	a1, 8, t0	# e0    : begin replicating the char | 
|  | 89 | beq	a2, $done	# .. e1 : early exit for zero-length store | 
|  | 90 | or	t0, a1, a1	# e0    : char now in the low 2 bytes | 
|  | 91 | and	a0, 7, t1	# .. e1 : dest misalignment (byte offset within its quadword) | 
|  | 92 | sll	a1, 16, t0	# e0    : | 
|  | 93 | addq	a2, t1, a2	# .. e1 : add dest misalignment to count (count now runs from the quadword start) | 
|  | 94 | or	t0, a1, a1	# e0    : char now in the low 4 bytes | 
|  | 95 | srl	a2, 3, t3	# .. e1 : loop = count >> 3 (number of quadwords) | 
|  | 96 | sll	a1, 32, t0	# e0    : | 
|  | 97 | and	a2, 7, t4	# .. e1 : find number of bytes in tail | 
|  | 98 | or	t0, a1, a1	# e0    : character replication done (all 8 bytes) | 
|  | 99 |  | 
|  | 100 | beq	t1, memset_loop	# .. e1 : aligned head, jump right in | 
|  | 101 |  | 
|  | 102 | ldq_u	t0, 0(a0)	# e1    : load original data to mask into | 
|  | 103 | mskqh	a1, a0, t1	# .. e0 : fill pattern, kept only from dest's byte offset upward | 
|  | 104 |  | 
|  | 105 | cmpult	a2, 8, t2	# e0    : is this a sub-word set? (a2 already includes the misalignment) | 
|  | 106 | bne	t2, $oneq	# .. e1 (zdb) | 
|  | 107 |  | 
|  | 108 | mskql	t0, a0, t0	# e0    : we span words.  finish this partial (keep original bytes below dest) | 
|  | 109 | subq	t3, 1, t3	# .. e1 : the head quadword uses up one of the loop count | 
|  | 110 | addq	a0, 8, a0	# e0    : step dest into the next quadword | 
|  | 111 | or	t0, t1, t0	# .. e1 : merge preserved low bytes with fill bytes | 
|  | 112 | stq_u	t0, -8(a0)	# e0    : store the head quadword (the one just stepped past) | 
|  | 113 | br 	memset_loop	# .. e1 : continue with whole quadwords and tail | 
|  | 114 |  | 
|  | 115 | .align 3 | 
|  | 116 | $oneq: | 
|  | 117 | mskql	t1, a2, t1	# e0    : entire operation within one word; trim fill at the end offset | 
|  | 118 | mskql	t0, a0, t2	# e0    : original bytes below dest | 
|  | 119 | mskqh	t0, a2, t3	# e0    : original bytes from the end offset upward | 
|  | 120 | or	t1, t2, t0	# .. e1 : fill bytes + preserved low bytes | 
|  | 121 | or	t0, t3, t0	# e1    : + preserved high bytes | 
|  | 122 | stq_u	t0, 0(a0)	# e0 (stall) : write the merged quadword back | 
|  | 123 |  | 
|  | 124 | $done:	ret | 
|  | 125 |  | 
|  | 126 | cfi_endproc | 
|  | 127 | libc_hidden_builtin_def (memset) |