| xj | b04a402 | 2021-11-25 15:01:52 +0800 | [diff] [blame] | 1 | /* | 
|  | 2 | * Copyright (C) 2013 ARM Ltd. | 
|  | 3 | * Copyright (C) 2013 Linaro. | 
|  | 4 | * | 
|  | 5 | * This code is based on glibc cortex strings work originally authored by Linaro | 
|  | 6 | * and re-licensed under GPLv2 for the Linux kernel. The original code can | 
|  | 7 | * be found @ | 
|  | 8 | * | 
|  | 9 | * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ | 
|  | 10 | * files/head:/src/aarch64/ | 
|  | 11 | * | 
|  | 12 | * This program is free software; you can redistribute it and/or modify | 
|  | 13 | * it under the terms of the GNU General Public License version 2 as | 
|  | 14 | * published by the Free Software Foundation. | 
|  | 15 | * | 
|  | 16 | * This program is distributed in the hope that it will be useful, | 
|  | 17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 
|  | 18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
|  | 19 | * GNU General Public License for more details. | 
|  | 20 | * | 
|  | 21 | * You should have received a copy of the GNU General Public License | 
|  | 22 | * along with this program.  If not, see <http://www.gnu.org/licenses/>. | 
|  | 23 | */ | 
|  | 24 |  | 
|  | 25 | #include <linux/linkage.h> | 
|  | 26 | #include <asm/assembler.h> | 
|  | 27 |  | 
|  | 28 | /* | 
|  | 29 | * Calculate the length of a NUL-terminated string. | 
|  | 30 | * | 
|  | 31 | * Parameters: | 
|  | 32 | *	x0 - pointer to the NUL-terminated string | 
|  | 33 | * Returns: | 
|  | 34 | *	x0 - length of the string in bytes, not counting the terminating NUL | 
|  | 35 | */ | 
|  | 36 |  | 
|  | 37 | /* Arguments and results.  */ | 
|  | 38 | srcin		.req	x0	/* in: pointer to the start of the string */ | 
|  | 39 | len		.req	x0	/* out: result; shares x0 with srcin, first written by the instruction that reads srcin for the last time */ | 
|  | 40 |  | 
|  | 41 | /* Locals and temporaries.  */ | 
|  | 42 | src		.req	x1	/* read cursor: srcin rounded down to 16 bytes, post-incremented by 16 on every load */ | 
|  | 43 | data1		.req	x2	/* dword loaded from [src] (lower address of the 16-byte block) */ | 
|  | 44 | data2		.req	x3	/* dword loaded from [src + 8] */ | 
|  | 45 | data2a		.req	x4	/* data2 ORed with the pre-string byte mask (misaligned entry only) */ | 
|  | 46 | has_nul1	.req	x5	/* syndrome for data1: non-zero iff data1 contains a zero byte */ | 
|  | 47 | has_nul2	.req	x6	/* syndrome for data2; also holds the final syndrome fed to rev/clz */ | 
|  | 48 | tmp1		.req	x7	/* scratch: alignment offset / shift count / (X - 1) term */ | 
|  | 49 | tmp2		.req	x8	/* scratch: byte mask / (X | 0x7f) term */ | 
|  | 50 | tmp3		.req	x9	/* scratch: (X - 1) term for data2 */ | 
|  | 51 | tmp4		.req	x10	/* scratch: (X | 0x7f) term for data2 */ | 
|  | 52 | zeroones	.req	x11	/* holds REP8_01 for the whole routine */ | 
|  | 53 | pos		.req	x12	/* clz result: bit position of the first NUL marker */ | 
|  | 54 |  | 
|  | 55 | #define REP8_01 0x0101010101010101 /* 0x01 in each of the 8 bytes; subtracted to form (X - 1) per byte */ | 
|  | 56 | #define REP8_7f 0x7f7f7f7f7f7f7f7f /* 0x7f in each byte; ORed into X so the bic leaves only bit 7 of each byte */ | 
|  | 57 | #define REP8_80 0x8080808080808080 /* 0x80 in each byte; not referenced by the code visible in this file */ | 
|  | 58 |  | 
|  | 59 | WEAK(strlen)	/* entry: x0 = srcin; returns the byte offset of the first NUL in x0. WEAK() is defined outside this file (presumably emits a weak symbol -- confirm) */ | 
|  | 60 | mov	zeroones, #REP8_01	/* 0x01 in every byte lane, kept for the whole routine */ | 
|  | 61 | bic	src, srcin, #15	/* src = srcin rounded down to a 16-byte boundary */ | 
|  | 62 | ands	tmp1, srcin, #15	/* tmp1 = offset of srcin inside that block; Z set if already aligned */ | 
|  | 63 | b.ne	.Lmisaligned	/* non-zero offset: bytes before srcin must be masked first */ | 
|  | 64 | /* | 
|  | 65 | * NUL detection works on the principle that (X - 1) & (~X) & 0x80 | 
|  | 66 | * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and | 
|  | 67 | * can be done in parallel across the entire word. | 
|  | 68 | */ | 
|  | 69 | /* | 
|  | 70 | * The inner loop deals with two Dwords at a time. This has a | 
|  | 71 | * slightly higher start-up cost, but we should win quite quickly, | 
|  | 72 | * especially on cores with a high number of issue slots per | 
|  | 73 | * cycle, as we get much better parallelism out of the operations. | 
|  | 74 | */ | 
|  | 75 | .Lloop:	/* main loop; src is 16-byte aligned whenever control reaches here */ | 
|  | 76 | ldp	data1, data2, [src], #16	/* load 16 bytes, then advance src by 16 */ | 
|  | 77 | .Lrealigned:	/* also entered from .Lmisaligned with data1/data2 already loaded and masked */ | 
|  | 78 | sub	tmp1, data1, zeroones	/* data1 - 0x01.. : the (X - 1) term */ | 
|  | 79 | orr	tmp2, data1, #REP8_7f	/* data1 | 0x7f.. : the (X | 0x7f) term */ | 
|  | 80 | sub	tmp3, data2, zeroones	/* same two terms for data2 */ | 
|  | 81 | orr	tmp4, data2, #REP8_7f | 
|  | 82 | bic	has_nul1, tmp1, tmp2	/* has_nul1 = (X - 1) & ~(X | 0x7f) for data1 */ | 
|  | 83 | bics	has_nul2, tmp3, tmp4	/* same for data2; Z set iff has_nul2 == 0 */ | 
|  | 84 | ccmp	has_nul1, #0, #0, eq	/* if has_nul2 == 0 compare has_nul1 with 0, else force NZCV = 0000 (ne)  */ | 
|  | 85 | b.eq	.Lloop	/* eq only when both syndromes are zero: no NUL in these 16 bytes */ | 
|  | 86 |  | 
|  | 87 | sub	len, src, srcin	/* src is 16 past the block just examined; this overwrites srcin (same register) */ | 
|  | 88 | cbz	has_nul1, .Lnul_in_data2	/* data1 has no NUL, so the first NUL is in data2 */ | 
|  | 89 | CPU_BE(	mov	data2, data1 )	/* BE only: the syndrome is recomputed from data2 below, so copy data1 there */ | 
|  | 90 | sub	len, len, #8	/* first NUL is in data1: step back one extra dword */ | 
|  | 91 | mov	has_nul2, has_nul1	/* from here on has_nul2 is the only syndrome used */ | 
|  | 92 | .Lnul_in_data2:	/* has_nul2 (and, on BE, data2) now refer to the dword holding the first NUL */ | 
|  | 93 | /* | 
|  | 94 | * For big-endian, carry propagation (if the final byte in the | 
|  | 95 | * string is 0x01) means we cannot use has_nul directly.  The | 
|  | 96 | * easiest way to get the correct byte is to byte-swap the data | 
|  | 97 | * and calculate the syndrome a second time. | 
|  | 98 | */ | 
|  | 99 | CPU_BE( rev	data2, data2 )	/* BE only: byte-swap so the earliest byte in memory sits at the LSB */ | 
|  | 100 | CPU_BE( sub	tmp1, data2, zeroones )	/* BE only: recompute the (X - 1) term */ | 
|  | 101 | CPU_BE( orr	tmp2, data2, #REP8_7f )	/* BE only: recompute the (X | 0x7f) term */ | 
|  | 102 | CPU_BE( bic	has_nul2, tmp1, tmp2 )	/* BE only: syndrome of the byte-swapped dword */ | 
|  | 103 |  | 
|  | 104 | sub	len, len, #8	/* len = byte offset, from srcin, of the dword holding the first NUL */ | 
|  | 105 | rev	has_nul2, has_nul2	/* byte-swap so the marker of the earliest byte in memory is the most significant */ | 
|  | 106 | clz	pos, has_nul2	/* pos = number of bits ahead of the first NUL marker (8 per preceding byte) */ | 
|  | 107 | add	len, len, pos, lsr #3		/* Bits to bytes.  */ | 
|  | 108 | ret | 
|  | 109 |  | 
|  | 110 | .Lmisaligned:	/* here tmp1 = srcin & 15, in the range 1..15 */ | 
|  | 111 | cmp	tmp1, #8	/* flags are consumed by csinv/csel below; nothing in between writes them. le <=> offset <= 8 */ | 
|  | 112 | neg	tmp1, tmp1	/* negate so the register shift below (count taken mod 64) leaves one 0xff byte per pre-string byte */ | 
|  | 113 | ldp	data1, data2, [src], #16	/* load the aligned 16-byte block that contains srcin */ | 
|  | 114 | lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */ | 
|  | 115 | mov	tmp2, #~0	/* all ones; shifted below into a mask over the bytes that precede srcin */ | 
|  | 116 | /* Big-endian.  Early bytes are at MSB.  */ | 
|  | 117 | CPU_BE( lsl	tmp2, tmp2, tmp1 )	/* Shift (tmp1 & 63).  */ | 
|  | 118 | /* Little-endian.  Early bytes are at LSB.  */ | 
|  | 119 | CPU_LE( lsr	tmp2, tmp2, tmp1 )	/* Shift (tmp1 & 63).  */ | 
|  | 120 |  | 
|  | 121 | orr	data1, data1, tmp2	/* force the masked bytes of data1 to 0xff so they cannot be taken for NUL */ | 
|  | 122 | orr	data2a, data2, tmp2	/* same mask applied to a copy of data2 */ | 
|  | 123 | csinv	data1, data1, xzr, le	/* offset <= 8: keep masked data1; offset > 8: data1 = ~0 (it lies wholly before the string) */ | 
|  | 124 | csel	data2, data2, data2a, le	/* offset <= 8: data2 unchanged; offset > 8: use the masked copy */ | 
|  | 125 | b	.Lrealigned	/* join the main loop body with the pre-masked data */ | 
|  | 126 | ENDPIPROC(strlen) | 