/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              IP/TCP/UDP checksumming routines
 *
 * Xtensa version:  Copyright (C) 2001 Tensilica, Inc. by Kevin Chea
 *                  Optimized by Joe Taylor
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

#include <linux/errno.h>
#include <linux/linkage.h>
#include <variant/core.h>
#include <asm/asmmacro.h>

/*
 * computes a partial checksum, e.g. for TCP/UDP fragments
 */

/*
 * unsigned int csum_partial(const unsigned char *buf, int len,
 *                           unsigned int sum);
 *    a2 = buf
 *    a3 = len
 *    a4 = sum
 *
 * This function assumes 2- or 4-byte alignment.  Other alignments will fail!
 */
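/*
 * For reference, a rough C sketch of the accumulation performed below
 * (illustrative only: csum_partial_ref() is a made-up name, the in-kernel
 * prototype uses __wsum types, memcpy needs <string.h>, and the 32-bit
 * partial sums need not match bit-for-bit, only after folding to 16 bits):
 *
 *      unsigned int csum_partial_ref(const unsigned char *buf, int len,
 *                                    unsigned int sum)
 *      {
 *              while (len >= 2) {
 *                      unsigned short v;
 *                      memcpy(&v, buf, 2);     // halfword in memory order
 *                      sum += v;
 *                      if (sum < v)            // end-around carry
 *                              sum += 1;
 *                      buf += 2;
 *                      len -= 2;
 *              }
 *              if (len) {                      // trailing byte
 *                      unsigned int v = buf[0];
 *      #ifdef __XTENSA_EB__
 *                      v <<= 8;                // byte sits in bits 8..15
 *      #endif
 *                      sum += v;
 *                      if (sum < v)
 *                              sum += 1;
 *              }
 *              return sum;
 *      }
 */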

/* ONES_ADD converts twos-complement math to ones-complement. */
#define ONES_ADD(sum, val)        \
        add     sum, sum, val   ; \
        bgeu    sum, val, 99f   ; \
        addi    sum, sum, 1     ; \
99:                             ;
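
/*
 * A minimal C sketch of the same end-around-carry step, assuming 32-bit
 * unsigned arithmetic (the helper name is illustrative only):
 *
 *      static inline unsigned int ones_add(unsigned int sum, unsigned int val)
 *      {
 *              sum += val;
 *              if (sum < val)          // the add wrapped past 2^32
 *                      sum += 1;       // fold the carry back into bit 0
 *              return sum;
 *      }
 */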

        .text
ENTRY(csum_partial)

        /*
         * Experiments with Ethernet and SLIP connections show that buf
         * is aligned on either a 2-byte or 4-byte boundary.
         */
        entry   sp, 32
        extui   a5, a2, 0, 2
        bnez    a5, 8f          /* branch if 2-byte aligned */
        /* Fall-through on common case, 4-byte alignment */
1:
        srli    a5, a3, 5       /* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a5, 2f
#else
        beqz    a5, 2f
        slli    a5, a5, 5
        add     a5, a5, a2      /* a5 = end of last 32-byte chunk */
.Loop1:
#endif
        l32i    a6, a2, 0
        l32i    a7, a2, 4
        ONES_ADD(a4, a6)
        ONES_ADD(a4, a7)
        l32i    a6, a2, 8
        l32i    a7, a2, 12
        ONES_ADD(a4, a6)
        ONES_ADD(a4, a7)
        l32i    a6, a2, 16
        l32i    a7, a2, 20
        ONES_ADD(a4, a6)
        ONES_ADD(a4, a7)
        l32i    a6, a2, 24
        l32i    a7, a2, 28
        ONES_ADD(a4, a6)
        ONES_ADD(a4, a7)
        addi    a2, a2, 4*8
#if !XCHAL_HAVE_LOOPS
        blt     a2, a5, .Loop1
#endif
2:
        extui   a5, a3, 2, 3    /* remaining 4-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a5, 3f
#else
        beqz    a5, 3f
        slli    a5, a5, 2
        add     a5, a5, a2      /* a5 = end of last 4-byte chunk */
.Loop2:
#endif
        l32i    a6, a2, 0
        ONES_ADD(a4, a6)
        addi    a2, a2, 4
#if !XCHAL_HAVE_LOOPS
        blt     a2, a5, .Loop2
#endif
3:
        _bbci.l a3, 1, 5f       /* remaining 2-byte chunk */
        l16ui   a6, a2, 0
        ONES_ADD(a4, a6)
        addi    a2, a2, 2
5:
        _bbci.l a3, 0, 7f       /* remaining 1-byte chunk */
6:      l8ui    a6, a2, 0
#ifdef __XTENSA_EB__
        slli    a6, a6, 8       /* load byte into bits 8..15 */
#endif
        ONES_ADD(a4, a6)
7:
        mov     a2, a4
        retw

        /* uncommon case, buf is 2-byte aligned */
8:
        beqz    a3, 7b          /* branch if len == 0 */
        beqi    a3, 1, 6b       /* branch if len == 1 */

        extui   a5, a2, 0, 1
        bnez    a5, 8f          /* branch if 1-byte aligned */

        l16ui   a6, a2, 0       /* common case, len >= 2 */
        ONES_ADD(a4, a6)
        addi    a2, a2, 2       /* adjust buf */
        addi    a3, a3, -2      /* adjust len */
        j       1b              /* now buf is 4-byte aligned */

        /* case: odd-byte aligned, len > 1
         * This case is dog slow, so don't give us an odd address.
         * (I don't think this ever happens, but just in case.)
         */
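
        /* What the byte/halfword reassembly below reconstructs, shown as
         * illustrative C for the little-endian (#else) case; p is the
         * odd-aligned source pointer, so p + 1 is 2-byte aligned as l16ui
         * requires:
         *
         *      unsigned int w;
         *      w  = p[0];                                            // bits  0..7
         *      w |= (unsigned int)*(unsigned short *)(p + 1) << 8;   // bits  8..23
         *      w |= (unsigned int)p[3] << 24;                        // bits 24..31
         *
         * i.e. the same value an aligned l32i would have loaded, which is
         * then fed to ONES_ADD.
         */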
8:
        srli    a5, a3, 2       /* 4-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a5, 2f
#else
        beqz    a5, 2f
        slli    a5, a5, 2
        add     a5, a5, a2      /* a5 = end of last 4-byte chunk */
.Loop3:
#endif
        l8ui    a6, a2, 0       /* bits 24..31 */
        l16ui   a7, a2, 1       /* bits  8..23 */
        l8ui    a8, a2, 3       /* bits  0..7  */
#ifdef __XTENSA_EB__
        slli    a6, a6, 24
#else
        slli    a8, a8, 24
#endif
        slli    a7, a7, 8
        or      a7, a7, a6
        or      a7, a7, a8
        ONES_ADD(a4, a7)
        addi    a2, a2, 4
#if !XCHAL_HAVE_LOOPS
        blt     a2, a5, .Loop3
#endif
2:
        _bbci.l a3, 1, 3f       /* remaining 2-byte chunk, still odd addr */
        l8ui    a6, a2, 0
        l8ui    a7, a2, 1
#ifdef __XTENSA_EB__
        slli    a6, a6, 8
#else
        slli    a7, a7, 8
#endif
        or      a7, a7, a6
        ONES_ADD(a4, a7)
        addi    a2, a2, 2
3:
        j       5b              /* branch to handle the remaining byte */

ENDPROC(csum_partial)

/*
 * Copy from src to dst while checksumming, otherwise like csum_partial
 */

/*
unsigned int csum_partial_copy_generic (const char *src, char *dst, int len,
                                        int sum, int *src_err_ptr, int *dst_err_ptr)
        a2  = src
        a3  = dst
        a4  = len
        a5  = sum
        a6  = src_err_ptr
        a7  = dst_err_ptr
        a8  = temp
        a9  = temp
        a10 = temp
        a11 = original len for exception handling
        a12 = original dst for exception handling

        This function is optimized for 4-byte aligned addresses.  Other
        alignments work, but not nearly as efficiently.
 */
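
/*
 * Fault handling aside, the result is equivalent to copying and then
 * checksumming (illustrative C; csum_partial_ref() is the sketch near the
 * top of this file, not a real kernel symbol):
 *
 *      memcpy(dst, src, len);
 *      sum = csum_partial_ref(dst, len, sum);
 *
 * When a source access faults, the fixup code at the end of this file
 * stores -EFAULT through src_err_ptr and zero-fills the destination;
 * when a destination access faults, it stores -EFAULT through dst_err_ptr.
 * Both fault paths return 0.
 */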

ENTRY(csum_partial_copy_generic)

        entry   sp, 32
        mov     a12, a3
        mov     a11, a4
        or      a10, a2, a3

        /* We optimize the following alignment tests for the 4-byte
           aligned case.  Two bbsi.l instructions might seem more optimal
           (commented out below).  However, both labels 5: and 3: are out
           of the imm8 range, so the assembler relaxes them into
           equivalent bbci.l, j combinations, which is actually
           slower. */

        extui   a9, a10, 0, 2
        beqz    a9, 1f          /* branch if both are 4-byte aligned */
        bbsi.l  a10, 0, 5f      /* branch if one address is odd */
        j       3f              /* one address is 2-byte aligned */

/*      _bbsi.l a10, 0, 5f */   /* branch if odd address */
/*      _bbsi.l a10, 1, 3f */   /* branch if 2-byte-aligned address */

1:
        /* src and dst are both 4-byte aligned */
        srli    a10, a4, 5      /* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a10, 2f
#else
        beqz    a10, 2f
        slli    a10, a10, 5
        add     a10, a10, a2    /* a10 = end of last 32-byte src chunk */
.Loop5:
#endif
EX(10f) l32i    a9, a2, 0
EX(10f) l32i    a8, a2, 4
EX(11f) s32i    a9, a3, 0
EX(11f) s32i    a8, a3, 4
        ONES_ADD(a5, a9)
        ONES_ADD(a5, a8)
EX(10f) l32i    a9, a2, 8
EX(10f) l32i    a8, a2, 12
EX(11f) s32i    a9, a3, 8
EX(11f) s32i    a8, a3, 12
        ONES_ADD(a5, a9)
        ONES_ADD(a5, a8)
EX(10f) l32i    a9, a2, 16
EX(10f) l32i    a8, a2, 20
EX(11f) s32i    a9, a3, 16
EX(11f) s32i    a8, a3, 20
        ONES_ADD(a5, a9)
        ONES_ADD(a5, a8)
EX(10f) l32i    a9, a2, 24
EX(10f) l32i    a8, a2, 28
EX(11f) s32i    a9, a3, 24
EX(11f) s32i    a8, a3, 28
        ONES_ADD(a5, a9)
        ONES_ADD(a5, a8)
        addi    a2, a2, 32
        addi    a3, a3, 32
#if !XCHAL_HAVE_LOOPS
        blt     a2, a10, .Loop5
#endif
2:
        extui   a10, a4, 2, 3   /* remaining 4-byte chunks */
        extui   a4, a4, 0, 2    /* reset len for general-case, 2-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a10, 3f
#else
        beqz    a10, 3f
        slli    a10, a10, 2
        add     a10, a10, a2    /* a10 = end of last 4-byte src chunk */
.Loop6:
#endif
EX(10f) l32i    a9, a2, 0
EX(11f) s32i    a9, a3, 0
        ONES_ADD(a5, a9)
        addi    a2, a2, 4
        addi    a3, a3, 4
#if !XCHAL_HAVE_LOOPS
        blt     a2, a10, .Loop6
#endif
3:
        /*
           Control comes to here in two cases: (1) It may fall through
           to here from the 4-byte alignment case to process, at most,
           one 2-byte chunk.  (2) It branches to here from above if
           either src or dst is 2-byte aligned, and we process all bytes
           here, except for perhaps a trailing odd byte.  It's
           inefficient, so align your addresses to 4-byte boundaries.

           a2 = src
           a3 = dst
           a4 = len
           a5 = sum
        */
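
        /* In C terms this halfword loop is roughly (illustrative only;
           ones_add() is the end-around-carry helper sketched next to
           ONES_ADD above, and src/dst are both at least 2-byte aligned
           whenever control gets here):

                while (len >= 2) {
                        unsigned short v = *(unsigned short *)src;
                        *(unsigned short *)dst = v;
                        sum = ones_add(sum, v);
                        src += 2;
                        dst += 2;
                        len -= 2;
                }
        */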
        srli    a10, a4, 1      /* 2-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a10, 4f
#else
        beqz    a10, 4f
        slli    a10, a10, 1
        add     a10, a10, a2    /* a10 = end of last 2-byte src chunk */
.Loop7:
#endif
EX(10f) l16ui   a9, a2, 0
EX(11f) s16i    a9, a3, 0
        ONES_ADD(a5, a9)
        addi    a2, a2, 2
        addi    a3, a3, 2
#if !XCHAL_HAVE_LOOPS
        blt     a2, a10, .Loop7
#endif
4:
        /* This section processes a possible trailing odd byte. */
        _bbci.l a4, 0, 8f       /* 1-byte chunk */
EX(10f) l8ui    a9, a2, 0
EX(11f) s8i     a9, a3, 0
#ifdef __XTENSA_EB__
        slli    a9, a9, 8       /* shift byte to bits 8..15 */
#endif
        ONES_ADD(a5, a9)
8:
        mov     a2, a5
        retw

5:
        /* Control branches to here when either src or dst is odd.  We
           process all bytes using 8-bit accesses.  Grossly inefficient,
           so don't feed us an odd address. */
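
        /* In C terms, each pair is copied a byte at a time and combined
           into one halfword for the checksum (illustrative, little-endian
           case shown; ones_add() as sketched above):

                while (len >= 2) {
                        unsigned int lo = src[0], hi = src[1];
                        dst[0] = src[0];
                        dst[1] = src[1];
                        sum = ones_add(sum, lo | (hi << 8));
                        src += 2;
                        dst += 2;
                        len -= 2;
                }
        */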

        srli    a10, a4, 1      /* handle in pairs for 16-bit csum */
#if XCHAL_HAVE_LOOPS
        loopgtz a10, 6f
#else
        beqz    a10, 6f
        slli    a10, a10, 1
        add     a10, a10, a2    /* a10 = end of last odd-aligned, 2-byte src chunk */
.Loop8:
#endif
EX(10f) l8ui    a9, a2, 0
EX(10f) l8ui    a8, a2, 1
EX(11f) s8i     a9, a3, 0
EX(11f) s8i     a8, a3, 1
#ifdef __XTENSA_EB__
        slli    a9, a9, 8       /* combine into a single 16-bit value */
#else                           /* for checksum computation */
        slli    a8, a8, 8
#endif
        or      a9, a9, a8
        ONES_ADD(a5, a9)
        addi    a2, a2, 2
        addi    a3, a3, 2
#if !XCHAL_HAVE_LOOPS
        blt     a2, a10, .Loop8
#endif
6:
        j       4b              /* process the possible trailing odd byte */

ENDPROC(csum_partial_copy_generic)


# Exception handler:
.section .fixup, "ax"
/*
        a6  = src_err_ptr
        a7  = dst_err_ptr
        a11 = original len for exception handling
        a12 = original dst for exception handling
*/
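
/*
 * In C terms the two fault paths below behave roughly like this
 * (illustrative only; dst/len here mean the original values saved
 * in a12/a11 on entry):
 *
 *      // source access faulted (label 10)
 *      *src_err_ptr = -EFAULT;
 *      memset(dst, 0, len);
 *      return 0;
 *
 *      // destination access faulted (label 11)
 *      *dst_err_ptr = -EFAULT;
 *      return 0;
 */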

10:
        _movi   a2, -EFAULT
        s32i    a2, a6, 0       /* src_err_ptr */

        # clear the complete destination - computing the rest
        # is too much work
        movi    a2, 0
#if XCHAL_HAVE_LOOPS
        loopgtz a11, 2f
#else
        beqz    a11, 2f
        add     a11, a11, a12   /* a11 = ending address */
.Leloop:
#endif
        s8i     a2, a12, 0
        addi    a12, a12, 1
#if !XCHAL_HAVE_LOOPS
        blt     a12, a11, .Leloop
#endif
2:
        retw

11:
        movi    a2, -EFAULT
        s32i    a2, a7, 0       /* dst_err_ptr */
        movi    a2, 0
        retw

.previous