/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com>
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * Functions to copy from and to user space.
 */
8
9#include <linux/linkage.h>
10#include <asm/current.h>
11#include <asm/asm-offsets.h>
12#include <asm/thread_info.h>
13#include <asm/cpufeatures.h>
14#include <asm/alternative-asm.h>
15#include <asm/asm.h>
16#include <asm/smap.h>
17#include <asm/export.h>
18
19.macro ALIGN_DESTINATION
20 /* check for bad alignment of destination */
21 movl %edi,%ecx
22 andl $7,%ecx
23 jz 102f /* already aligned */
24 subl $8,%ecx
25 negl %ecx
26 subl %ecx,%edx
27100: movb (%rsi),%al
28101: movb %al,(%rdi)
29 incq %rsi
30 incq %rdi
31 decl %ecx
32 jnz 100b
33102:
34 .section .fixup,"ax"
35103: addl %ecx,%edx /* ecx is zerorest also */
36 jmp .Lcopy_user_handle_tail
37 .previous
38
39 _ASM_EXTABLE_UA(100b, 103b)
40 _ASM_EXTABLE_UA(101b, 103b)
41 .endm
42
43/*
44 * copy_user_generic_unrolled - memory copy with exception handling.
45 * This version is for CPUs like P4 that don't have efficient micro
46 * code for rep movsq
47 *
48 * Input:
49 * rdi destination
50 * rsi source
51 * rdx count
52 *
53 * Output:
54 * eax uncopied bytes or 0 if successful.
55 */
56ENTRY(copy_user_generic_unrolled)
57 ASM_STAC
58 cmpl $8,%edx
59 jb 20f /* less then 8 bytes, go to byte copy loop */
60 ALIGN_DESTINATION
61 movl %edx,%ecx
62 andl $63,%edx
63 shrl $6,%ecx
64 jz .L_copy_short_string
651: movq (%rsi),%r8
662: movq 1*8(%rsi),%r9
673: movq 2*8(%rsi),%r10
684: movq 3*8(%rsi),%r11
695: movq %r8,(%rdi)
706: movq %r9,1*8(%rdi)
717: movq %r10,2*8(%rdi)
728: movq %r11,3*8(%rdi)
739: movq 4*8(%rsi),%r8
7410: movq 5*8(%rsi),%r9
7511: movq 6*8(%rsi),%r10
7612: movq 7*8(%rsi),%r11
7713: movq %r8,4*8(%rdi)
7814: movq %r9,5*8(%rdi)
7915: movq %r10,6*8(%rdi)
8016: movq %r11,7*8(%rdi)
81 leaq 64(%rsi),%rsi
82 leaq 64(%rdi),%rdi
83 decl %ecx
84 jnz 1b
85.L_copy_short_string:
86 movl %edx,%ecx
87 andl $7,%edx
88 shrl $3,%ecx
89 jz 20f
9018: movq (%rsi),%r8
9119: movq %r8,(%rdi)
92 leaq 8(%rsi),%rsi
93 leaq 8(%rdi),%rdi
94 decl %ecx
95 jnz 18b
9620: andl %edx,%edx
97 jz 23f
98 movl %edx,%ecx
9921: movb (%rsi),%al
10022: movb %al,(%rdi)
101 incq %rsi
102 incq %rdi
103 decl %ecx
104 jnz 21b
10523: xor %eax,%eax
106 ASM_CLAC
107 ret
108
109 .section .fixup,"ax"
11030: shll $6,%ecx
111 addl %ecx,%edx
112 jmp 60f
11340: leal (%rdx,%rcx,8),%edx
114 jmp 60f
11550: movl %ecx,%edx
11660: jmp .Lcopy_user_handle_tail /* ecx is zerorest also */
117 .previous
118
119 _ASM_EXTABLE_UA(1b, 30b)
120 _ASM_EXTABLE_UA(2b, 30b)
121 _ASM_EXTABLE_UA(3b, 30b)
122 _ASM_EXTABLE_UA(4b, 30b)
123 _ASM_EXTABLE_UA(5b, 30b)
124 _ASM_EXTABLE_UA(6b, 30b)
125 _ASM_EXTABLE_UA(7b, 30b)
126 _ASM_EXTABLE_UA(8b, 30b)
127 _ASM_EXTABLE_UA(9b, 30b)
128 _ASM_EXTABLE_UA(10b, 30b)
129 _ASM_EXTABLE_UA(11b, 30b)
130 _ASM_EXTABLE_UA(12b, 30b)
131 _ASM_EXTABLE_UA(13b, 30b)
132 _ASM_EXTABLE_UA(14b, 30b)
133 _ASM_EXTABLE_UA(15b, 30b)
134 _ASM_EXTABLE_UA(16b, 30b)
135 _ASM_EXTABLE_UA(18b, 40b)
136 _ASM_EXTABLE_UA(19b, 40b)
137 _ASM_EXTABLE_UA(21b, 50b)
138 _ASM_EXTABLE_UA(22b, 50b)
139ENDPROC(copy_user_generic_unrolled)
140EXPORT_SYMBOL(copy_user_generic_unrolled)
141
142/* Some CPUs run faster using the string copy instructions.
143 * This is also a lot simpler. Use them when possible.
144 *
145 * Only 4GB of copy is supported. This shouldn't be a problem
146 * because the kernel normally only writes from/to page sized chunks
147 * even if user space passed a longer buffer.
148 * And more would be dangerous because both Intel and AMD have
149 * errata with rep movsq > 4GB. If someone feels the need to fix
150 * this please consider this.
151 *
152 * Input:
153 * rdi destination
154 * rsi source
155 * rdx count
156 *
157 * Output:
158 * eax uncopied bytes or 0 if successful.
159 */
160ENTRY(copy_user_generic_string)
161 ASM_STAC
162 cmpl $8,%edx
163 jb 2f /* less than 8 bytes, go to byte copy loop */
164 ALIGN_DESTINATION
165 movl %edx,%ecx
166 shrl $3,%ecx
167 andl $7,%edx
1681: rep
169 movsq
1702: movl %edx,%ecx
1713: rep
172 movsb
173 xorl %eax,%eax
174 ASM_CLAC
175 ret
176
177 .section .fixup,"ax"
17811: leal (%rdx,%rcx,8),%ecx
17912: movl %ecx,%edx /* ecx is zerorest also */
180 jmp .Lcopy_user_handle_tail
181 .previous
182
183 _ASM_EXTABLE_UA(1b, 11b)
184 _ASM_EXTABLE_UA(3b, 12b)
185ENDPROC(copy_user_generic_string)
186EXPORT_SYMBOL(copy_user_generic_string)
187
188/*
189 * Some CPUs are adding enhanced REP MOVSB/STOSB instructions.
190 * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled.
191 *
192 * Input:
193 * rdi destination
194 * rsi source
195 * rdx count
196 *
197 * Output:
198 * eax uncopied bytes or 0 if successful.
199 */
200ENTRY(copy_user_enhanced_fast_string)
201 ASM_STAC
202 cmpl $64,%edx
203 jb .L_copy_short_string /* less then 64 bytes, avoid the costly 'rep' */
204 movl %edx,%ecx
2051: rep
206 movsb
207 xorl %eax,%eax
208 ASM_CLAC
209 ret
210
211 .section .fixup,"ax"
21212: movl %ecx,%edx /* ecx is zerorest also */
213 jmp .Lcopy_user_handle_tail
214 .previous
215
216 _ASM_EXTABLE_UA(1b, 12b)
217ENDPROC(copy_user_enhanced_fast_string)
218EXPORT_SYMBOL(copy_user_enhanced_fast_string)
219
220/*
221 * Try to copy last bytes and clear the rest if needed.
222 * Since protection fault in copy_from/to_user is not a normal situation,
223 * it is not necessary to optimize tail handling.
224 *
225 * Input:
226 * rdi destination
227 * rsi source
228 * rdx count
229 *
230 * Output:
231 * eax uncopied bytes or 0 if successful.
232 */
233ALIGN;
234.Lcopy_user_handle_tail:
235 movl %edx,%ecx
2361: rep movsb
2372: mov %ecx,%eax
238 ASM_CLAC
239 ret
240
241 _ASM_EXTABLE_UA(1b, 2b)
242END(.Lcopy_user_handle_tail)
243
244/*
245 * copy_user_nocache - Uncached memory copy with exception handling
246 * This will force destination out of cache for more performance.
247 *
248 * Note: Cached memory copy is used when destination or size is not
249 * naturally aligned. That is:
250 * - Require 8-byte alignment when size is 8 bytes or larger.
251 * - Require 4-byte alignment when size is 4 bytes.
252 */
253ENTRY(__copy_user_nocache)
254 ASM_STAC
255
256 /* If size is less than 8 bytes, go to 4-byte copy */
257 cmpl $8,%edx
258 jb .L_4b_nocache_copy_entry
259
260 /* If destination is not 8-byte aligned, "cache" copy to align it */
261 ALIGN_DESTINATION
262
263 /* Set 4x8-byte copy count and remainder */
264 movl %edx,%ecx
265 andl $63,%edx
266 shrl $6,%ecx
267 jz .L_8b_nocache_copy_entry /* jump if count is 0 */
268
269 /* Perform 4x8-byte nocache loop-copy */
270.L_4x8b_nocache_copy_loop:
2711: movq (%rsi),%r8
2722: movq 1*8(%rsi),%r9
2733: movq 2*8(%rsi),%r10
2744: movq 3*8(%rsi),%r11
2755: movnti %r8,(%rdi)
2766: movnti %r9,1*8(%rdi)
2777: movnti %r10,2*8(%rdi)
2788: movnti %r11,3*8(%rdi)
2799: movq 4*8(%rsi),%r8
28010: movq 5*8(%rsi),%r9
28111: movq 6*8(%rsi),%r10
28212: movq 7*8(%rsi),%r11
28313: movnti %r8,4*8(%rdi)
28414: movnti %r9,5*8(%rdi)
28515: movnti %r10,6*8(%rdi)
28616: movnti %r11,7*8(%rdi)
287 leaq 64(%rsi),%rsi
288 leaq 64(%rdi),%rdi
289 decl %ecx
290 jnz .L_4x8b_nocache_copy_loop
291
292 /* Set 8-byte copy count and remainder */
293.L_8b_nocache_copy_entry:
294 movl %edx,%ecx
295 andl $7,%edx
296 shrl $3,%ecx
297 jz .L_4b_nocache_copy_entry /* jump if count is 0 */
298
299 /* Perform 8-byte nocache loop-copy */
300.L_8b_nocache_copy_loop:
30120: movq (%rsi),%r8
30221: movnti %r8,(%rdi)
303 leaq 8(%rsi),%rsi
304 leaq 8(%rdi),%rdi
305 decl %ecx
306 jnz .L_8b_nocache_copy_loop
307
308 /* If no byte left, we're done */
309.L_4b_nocache_copy_entry:
310 andl %edx,%edx
311 jz .L_finish_copy
312
313 /* If destination is not 4-byte aligned, go to byte copy: */
314 movl %edi,%ecx
315 andl $3,%ecx
316 jnz .L_1b_cache_copy_entry
317
318 /* Set 4-byte copy count (1 or 0) and remainder */
319 movl %edx,%ecx
320 andl $3,%edx
321 shrl $2,%ecx
322 jz .L_1b_cache_copy_entry /* jump if count is 0 */
323
324 /* Perform 4-byte nocache copy: */
32530: movl (%rsi),%r8d
32631: movnti %r8d,(%rdi)
327 leaq 4(%rsi),%rsi
328 leaq 4(%rdi),%rdi
329
330 /* If no bytes left, we're done: */
331 andl %edx,%edx
332 jz .L_finish_copy
333
334 /* Perform byte "cache" loop-copy for the remainder */
335.L_1b_cache_copy_entry:
336 movl %edx,%ecx
337.L_1b_cache_copy_loop:
33840: movb (%rsi),%al
33941: movb %al,(%rdi)
340 incq %rsi
341 incq %rdi
342 decl %ecx
343 jnz .L_1b_cache_copy_loop
344
345 /* Finished copying; fence the prior stores */
346.L_finish_copy:
347 xorl %eax,%eax
348 ASM_CLAC
349 sfence
350 ret
351
352 .section .fixup,"ax"
353.L_fixup_4x8b_copy:
354 shll $6,%ecx
355 addl %ecx,%edx
356 jmp .L_fixup_handle_tail
357.L_fixup_8b_copy:
358 lea (%rdx,%rcx,8),%rdx
359 jmp .L_fixup_handle_tail
360.L_fixup_4b_copy:
361 lea (%rdx,%rcx,4),%rdx
362 jmp .L_fixup_handle_tail
363.L_fixup_1b_copy:
364 movl %ecx,%edx
365.L_fixup_handle_tail:
366 sfence
367 jmp .Lcopy_user_handle_tail
368 .previous
369
370 _ASM_EXTABLE_UA(1b, .L_fixup_4x8b_copy)
371 _ASM_EXTABLE_UA(2b, .L_fixup_4x8b_copy)
372 _ASM_EXTABLE_UA(3b, .L_fixup_4x8b_copy)
373 _ASM_EXTABLE_UA(4b, .L_fixup_4x8b_copy)
374 _ASM_EXTABLE_UA(5b, .L_fixup_4x8b_copy)
375 _ASM_EXTABLE_UA(6b, .L_fixup_4x8b_copy)
376 _ASM_EXTABLE_UA(7b, .L_fixup_4x8b_copy)
377 _ASM_EXTABLE_UA(8b, .L_fixup_4x8b_copy)
378 _ASM_EXTABLE_UA(9b, .L_fixup_4x8b_copy)
379 _ASM_EXTABLE_UA(10b, .L_fixup_4x8b_copy)
380 _ASM_EXTABLE_UA(11b, .L_fixup_4x8b_copy)
381 _ASM_EXTABLE_UA(12b, .L_fixup_4x8b_copy)
382 _ASM_EXTABLE_UA(13b, .L_fixup_4x8b_copy)
383 _ASM_EXTABLE_UA(14b, .L_fixup_4x8b_copy)
384 _ASM_EXTABLE_UA(15b, .L_fixup_4x8b_copy)
385 _ASM_EXTABLE_UA(16b, .L_fixup_4x8b_copy)
386 _ASM_EXTABLE_UA(20b, .L_fixup_8b_copy)
387 _ASM_EXTABLE_UA(21b, .L_fixup_8b_copy)
388 _ASM_EXTABLE_UA(30b, .L_fixup_4b_copy)
389 _ASM_EXTABLE_UA(31b, .L_fixup_4b_copy)
390 _ASM_EXTABLE_UA(40b, .L_fixup_1b_copy)
391 _ASM_EXTABLE_UA(41b, .L_fixup_1b_copy)
392ENDPROC(__copy_user_nocache)
393EXPORT_SYMBOL(__copy_user_nocache)