/* Copyright (C) 2004-2015 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */
| 17 | |
#include "div_libc.h"
| 19 | |
| 20 | |
/* 64-bit signed long divide.  These are not normal C functions.  Argument
   registers are t10 and t11, the result goes in t12.  Only t12 and AT may
   be clobbered.

   Theory of operation here is that we can use the FPU divider for virtually
   all operands that we see: all dividend values between -2**53 and 2**53-1
   can be computed directly.  Note that divisor values need not be checked
   against that range because the rounded fp value will be close enough such
   that the quotient is < 1, which will properly be truncated to zero when we
   convert back to integer.

   When the dividend is outside the range for which we can compute exact
   results, we use the fp quotient as an estimate from which we begin refining
   an exact integral value.  This reduces the number of iterations in the
   shift-and-subtract loop significantly.

   The FPCR save/restore is due to the fact that the EV6 _will_ set FPCR_INE
   for cvttq/c even without /sui being set.  It will not, however, properly
   raise the exception, so we don't have to worry about FPCR_INED being clear
   and so dying by SIGFPE.  */
| 41 | |
| 42 | .text |
| 43 | .align 4 |
| 44 | .globl __divq |
| 45 | .type __divq, @funcnoplt |
| 46 | .usepv __divq, no |
| 47 | |
| 48 | cfi_startproc |
| 49 | cfi_return_column (RA) |
| 50 | __divq: |
| 51 | lda sp, -FRAME(sp) |
| 52 | cfi_def_cfa_offset (FRAME) |
| 53 | CALL_MCOUNT |
| 54 | |
| 55 | /* Get the fp divide insn issued as quickly as possible. After |
| 56 | that's done, we have at least 22 cycles until its results are |
| 57 | ready -- all the time in the world to figure out how we're |
| 58 | going to use the results. */ |
| 59 | stt $f0, 0(sp) |
| 60 | excb |
| 61 | beq Y, DIVBYZERO |
| 62 | |
| 63 | stt $f1, 8(sp) |
| 64 | stt $f3, 48(sp) |
| 65 | cfi_rel_offset ($f0, 0) |
| 66 | cfi_rel_offset ($f1, 8) |
| 67 | cfi_rel_offset ($f3, 48) |
| 68 | mf_fpcr $f3 |
| 69 | |
| 70 | _ITOFT2 X, $f0, 16, Y, $f1, 24 |
| 71 | cvtqt $f0, $f0 |
| 72 | cvtqt $f1, $f1 |
| 73 | divt/c $f0, $f1, $f0 |
| 74 | |
| 75 | /* Check to see if X fit in the double as an exact value. */ |
| 76 | sll X, (64-53), AT |
| 77 | ldt $f1, 8(sp) |
| 78 | sra AT, (64-53), AT |
| 79 | cmpeq X, AT, AT |
| 80 | beq AT, $x_big |
| 81 | |
| 82 | /* If we get here, we're expecting exact results from the division. |
| 83 | Do nothing else besides convert and clean up. */ |
| 84 | cvttq/c $f0, $f0 |
| 85 | excb |
| 86 | mt_fpcr $f3 |
| 87 | _FTOIT $f0, RV, 16 |
| 88 | |
| 89 | ldt $f0, 0(sp) |
| 90 | ldt $f3, 48(sp) |
| 91 | cfi_restore ($f1) |
| 92 | cfi_remember_state |
| 93 | cfi_restore ($f0) |
| 94 | cfi_restore ($f3) |
| 95 | cfi_def_cfa_offset (0) |
| 96 | lda sp, FRAME(sp) |
| 97 | ret $31, (RA), 1 |
| 98 | |
| 99 | .align 4 |
| 100 | cfi_restore_state |
| 101 | $x_big: |
| 102 | /* If we get here, X is large enough that we don't expect exact |
| 103 | results, and neither X nor Y got mis-translated for the fp |
| 104 | division. Our task is to take the fp result, figure out how |
| 105 | far it's off from the correct result and compute a fixup. */ |
| 106 | stq t0, 16(sp) |
| 107 | stq t1, 24(sp) |
| 108 | stq t2, 32(sp) |
| 109 | stq t5, 40(sp) |
| 110 | cfi_rel_offset (t0, 16) |
| 111 | cfi_rel_offset (t1, 24) |
| 112 | cfi_rel_offset (t2, 32) |
| 113 | cfi_rel_offset (t5, 40) |
| 114 | |
| 115 | #define Q RV /* quotient */ |
| 116 | #define R t0 /* remainder */ |
| 117 | #define SY t1 /* scaled Y */ |
| 118 | #define S t2 /* scalar */ |
| 119 | #define QY t3 /* Q*Y */ |
| 120 | |
| 121 | /* The fixup code below can only handle unsigned values. */ |
| 122 | or X, Y, AT |
| 123 | mov $31, t5 |
| 124 | blt AT, $fix_sign_in |
| 125 | $fix_sign_in_ret1: |
| 126 | cvttq/c $f0, $f0 |
| 127 | |
| 128 | _FTOIT $f0, Q, 8 |
| 129 | .align 3 |
| 130 | $fix_sign_in_ret2: |
| 131 | ldt $f0, 0(sp) |
| 132 | stq t3, 0(sp) |
| 133 | cfi_restore ($f0) |
| 134 | cfi_rel_offset (t3, 0) |
| 135 | |
| 136 | mulq Q, Y, QY |
| 137 | excb |
| 138 | stq t4, 8(sp) |
| 139 | mt_fpcr $f3 |
| 140 | cfi_rel_offset (t4, 8) |
| 141 | |
| 142 | subq QY, X, R |
| 143 | mov Y, SY |
| 144 | mov 1, S |
| 145 | bgt R, $q_high |
| 146 | |
| 147 | $q_high_ret: |
| 148 | subq X, QY, R |
| 149 | mov Y, SY |
| 150 | mov 1, S |
| 151 | bgt R, $q_low |
| 152 | |
| 153 | $q_low_ret: |
| 154 | ldq t0, 16(sp) |
| 155 | ldq t1, 24(sp) |
| 156 | ldq t2, 32(sp) |
| 157 | bne t5, $fix_sign_out |
| 158 | |
| 159 | $fix_sign_out_ret: |
| 160 | ldq t3, 0(sp) |
| 161 | ldq t4, 8(sp) |
| 162 | ldq t5, 40(sp) |
| 163 | ldt $f3, 48(sp) |
| 164 | lda sp, FRAME(sp) |
| 165 | cfi_remember_state |
| 166 | cfi_restore (t0) |
| 167 | cfi_restore (t1) |
| 168 | cfi_restore (t2) |
| 169 | cfi_restore (t3) |
| 170 | cfi_restore (t4) |
| 171 | cfi_restore (t5) |
| 172 | cfi_restore ($f3) |
| 173 | cfi_def_cfa_offset (0) |
| 174 | ret $31, (RA), 1 |
| 175 | |
| 176 | .align 4 |
| 177 | cfi_restore_state |
| 178 | /* The quotient that we computed was too large. We need to reduce |
| 179 | it by S such that Y*S >= R. Obviously the closer we get to the |
| 180 | correct value the better, but overshooting high is ok, as we'll |
| 181 | fix that up later. */ |
| 182 | 0: |
| 183 | addq SY, SY, SY |
| 184 | addq S, S, S |
| 185 | $q_high: |
| 186 | cmpult SY, R, AT |
| 187 | bne AT, 0b |
| 188 | |
| 189 | subq Q, S, Q |
| 190 | unop |
| 191 | subq QY, SY, QY |
| 192 | br $q_high_ret |
| 193 | |
| 194 | .align 4 |
| 195 | /* The quotient that we computed was too small. Divide Y by the |
| 196 | current remainder (R) and add that to the existing quotient (Q). |
| 197 | The expectation, of course, is that R is much smaller than X. */ |
| 198 | /* Begin with a shift-up loop. Compute S such that Y*S >= R. We |
| 199 | already have a copy of Y in SY and the value 1 in S. */ |
| 200 | 0: |
| 201 | addq SY, SY, SY |
| 202 | addq S, S, S |
| 203 | $q_low: |
| 204 | cmpult SY, R, AT |
| 205 | bne AT, 0b |
| 206 | |
| 207 | /* Shift-down and subtract loop. Each iteration compares our scaled |
| 208 | Y (SY) with the remainder (R); if SY <= R then X is divisible by |
| 209 | Y's scalar (S) so add it to the quotient (Q). */ |
| 210 | 2: addq Q, S, t3 |
| 211 | srl S, 1, S |
| 212 | cmpule SY, R, AT |
| 213 | subq R, SY, t4 |
| 214 | |
| 215 | cmovne AT, t3, Q |
| 216 | cmovne AT, t4, R |
| 217 | srl SY, 1, SY |
| 218 | bne S, 2b |
| 219 | |
| 220 | br $q_low_ret |
| 221 | |
| 222 | .align 4 |
| 223 | $fix_sign_in: |
| 224 | /* If we got here, then X|Y is negative. Need to adjust everything |
| 225 | such that we're doing unsigned division in the fixup loop. */ |
| 226 | /* T5 records the changes we had to make: |
| 227 | bit 0: set if result should be negative. |
| 228 | bit 2: set if X was negated. |
| 229 | bit 3: set if Y was negated. |
| 230 | */ |
| 231 | xor X, Y, AT |
| 232 | cmplt AT, 0, t5 |
| 233 | cmplt X, 0, AT |
| 234 | negq X, t0 |
| 235 | |
| 236 | s4addq AT, t5, t5 |
| 237 | cmovne AT, t0, X |
| 238 | cmplt Y, 0, AT |
| 239 | negq Y, t0 |
| 240 | |
| 241 | s8addq AT, t5, t5 |
| 242 | cmovne AT, t0, Y |
| 243 | unop |
| 244 | blbc t5, $fix_sign_in_ret1 |
| 245 | |
| 246 | cvttq/c $f0, $f0 |
| 247 | _FTOIT $f0, Q, 8 |
| 248 | .align 3 |
| 249 | negq Q, Q |
| 250 | br $fix_sign_in_ret2 |
| 251 | |
| 252 | .align 4 |
| 253 | $fix_sign_out: |
| 254 | /* Now we get to undo what we did above. */ |
| 255 | /* ??? Is this really faster than just increasing the size of |
| 256 | the stack frame and storing X and Y in memory? */ |
| 257 | and t5, 8, AT |
| 258 | negq Y, t4 |
| 259 | cmovne AT, t4, Y |
| 260 | |
| 261 | and t5, 4, AT |
| 262 | negq X, t4 |
| 263 | cmovne AT, t4, X |
| 264 | |
| 265 | negq RV, t4 |
| 266 | cmovlbs t5, t4, RV |
| 267 | |
| 268 | br $fix_sign_out_ret |
| 269 | |
| 270 | cfi_endproc |
| 271 | .size __divq, .-__divq |
| 272 | |
| 273 | DO_DIVBYZERO |