* sysdeps/alpha/Makefile <gnulib> (sysdep_routines): Merge divrem

variable, add unsigned variants. * sysdeps/alpha/divrem.h: Remove file. * sysdeps/alpha/div_libc.h: New file. * sysdeps/alpha/divl.S: Rewrite from scratch. * sysdeps/alpha/reml.S: Likewise. * sysdeps/alpha/divq.S: Likewise. * sysdeps/alpha/remq.S: Likewise. * sysdeps/alpha/divlu.S: New file. * sysdeps/alpha/remlu.S: New file. * sysdeps/alpha/divqu.S: New file. * sysdeps/alpha/remqu.S: New file.
2004-03-27 00:32:28 +00:00 · 2004-03-27 00:32:28 +00:00 · 08e3c578ca
parent a66be89006
commit 08e3c578ca
11 changed files with 1294 additions and 248 deletions
--- a/sysdeps/alpha/Makefile
+++ b/sysdeps/alpha/Makefile
@ -26,7 +26,7 @@ sysdep_routines += _mcount
 endif
 ifeq ($(subdir),gnulib)
-sysdep_routines += $(divrem)
+sysdep_routines += divl divlu divq divqu reml remlu remq remqu
 endif
 ifeq ($(subdir),string)
@ -38,8 +38,6 @@ ifeq ($(subdir),elf)
 CFLAGS-rtld.c = -mbuild-constants
 endif
 divrem := divl divq reml remq
 # For now, build everything with full IEEE math support.
 # TODO: build separate libm and libm-ieee.
 sysdep-CFLAGS += -mieee
--- a/sysdeps/alpha/div_libc.h
+++ b/sysdeps/alpha/div_libc.h
@ -0,0 +1,113 @@
 /* Copyright (C) 2004 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */
 /* Common bits for implementing software divide.  */
 #include <sysdep.h>
 #ifdef __linux__
 # include <asm/gentrap.h>
 # include <asm/pal.h>
 #else
 # include <machine/pal.h>
 #endif
 /* These are not normal C functions.  Argument registers are t10 and t11;
   the result goes in t12; the return address is in t9.  Only t12 and AT
   may be clobbered.  */
 #define X	t10
 #define Y	t11
 #define RV	t12
 #define RA	t9
 /* None of these functions should use implicit anything.  */
 	.set	nomacro
 	.set	noat
 /* Code fragment to invoke _mcount for profiling.  This should be invoked
   directly after allocation of the stack frame.  */
 .macro CALL_MCOUNT
 #ifdef PROF
 	stq	ra, 0(sp)
 	stq	pv, 8(sp)
 	stq	gp, 16(sp)
 	cfi_rel_offset (ra, 0)
 	cfi_rel_offset (pv, 8)
 	cfi_rel_offset (gp, 16)
 	br	AT, 1f
 	.set	macro
 1:	ldgp	gp, 0(AT)
 	mov	RA, ra
 	lda	AT, _mcount
 	jsr	AT, (AT), _mcount
 	.set	nomacro
 	ldq	ra, 0(sp)
 	ldq	pv, 8(sp)
 	ldq	gp, 16(sp)
 	cfi_restore (ra)
 	cfi_restore (pv)
 	cfi_restore (gp)
 	/* Realign subsequent code with what we'd have without this
 	   macro at all.  This means aligned with one arithmetic insn
 	   used within the bundle.  */
 	.align	4
 	nop
 #endif
 .endm
 /* In order to make the below work, all top-level divide routines must
   use the same frame size.  */
 #define FRAME	48
 /* Code fragment to generate an integer divide-by-zero fault.  When
   building libc.so, we arrange for there to be one copy of this code
   placed late in the dso, such that all branches are forward.  When
   building libc.a, we use multiple copies to avoid having an out of
   range branch.  Users should jump to DIVBYZERO.  */
 .macro DO_DIVBYZERO
 #ifdef PIC
 #define DIVBYZERO	__divbyzero
 	.section .gnu.linkonce.t.divbyzero, "ax", @progbits
 	.globl	__divbyzero
 	.type	__divbyzero, @function
 	.usepv	__divbyzero, no
 	.hidden	__divbyzero
 #else
 #define DIVBYZERO	$divbyzero
 #endif
 	.align	4
 DIVBYZERO:
 	cfi_startproc
 	cfi_return_column (RA)
 	cfi_def_cfa_offset (FRAME)
 	mov	a0, RV
 	unop
 	lda	a0, GEN_INTDIV
 	call_pal PAL_gentrap
 	mov	RV, a0
 	clr	RV
 	lda	sp, FRAME(sp)
 	cfi_def_cfa_offset (0)
 	ret	$31, (RA), 1
 	cfi_endproc
 	.size	DIVBYZERO, .-DIVBYZERO
 .endm
--- a/sysdeps/alpha/divl.S
+++ b/sysdeps/alpha/divl.S
@ -1,6 +1,75 @@
-#define IS_REM		0
+/* Copyright (C) 2004 Free Software Foundation, Inc.
-#define SIZE		4
+   This file is part of the GNU C Library.
 #define UFUNC_NAME	__divlu
 #define SFUNC_NAME	__divl
-#include "divrem.h"
+   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */
 #include "div_libc.h"
 /* 32-bit signed int divide.  This is not a normal C function.  Argument
   registers are t10 and t11, the result goes in t12.  Only t12 and AT may
   be clobbered.
   The FPU can handle all input values except zero.  Whee!  */
 #ifndef EXTEND
 #define EXTEND(S,D)	sextl S, D
 #endif
 	.text
 	.align	4
 	.globl	__divl
 	.type	__divl, @function
 	.usepv	__divl, no
 	cfi_startproc
 	cfi_return_column (RA)
 __divl:
 	lda	sp, -FRAME(sp)
 	cfi_def_cfa_offset (FRAME)
 	CALL_MCOUNT
 	stt	$f0, 0(sp)
 	stt	$f1, 8(sp)
 	beq	Y, DIVBYZERO
 	cfi_rel_offset ($f0, 0)
 	cfi_rel_offset ($f1, 8)
 	EXTEND	(X, RV)
 	EXTEND	(Y, AT)
 	stq	RV, 16(sp)
 	stq	AT, 24(sp)
 	ldt	$f0, 16(sp)
 	ldt	$f1, 24(sp)
 	cvtqt	$f0, $f0
 	cvtqt	$f1, $f1
 	divt/c	$f0, $f1, $f0
 	cvttq/c	$f0, $f0
 	stt	$f0, 16(sp)
 	ldt	$f0, 0(sp)
 	ldt	$f1, 8(sp)
 	ldl	RV, 16(sp)
 	lda	sp, FRAME(sp)
 	cfi_restore ($f0)
 	cfi_restore ($f1)
 	cfi_def_cfa_offset (0)
 	ret	$31, (RA), 1
 	cfi_endproc
 	.size	__divl, .-__divl
 	DO_DIVBYZERO
--- a/sysdeps/alpha/divlu.S
+++ b/sysdeps/alpha/divlu.S
@ -0,0 +1,4 @@
 #define UNSIGNED
 #define EXTEND(S,D)	zapnot S, 15, D
 #define __divl		__divlu
 #include <divl.S>
--- a/sysdeps/alpha/divq.S
+++ b/sysdeps/alpha/divq.S
@ -1,6 +1,265 @@
-#define IS_REM		0
+/* Copyright (C) 2004 Free Software Foundation, Inc.
-#define SIZE		8
+   This file is part of the GNU C Library.
 #define UFUNC_NAME	__divqu
 #define SFUNC_NAME	__divq
-#include "divrem.h"
+   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */
 #include "div_libc.h"
 /* 64-bit signed long divide.  These are not normal C functions.  Argument
   registers are t10 and t11, the result goes in t12.  Only t12 and AT may
   be clobbered.
   Theory of operation here is that we can use the FPU divider for virtually
   all operands that we see: all dividend values between -2**53 and 2**53-1
   can be computed directly.  Note that divisor values need not be checked
   against that range because the rounded fp value will be close enough such
   that the quotient is < 1, which will properly be truncated to zero when we
   convert back to integer.
   When the dividend is outside the range for which we can compute exact
   results, we use the fp quotent as an estimate from which we begin refining
   an exact integral value.  This reduces the number of iterations in the
   shift-and-subtract loop significantly.  */
 	.text
 	.align	4
 	.globl	__divq
 	.type	__divq, @function
 	.usepv	__divq, no
 	cfi_startproc
 	cfi_return_column (RA)
 __divq:
 	lda	sp, -FRAME(sp)
 	cfi_def_cfa_offset (FRAME)
 	CALL_MCOUNT
 	/* Get the fp divide insn issued as quickly as possible.  After
 	   that's done, we have at least 22 cycles until its results are
 	   ready -- all the time in the world to figure out how we're
 	   going to use the results.  */
 	stq	X, 16(sp)
 	stq	Y, 24(sp)
 	beq	Y, DIVBYZERO
 	stt	$f0, 0(sp)
 	stt	$f1, 8(sp)
 	cfi_rel_offset ($f0, 0)
 	cfi_rel_offset ($f1, 8)
 	ldt	$f0, 16(sp)
 	ldt	$f1, 24(sp)
 	cvtqt	$f0, $f0
 	cvtqt	$f1, $f1
 	divt/c	$f0, $f1, $f0
 	/* Check to see if X fit in the double as an exact value.  */
 	sll	X, (64-53), AT
 	ldt	$f1, 8(sp)
 	sra	AT, (64-53), AT
 	cmpeq	X, AT, AT
 	beq	AT, $x_big
 	/* If we get here, we're expecting exact results from the division.
 	   Do nothing else besides convert and clean up.  */
 	cvttq/c	$f0, $f0
 	stt	$f0, 16(sp)
 	ldq	RV, 16(sp)
 	ldt	$f0, 0(sp)
 	cfi_restore ($f1)
 	cfi_remember_state
 	cfi_restore ($f0)
 	cfi_def_cfa_offset (0)
 	lda	sp, FRAME(sp)
 	ret	$31, (RA), 1
 	.align	4
 	cfi_restore_state
 $x_big:
 	/* If we get here, X is large enough that we don't expect exact
 	   results, and neither X nor Y got mis-translated for the fp
 	   division.  Our task is to take the fp result, figure out how
 	   far it's off from the correct result and compute a fixup.  */
 	stq	t0, 16(sp)
 	stq	t1, 24(sp)
 	stq	t2, 32(sp)
 	stq	t5, 40(sp)
 	cfi_rel_offset (t0, 16)
 	cfi_rel_offset (t1, 24)
 	cfi_rel_offset (t2, 32)
 	cfi_rel_offset (t5, 40)
 #define Q	RV		/* quotient */
 #define R	t0		/* remainder */
 #define SY	t1		/* scaled Y */
 #define S	t2		/* scalar */
 #define QY	t3		/* Q*Y */
 	/* The fixup code below can only handle unsigned values.  */
 	or	X, Y, AT
 	mov	$31, t5
 	blt	AT, $fix_sign_in
 $fix_sign_in_ret1:
 	cvttq/c	$f0, $f0
 	stt	$f0, 8(sp)
 	ldq	Q, 8(sp)
 $fix_sign_in_ret2:
 	mulq	Q, Y, QY
 	stq	t4, 8(sp)
 	ldt	$f0, 0(sp)
 	unop
 	cfi_rel_offset (t4, 8)
 	cfi_restore ($f0)
 	stq	t3, 0(sp)
 	unop
 	cfi_rel_offset (t3, 0)
 	subq	QY, X, R
 	mov	Y, SY
 	mov	1, S
 	bgt	R, $q_high
 $q_high_ret:
 	subq	X, QY, R
 	mov	Y, SY
 	mov	1, S
 	bgt	R, $q_low
 $q_low_ret:
 	ldq	t0, 16(sp)
 	ldq	t1, 24(sp)
 	ldq	t2, 32(sp)
 	bne	t5, $fix_sign_out
 $fix_sign_out_ret:
 	ldq	t3, 0(sp)
 	ldq	t4, 8(sp)
 	ldq	t5, 40(sp)
 	lda	sp, FRAME(sp)
 	cfi_remember_state
 	cfi_restore (t0)
 	cfi_restore (t1)
 	cfi_restore (t2)
 	cfi_restore (t3)
 	cfi_restore (t4)
 	cfi_restore (t5)
 	cfi_def_cfa_offset (0)
 	ret	$31, (RA), 1
 	.align	4
 	cfi_restore_state
 	/* The quotient that we computed was too large.  We need to reduce
 	   it by S such that Y*S >= R.  Obviously the closer we get to the
 	   correct value the better, but overshooting high is ok, as we'll
 	   fix that up later.  */
 0:
 	addq	SY, SY, SY
 	addq	S, S, S
 $q_high:
 	cmpult	SY, R, AT
 	bne	AT, 0b
 	subq	Q, S, Q
 	unop
 	subq	QY, SY, QY
 	br	$q_high_ret
 	.align	4
 	/* The quotient that we computed was too small.  Divide Y by the 
 	   current remainder (R) and add that to the existing quotient (Q).
 	   The expectation, of course, is that R is much smaller than X.  */
 	/* Begin with a shift-up loop.  Compute S such that Y*S >= R.  We
 	   already have a copy of Y in SY and the value 1 in S.  */
 0:
 	addq	SY, SY, SY
 	addq	S, S, S
 $q_low:
 	cmpult	SY, R, AT
 	bne	AT, 0b
 	/* Shift-down and subtract loop.  Each iteration compares our scaled
 	   Y (SY) with the remainder (R); if SY <= R then X is divisible by
 	   Y's scalar (S) so add it to the quotient (Q).  */
 2:	addq	Q, S, t3
 	srl	S, 1, S
 	cmpule	SY, R, AT
 	subq	R, SY, t4
 	cmovne	AT, t3, Q
 	cmovne	AT, t4, R
 	srl	SY, 1, SY
 	bne	S, 2b
 	br	$q_low_ret
 	.align	4
 $fix_sign_in:
 	/* If we got here, then X|Y is negative.  Need to adjust everything
 	   such that we're doing unsigned division in the fixup loop.  */
 	/* T5 records the changes we had to make:
 		bit 0:	set if result should be negative.
 		bit 2:	set if X was negated.
 		bit 3:	set if Y was negated.
 	*/
 	xor	X, Y, AT
 	cmplt	AT, 0, t5
 	cmplt	X, 0, AT
 	negq	X, t0
 	s4addq	AT, t5, t5
 	cmovne	AT, t0, X
 	cmplt	Y, 0, AT
 	negq	Y, t0
 	s8addq	AT, t5, t5
 	cmovne	AT, t0, Y
 	unop
 	blbc	t5, $fix_sign_in_ret1
 	cvttq/c	$f0, $f0
 	stt	$f0, 8(sp)
 	ldq	Q, 8(sp)
 	unop
 	negq	Q, Q
 	br	$fix_sign_in_ret2
 	.align	4
 $fix_sign_out:
 	/* Now we get to undo what we did above.  */
 	/* ??? Is this really faster than just increasing the size of
 	   the stack frame and storing X and Y in memory?  */
 	and	t5, 8, AT
 	negq	Y, t4
 	cmovne	AT, t4, Y
 	and	t5, 4, AT
 	negq	X, t4
 	cmovne	AT, t4, X
 	negq	RV, t4
 	cmovlbs	t5, t4, RV
 	br	$fix_sign_out_ret
 	cfi_endproc
 	.size	__divq, .-__divq
 	DO_DIVBYZERO
--- a/sysdeps/alpha/divqu.S
+++ b/sysdeps/alpha/divqu.S
@ -0,0 +1,244 @@
 /* Copyright (C) 2004 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */
 #include "div_libc.h"
 /* 64-bit unsigned long divide.  These are not normal C functions.  Argument
   registers are t10 and t11, the result goes in t12.  Only t12 and AT may be
   clobbered.
   Theory of operation here is that we can use the FPU divider for virtually
   all operands that we see: all dividend values between -2**53 and 2**53-1
   can be computed directly.  Note that divisor values need not be checked
   against that range because the rounded fp value will be close enough such
   that the quotient is < 1, which will properly be truncated to zero when we
   convert back to integer.
   When the dividend is outside the range for which we can compute exact
   results, we use the fp quotent as an estimate from which we begin refining
   an exact integral value.  This reduces the number of iterations in the
   shift-and-subtract loop significantly.  */
 	.text
 	.align	4
 	.globl	__divqu
 	.type	__divqu, @function
 	.usepv	__divqu, no
 	cfi_startproc
 	cfi_return_column (RA)
 __divqu:
 	lda	sp, -FRAME(sp)
 	cfi_def_cfa_offset (FRAME)
 	CALL_MCOUNT
 	/* Get the fp divide insn issued as quickly as possible.  After
 	   that's done, we have at least 22 cycles until its results are
 	   ready -- all the time in the world to figure out how we're
 	   going to use the results.  */
 	stq	X, 16(sp)
 	stq	Y, 24(sp)
 	beq	Y, DIVBYZERO
 	stt	$f0, 0(sp)
 	stt	$f1, 8(sp)
 	cfi_rel_offset ($f0, 0)
 	cfi_rel_offset ($f1, 8)
 	ldt	$f0, 16(sp)
 	ldt	$f1, 24(sp)
 	cvtqt	$f0, $f0
 	cvtqt	$f1, $f1
 	blt	X, $x_is_neg
 	divt/c	$f0, $f1, $f0
 	/* Check to see if Y was mis-converted as signed value.  */
 	ldt	$f1, 8(sp)
 	unop
 	nop
 	blt	Y, $y_is_neg
 	/* Check to see if X fit in the double as an exact value.  */
 	srl	X, 53, AT
 	bne	AT, $x_big
 	/* If we get here, we're expecting exact results from the division.
 	   Do nothing else besides convert and clean up.  */
 	cvttq/c	$f0, $f0
 	stt	$f0, 16(sp)
 	ldq	RV, 16(sp)
 	ldt	$f0, 0(sp)
 	cfi_remember_state
 	cfi_restore ($f0)
 	cfi_restore ($f1)
 	cfi_def_cfa_offset (0)
 	lda	sp, FRAME(sp)
 	ret	$31, (RA), 1
 	.align	4
 	cfi_restore_state
 $x_is_neg:
 	/* If we get here, X is so big that bit 63 is set, which made the
 	   conversion come out negative.  Fix it up lest we not even get
 	   a good estimate.  */
 	ldah	AT, 0x5f80		/* 2**64 as float.  */
 	stt	$f2, 24(sp)
 	cfi_rel_offset ($f2, 24)
 	stl	AT, 16(sp)
 	lds	$f2, 16(sp)
 	addt	$f0, $f2, $f0
 	unop
 	divt/c	$f0, $f1, $f0
 	unop
 	/* Ok, we've now the divide issued.  Continue with other checks.  */
 	ldt	$f1, 8(sp)
 	unop
 	ldt	$f2, 24(sp)
 	blt	Y, $y_is_neg
 	cfi_restore ($f1)
 	cfi_restore ($f2)
 	cfi_remember_state	/* for y_is_neg */
 	.align	4
 $x_big:
 	/* If we get here, X is large enough that we don't expect exact
 	   results, and neither X nor Y got mis-translated for the fp
 	   division.  Our task is to take the fp result, figure out how
 	   far it's off from the correct result and compute a fixup.  */
 	stq	t0, 16(sp)
 	stq	t1, 24(sp)
 	stq	t2, 32(sp)
 	stq	t3, 40(sp)
 	cfi_rel_offset (t0, 16)
 	cfi_rel_offset (t1, 24)
 	cfi_rel_offset (t2, 32)
 	cfi_rel_offset (t3, 40)
 #define Q	RV		/* quotient */
 #define R	t0		/* remainder */
 #define SY	t1		/* scaled Y */
 #define S	t2		/* scalar */
 #define QY	t3		/* Q*Y */
 	cvttq/c	$f0, $f0
 	stt	$f0, 8(sp)
 	ldq	Q, 8(sp)
 	mulq	Q, Y, QY
 	stq	t4, 8(sp)
 	unop
 	ldt	$f0, 0(sp)
 	unop
 	cfi_rel_offset (t4, 8)
 	cfi_restore ($f0)
 	subq	QY, X, R
 	mov	Y, SY
 	mov	1, S
 	bgt	R, $q_high
 $q_high_ret:
 	subq	X, QY, R
 	mov	Y, SY
 	mov	1, S
 	bgt	R, $q_low
 $q_low_ret:
 	ldq	t4, 8(sp)
 	ldq	t0, 16(sp)
 	ldq	t1, 24(sp)
 	ldq	t2, 32(sp)
 	ldq	t3, 40(sp)
 	lda	sp, FRAME(sp)
 	cfi_remember_state
 	cfi_restore (t0)
 	cfi_restore (t1)
 	cfi_restore (t2)
 	cfi_restore (t3)
 	cfi_restore (t4)
 	cfi_def_cfa_offset (0)
 	ret	$31, (RA), 1
 	.align	4
 	cfi_restore_state
 	/* The quotient that we computed was too large.  We need to reduce
 	   it by S such that Y*S >= R.  Obviously the closer we get to the
 	   correct value the better, but overshooting high is ok, as we'll
 	   fix that up later.  */
 0:
 	addq	SY, SY, SY
 	addq	S, S, S
 $q_high:
 	cmpult	SY, R, AT
 	bne	AT, 0b
 	subq	Q, S, Q
 	unop
 	subq	QY, SY, QY
 	br	$q_high_ret
 	.align	4
 	/* The quotient that we computed was too small.  Divide Y by the 
 	   current remainder (R) and add that to the existing quotient (Q).
 	   The expectation, of course, is that R is much smaller than X.  */
 	/* Begin with a shift-up loop.  Compute S such that Y*S >= R.  We
 	   already have a copy of Y in SY and the value 1 in S.  */
 0:
 	addq	SY, SY, SY
 	addq	S, S, S
 $q_low:
 	cmpult	SY, R, AT
 	bne	AT, 0b
 	/* Shift-down and subtract loop.  Each iteration compares our scaled
 	   Y (SY) with the remainder (R); if SY <= R then X is divisible by
 	   Y's scalar (S) so add it to the quotient (Q).  */
 2:	addq	Q, S, t3
 	srl	S, 1, S
 	cmpule	SY, R, AT
 	subq	R, SY, t4
 	cmovne	AT, t3, Q
 	cmovne	AT, t4, R
 	srl	SY, 1, SY
 	bne	S, 2b
 	br	$q_low_ret
 	.align	4
 	cfi_restore_state
 $y_is_neg:
 	/* If we get here, Y is so big that bit 63 is set.  The results
 	   from the divide will be completely wrong.  Fortunately, the
 	   quotient must be either 0 or 1, so just compute it directly.  */
 	cmpult	Y, X, RV
 	ldt	$f0, 0(sp)
 	lda	sp, FRAME(sp)
 	cfi_restore ($f0)
 	cfi_def_cfa_offset (0)
 	ret	$31, (RA), 1
 	cfi_endproc
 	.size	__divqu, .-__divqu
 	DO_DIVBYZERO
--- a/sysdeps/alpha/divrem.h
+++ b/sysdeps/alpha/divrem.h
@ -1,225 +0,0 @@
 /* Copyright (C) 1996,97,2002 Free Software Foundation, Inc.
   Contributed by David Mosberger (davidm@cs.arizona.edu).
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */
 /* The current Alpha chips don't provide hardware for integer
   division.  The C compiler expects the functions
 	__divqu: 64-bit unsigned long divide
 	__remqu: 64-bit unsigned long remainder
 	__divqs/__remqs: signed 64-bit
 	__divlu/__remlu: unsigned 32-bit
 	__divls/__remls: signed 32-bit
   These are not normal C functions: instead of the normal calling
   sequence, these expect their arguments in registers t10 and t11, and
   return the result in t12 (aka pv).  Register AT may be clobbered
   (assembly temporary), anything else must be saved.  */
 #include <sysdep.h>
 #ifdef __linux__
 # include <asm/gentrap.h>
 # include <asm/pal.h>
 #else
 # include <machine/pal.h>
 #endif
 #define mask			v0
 #define divisor			t0
 #define compare			AT
 #define tmp1			t2
 #define tmp2			t3
 #define retaddr			t9
 #define arg1			t10
 #define arg2			t11
 #define result			t12
 #if IS_REM
 # define DIV_ONLY(x,y...)
 # define REM_ONLY(x,y...)	x,##y
 # define modulus		result
 # define quotient		t1
 # define GETSIGN(x)		mov arg1, x
 # define STACK			32
 #else
 # define DIV_ONLY(x,y...)	x,##y
 # define REM_ONLY(x,y...)
 # define modulus		t1
 # define quotient		result
 # define GETSIGN(x)		xor arg1, arg2, x
 # define STACK			48
 #endif
 #if SIZE == 8
 # define LONGIFY(x,y)		mov x,y
 # define SLONGIFY(x,y)		mov x,y
 # define _SLONGIFY(x)
 # define NEG(x,y)		negq x,y
 #else
 # define LONGIFY(x,y)		zapnot x,15,y
 # define SLONGIFY(x,y)		sextl x,y
 # define _SLONGIFY(x)		sextl x,x
 # define NEG(x,y)		negl x,y
 #endif
 	.set noreorder
 	.set noat
 	.ent UFUNC_NAME
 	.globl UFUNC_NAME
 	.align 3
 UFUNC_NAME:
 $udiv_entry:
 	lda	sp, -STACK(sp)
 	.frame	sp, STACK, retaddr, 0
 #ifdef PROF
 	stq	ra, 0(sp)
 	stq	pv, 8(sp)
 	stq	gp, 16(sp)
 	br	AT, 1f
 1:	ldgp	gp, 0(AT)
 	mov	retaddr, ra
 	lda	AT, _mcount
 	jsr	AT, (AT), _mcount
 	ldq	ra, 0(sp)
 	ldq	pv, 8(sp)
 	ldq	gp, 16(sp)
 #endif
 	.prologue 0
 $udiv:
 	stq	t0, 0(sp)
 	LONGIFY	(arg2, divisor)
 	stq	t1, 8(sp)
 	LONGIFY	(arg1, modulus)
 	stq	v0, 16(sp)
 	clr	quotient
 	stq	tmp1, 24(sp)
 	ldiq	mask, 1
 	DIV_ONLY(stq tmp2,32(sp))
 	beq	divisor, $divbyzero
 	.align 3
 #if SIZE == 8
 	/* Shift divisor left.  */
 1:	cmpult	divisor, modulus, compare
 	blt	divisor, 2f
 	addq	divisor, divisor, divisor
 	addq	mask, mask, mask
 	bne	compare, 1b
 	unop
 2:
 #else
 	/* Shift divisor left using 3-bit shifts as we can't overflow.
 	   This results in looping three times less here, but up to
 	   two more times later.  Thus using a large shift isn't worth it.  */
 1:	cmpult	divisor, modulus, compare
 	s8addq	divisor, zero, divisor
 	s8addq	mask, zero, mask
 	bne	compare, 1b
 #endif
 	/* Now go back to the right.  */
 3:	DIV_ONLY(addq quotient, mask, tmp2)
 	srl	mask, 1, mask
 	cmpule	divisor, modulus, compare
 	subq	modulus, divisor, tmp1
 	DIV_ONLY(cmovne compare, tmp2, quotient)
 	srl	divisor, 1, divisor
 	cmovne	compare, tmp1, modulus
 	bne	mask, 3b
 $done:	ldq	t0, 0(sp)
 	ldq	t1, 8(sp)
 	ldq	v0, 16(sp)
 	ldq	tmp1, 24(sp)
 	DIV_ONLY(ldq tmp2, 32(sp))
 	lda	sp, STACK(sp)
 	ret	zero, (retaddr), 1
 $divbyzero:
 	mov	a0, tmp1
 	ldiq	a0, GEN_INTDIV
 	call_pal PAL_gentrap
 	mov	tmp1, a0
 	clr	result			/* If trap returns, return zero.  */
 	br	$done
 	.end UFUNC_NAME
 	.ent SFUNC_NAME
 	.globl SFUNC_NAME
 	.align 3
 SFUNC_NAME:
 	lda	sp, -STACK(sp)
 	.frame	sp, STACK, retaddr, 0
 #ifdef PROF
 	stq	ra, 0(sp)
 	stq	pv, 8(sp)
 	stq	gp, 16(sp)
 	br	AT, 1f
 1:	ldgp	gp, 0(AT)
 	mov	retaddr, ra
 	jsr	AT, _mcount
 	ldq	ra, 0(sp)
 	ldq	pv, 8(sp)
 	ldq	gp, 16(sp)
 #endif
 	.prologue 0
 	or	arg1, arg2, AT
 	_SLONGIFY(AT)
 	bge	AT, $udiv		/* don't need to mess with signs */
 	/* Save originals and find absolute values.  */
 	stq	arg1, 0(sp)
 	NEG	(arg1, AT)
 	stq	arg2, 8(sp)
 	cmovge	AT, AT, arg1
 	stq	retaddr, 16(sp)
 	NEG	(arg2, AT)
 	stq	tmp1, 24(sp)
 	cmovge	AT, AT, arg2
 	/* Do the unsigned division.  */
 	bsr	retaddr, $udiv_entry
 	/* Restore originals and adjust the sign of the result.  */
 	ldq	arg1, 0(sp)
 	ldq	arg2, 8(sp)
 	GETSIGN	(AT)
 	NEG	(result, tmp1)
 	_SLONGIFY(AT)
 	ldq	retaddr, 16(sp)
 	cmovlt	AT, tmp1, result
 	ldq	tmp1, 24(sp)
 	lda	sp, STACK(sp)
 	ret	zero, (retaddr), 1
 	.end	SFUNC_NAME
--- a/sysdeps/alpha/reml.S
+++ b/sysdeps/alpha/reml.S
@ -1,6 +1,80 @@
-#define IS_REM		1
+/* Copyright (C) 2004 Free Software Foundation, Inc.
-#define SIZE		4
+   Contributed by Richard Henderson  <rth@twiddle.net>
-#define UFUNC_NAME	__remlu
+   This file is part of the GNU C Library.
 #define SFUNC_NAME	__reml
-#include "divrem.h"
+   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */
 #include "div_libc.h"
 /* 32-bit signed int remainder.  This is not a normal C function.  Argument
   registers are t10 and t11, the result goes in t12.  Only t12 and AT may
   be clobbered.
   The FPU can handle the division for all input values except zero.
   All we have to do is compute the remainder via multiply-and-subtract.  */
 #ifndef EXTEND
 #define EXTEND(S,D)	sextl S, D
 #endif
 	.text
 	.align	4
 	.globl	__reml
 	.type	__reml, @function
 	.usepv	__reml, no
 	cfi_startproc
 	cfi_return_column (RA)
 __reml:
 	lda	sp, -FRAME(sp)
 	cfi_def_cfa_offset (FRAME)
 	CALL_MCOUNT
 	stt	$f0, 0(sp)
 	stt	$f1, 8(sp)
 	beq	Y, DIVBYZERO
 	cfi_rel_offset ($f0, 0)
 	cfi_rel_offset ($f1, 8)
 	EXTEND	(X, RV)
 	EXTEND	(Y, AT)
 	stq	RV, 16(sp)
 	stq	AT, 24(sp)
 	ldt	$f0, 16(sp)
 	ldt	$f1, 24(sp)
 	cvtqt	$f0, $f0
 	cvtqt	$f1, $f1
 	divt/c	$f0, $f1, $f0
 	cvttq/c	$f0, $f0
 	stt	$f0, 16(sp)
 	ldq	RV, 16(sp)
 	ldt	$f0, 0(sp)
 	mull	RV, Y, RV
 	ldt	$f1, 8(sp)
 	lda	sp, FRAME(sp)
 	cfi_restore ($f0)
 	cfi_restore ($f1)
 	cfi_def_cfa_offset (0)
 	subl	X, RV, RV
 	ret	$31, (RA), 1
 	cfi_endproc
 	.size	__reml, .-__reml
 	DO_DIVBYZERO
--- a/sysdeps/alpha/remlu.S
+++ b/sysdeps/alpha/remlu.S
@ -0,0 +1,4 @@
 #define UNSIGNED
 #define EXTEND(S,D)	zapnot S, 15, D
 #define __reml		__remlu
 #include <reml.S>
--- a/sysdeps/alpha/remq.S
+++ b/sysdeps/alpha/remq.S
@ -1,6 +1,261 @@
-#define IS_REM		1
+/* Copyright (C) 2004 Free Software Foundation, Inc.
-#define SIZE		8
+   This file is part of the GNU C Library.
 #define UFUNC_NAME	__remqu
 #define SFUNC_NAME	__remq
-#include "divrem.h"
+   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */
 #include "div_libc.h"
 /* 64-bit signed long remainder.  These are not normal C functions.  Argument
   registers are t10 and t11, the result goes in t12.  Only t12 and AT may
   be clobbered.
   Theory of operation here is that we can use the FPU divider for virtually
   all operands that we see: all dividend values between -2**53 and 2**53-1
   can be computed directly.  Note that divisor values need not be checked
   against that range because the rounded fp value will be close enough such
   that the quotient is < 1, which will properly be truncated to zero when we
   convert back to integer.
   When the dividend is outside the range for which we can compute exact
   results, we use the fp quotent as an estimate from which we begin refining
   an exact integral value.  This reduces the number of iterations in the
   shift-and-subtract loop significantly.  */
 	.text
 	.align	4
 	.globl	__remq
 	.type	__remq, @function
 	.usepv	__remq, no
 	cfi_startproc
 	cfi_return_column (RA)
 __remq:
 	lda	sp, -FRAME(sp)
 	cfi_def_cfa_offset (FRAME)
 	CALL_MCOUNT
 	/* Get the fp divide insn issued as quickly as possible.  After
 	   that's done, we have at least 22 cycles until its results are
 	   ready -- all the time in the world to figure out how we're
 	   going to use the results.  */
 	stq	X, 16(sp)
 	stq	Y, 24(sp)
 	beq	Y, DIVBYZERO
 	stt	$f0, 0(sp)
 	stt	$f1, 8(sp)
 	cfi_rel_offset ($f0, 0)
 	cfi_rel_offset ($f1, 8)
 	ldt	$f0, 16(sp)
 	ldt	$f1, 24(sp)
 	cvtqt	$f0, $f0
 	cvtqt	$f1, $f1
 	divt/c	$f0, $f1, $f0
 	/* Check to see if X fit in the double as an exact value.  */
 	sll	X, (64-53), AT
 	ldt	$f1, 8(sp)
 	sra	AT, (64-53), AT
 	cmpeq	X, AT, AT
 	beq	AT, $x_big
 	/* If we get here, we're expecting exact results from the division.
 	   Do nothing else besides convert, compute remainder, clean up.  */
 	cvttq/c	$f0, $f0
 	stt	$f0, 16(sp)
 	ldq	AT, 16(sp)
 	mulq	AT, Y, AT
 	ldt	$f0, 0(sp)
 	cfi_restore ($f1)
 	cfi_remember_state
 	cfi_restore ($f0)
 	cfi_def_cfa_offset (0)
 	lda	sp, FRAME(sp)
 	subq	X, AT, RV
 	ret	$31, (RA), 1
 	.align	4
 	cfi_restore_state
 $x_big:
 	/* If we get here, X is large enough that we don't expect exact
 	   results, and neither X nor Y got mis-translated for the fp
 	   division.  Our task is to take the fp result, figure out how
 	   far it's off from the correct result and compute a fixup.  */
 	stq	t0, 16(sp)
 	stq	t1, 24(sp)
 	stq	t2, 32(sp)
 	stq	t5, 40(sp)
 	cfi_rel_offset (t0, 16)
 	cfi_rel_offset (t1, 24)
 	cfi_rel_offset (t2, 32)
 	cfi_rel_offset (t5, 40)
 #define Q	t0		/* quotient */
 #define R	RV		/* remainder */
 #define SY	t1		/* scaled Y */
 #define S	t2		/* scalar */
 #define QY	t3		/* Q*Y */
 	/* The fixup code below can only handle unsigned values.  */
 	or	X, Y, AT
 	mov	$31, t5
 	blt	AT, $fix_sign_in
 $fix_sign_in_ret1:
 	cvttq/c	$f0, $f0
 	stt	$f0, 8(sp)
 	ldq	Q, 8(sp)
 $fix_sign_in_ret2:
 	mulq	Q, Y, QY
 	stq	t4, 8(sp)
 	ldt	$f0, 0(sp)
 	unop
 	cfi_rel_offset (t4, 8)
 	cfi_restore ($f0)
 	stq	t3, 0(sp)
 	unop
 	cfi_rel_offset (t3, 0)
 	subq	QY, X, R
 	mov	Y, SY
 	mov	1, S
 	bgt	R, $q_high
 $q_high_ret:
 	subq	X, QY, R
 	mov	Y, SY
 	mov	1, S
 	bgt	R, $q_low
 $q_low_ret:
 	ldq	t0, 16(sp)
 	ldq	t1, 24(sp)
 	ldq	t2, 32(sp)
 	bne	t5, $fix_sign_out
 $fix_sign_out_ret:
 	ldq	t3, 0(sp)
 	ldq	t4, 8(sp)
 	ldq	t5, 40(sp)
 	lda	sp, FRAME(sp)
 	cfi_remember_state
 	cfi_restore (t0)
 	cfi_restore (t1)
 	cfi_restore (t2)
 	cfi_restore (t3)
 	cfi_restore (t4)
 	cfi_restore (t5)
 	cfi_def_cfa_offset (0)
 	ret	$31, (RA), 1
 	.align	4
 	cfi_restore_state
 	/* The quotient that we computed was too large.  We need to reduce
 	   it by S such that Y*S >= R.  Obviously the closer we get to the
 	   correct value the better, but overshooting high is ok, as we'll
 	   fix that up later.  */
 0:
 	addq	SY, SY, SY
 	addq	S, S, S
 $q_high:
 	cmpult	SY, R, AT
 	bne	AT, 0b
 	subq	Q, S, Q
 	unop
 	subq	QY, SY, QY
 	br	$q_high_ret
 	.align	4
 	/* The quotient that we computed was too small.  Divide Y by the 
 	   current remainder (R) and add that to the existing quotient (Q).
 	   The expectation, of course, is that R is much smaller than X.  */
 	/* Begin with a shift-up loop.  Compute S such that Y*S >= R.  We
 	   already have a copy of Y in SY and the value 1 in S.  */
 0:
 	addq	SY, SY, SY
 	addq	S, S, S
 $q_low:
 	cmpult	SY, R, AT
 	bne	AT, 0b
 	/* Shift-down and subtract loop.  Each iteration compares our scaled
 	   Y (SY) with the remainder (R); if SY <= R then X is divisible by
 	   Y's scalar (S) so add it to the quotient (Q).  */
 2:	addq	Q, S, t3
 	srl	S, 1, S
 	cmpule	SY, R, AT
 	subq	R, SY, t4
 	cmovne	AT, t3, Q
 	cmovne	AT, t4, R
 	srl	SY, 1, SY
 	bne	S, 2b
 	br	$q_low_ret
 	.align	4
 $fix_sign_in:
 	/* If we got here, then X|Y is negative.  Need to adjust everything
 	   such that we're doing unsigned division in the fixup loop.  */
 	/* T5 records the changes we had to make:
 		bit 0:	set if X was negated.  Note that the sign of the
 			remainder follows the sign of the divisor.
 		bit 2:	set if Y was negated.
 	*/
 	xor	X, Y, t1
 	cmplt	X, 0, t5
 	negq	X, t0
 	cmovne	t5, t0, X
 	cmplt	Y, 0, AT
 	negq	Y, t0
 	s4addq	AT, t5, t5
 	cmovne	AT, t0, Y
 	bge	t1, $fix_sign_in_ret1
 	cvttq/c	$f0, $f0
 	stt	$f0, 8(sp)
 	ldq	Q, 8(sp)
 	negq	Q, Q
 	br	$fix_sign_in_ret2
 	.align	4
 $fix_sign_out:
 	/* Now we get to undo what we did above.  */
 	/* ??? Is this really faster than just increasing the size of
 	   the stack frame and storing X and Y in memory?  */
 	and	t5, 4, AT
 	negq	Y, t4
 	cmovne	AT, t4, Y
 	negq	X, t4
 	cmovlbs	t5, t4, X
 	negq	RV, t4
 	cmovlbs	t5, t4, RV
 	br	$fix_sign_out_ret
 	cfi_endproc
 	.size	__remq, .-__remq
 	DO_DIVBYZERO
--- a/sysdeps/alpha/remqu.S
+++ b/sysdeps/alpha/remqu.S
@ -0,0 +1,251 @@
 /* Copyright (C) 2004 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */
 #include "div_libc.h"
 /* 64-bit unsigned long remainder.  These are not normal C functions.  Argument
   registers are t10 and t11, the result goes in t12.  Only t12 and AT may be
   clobbered.
   Theory of operation here is that we can use the FPU divider for virtually
   all operands that we see: all dividend values between -2**53 and 2**53-1
   can be computed directly.  Note that divisor values need not be checked
   against that range because the rounded fp value will be close enough such
   that the quotient is < 1, which will properly be truncated to zero when we
   convert back to integer.
   When the dividend is outside the range for which we can compute exact
   results, we use the fp quotent as an estimate from which we begin refining
   an exact integral value.  This reduces the number of iterations in the
   shift-and-subtract loop significantly.  */
 	.text
 	.align	4
 	.globl	__remqu
 	.type	__remqu, @function
 	.usepv	__remqu, no
 	cfi_startproc
 	cfi_return_column (RA)
 __remqu:
 	lda	sp, -FRAME(sp)
 	cfi_def_cfa_offset (FRAME)
 	CALL_MCOUNT
 	/* Get the fp divide insn issued as quickly as possible.  After
 	   that's done, we have at least 22 cycles until its results are
 	   ready -- all the time in the world to figure out how we're
 	   going to use the results.  */
 	stq	X, 16(sp)
 	stq	Y, 24(sp)
 	beq	Y, DIVBYZERO
 	stt	$f0, 0(sp)
 	stt	$f1, 8(sp)
 	cfi_rel_offset ($f0, 0)
 	cfi_rel_offset ($f1, 8)
 	ldt	$f0, 16(sp)
 	ldt	$f1, 24(sp)
 	cvtqt	$f0, $f0
 	cvtqt	$f1, $f1
 	blt	X, $x_is_neg
 	divt/c	$f0, $f1, $f0
 	/* Check to see if Y was mis-converted as signed value.  */
 	ldt	$f1, 8(sp)
 	unop
 	nop
 	blt	Y, $y_is_neg
 	/* Check to see if X fit in the double as an exact value.  */
 	srl	X, 53, AT
 	bne	AT, $x_big
 	/* If we get here, we're expecting exact results from the division.
 	   Do nothing else besides convert, compute remainder, clean up.  */
 	cvttq/c	$f0, $f0
 	stt	$f0, 16(sp)
 	ldq	AT, 16(sp)
 	mulq	AT, Y, AT
 	ldt	$f0, 0(sp)
 	lda	sp, FRAME(sp)
 	cfi_remember_state
 	cfi_restore ($f0)
 	cfi_restore ($f1)
 	cfi_def_cfa_offset (0)
 	subq	X, AT, RV
 	ret	$31, (RA), 1
 	.align	4
 	cfi_restore_state
 $x_is_neg:
 	/* If we get here, X is so big that bit 63 is set, which made the
 	   conversion come out negative.  Fix it up lest we not even get
 	   a good estimate.  */
 	ldah	AT, 0x5f80		/* 2**64 as float.  */
 	stt	$f2, 24(sp)
 	cfi_rel_offset ($f2, 24)
 	stl	AT, 16(sp)
 	lds	$f2, 16(sp)
 	addt	$f0, $f2, $f0
 	unop
 	divt/c	$f0, $f1, $f0
 	unop
 	/* Ok, we've now the divide issued.  Continue with other checks.  */
 	ldt	$f1, 8(sp)
 	unop
 	ldt	$f2, 24(sp)
 	blt	Y, $y_is_neg
 	cfi_restore ($f1)
 	cfi_restore ($f2)
 	cfi_remember_state	/* for y_is_neg */
 	.align	4
 $x_big:
 	/* If we get here, X is large enough that we don't expect exact
 	   results, and neither X nor Y got mis-translated for the fp
 	   division.  Our task is to take the fp result, figure out how
 	   far it's off from the correct result and compute a fixup.  */
 	stq	t0, 16(sp)
 	stq	t1, 24(sp)
 	stq	t2, 32(sp)
 	stq	t3, 40(sp)
 	cfi_rel_offset (t0, 16)
 	cfi_rel_offset (t1, 24)
 	cfi_rel_offset (t2, 32)
 	cfi_rel_offset (t3, 40)
 #define Q	t0		/* quotient */
 #define R	RV		/* remainder */
 #define SY	t1		/* scaled Y */
 #define S	t2		/* scalar */
 #define QY	t3		/* Q*Y */
 	cvttq/c	$f0, $f0
 	stt	$f0, 8(sp)
 	ldq	Q, 8(sp)
 	mulq	Q, Y, QY
 	stq	t4, 8(sp)
 	unop
 	ldt	$f0, 0(sp)
 	unop
 	cfi_rel_offset (t4, 8)
 	cfi_restore ($f0)
 	subq	QY, X, R
 	mov	Y, SY
 	mov	1, S
 	bgt	R, $q_high
 $q_high_ret:
 	subq	X, QY, R
 	mov	Y, SY
 	mov	1, S
 	bgt	R, $q_low
 $q_low_ret:
 	ldq	t4, 8(sp)
 	ldq	t0, 16(sp)
 	ldq	t1, 24(sp)
 	ldq	t2, 32(sp)
 	ldq	t3, 40(sp)
 	lda	sp, FRAME(sp)
 	cfi_remember_state
 	cfi_restore (t0)
 	cfi_restore (t1)
 	cfi_restore (t2)
 	cfi_restore (t3)
 	cfi_restore (t4)
 	cfi_def_cfa_offset (0)
 	ret	$31, (RA), 1
 	.align	4
 	cfi_restore_state
 	/* The quotient that we computed was too large.  We need to reduce
 	   it by S such that Y*S >= R.  Obviously the closer we get to the
 	   correct value the better, but overshooting high is ok, as we'll
 	   fix that up later.  */
 0:
 	addq	SY, SY, SY
 	addq	S, S, S
 $q_high:
 	cmpult	SY, R, AT
 	bne	AT, 0b
 	subq	Q, S, Q
 	unop
 	subq	QY, SY, QY
 	br	$q_high_ret
 	.align	4
 	/* The quotient that we computed was too small.  Divide Y by the 
 	   current remainder (R) and add that to the existing quotient (Q).
 	   The expectation, of course, is that R is much smaller than X.  */
 	/* Begin with a shift-up loop.  Compute S such that Y*S >= R.  We
 	   already have a copy of Y in SY and the value 1 in S.  */
 0:
 	addq	SY, SY, SY
 	addq	S, S, S
 $q_low:
 	cmpult	SY, R, AT
 	bne	AT, 0b
 	/* Shift-down and subtract loop.  Each iteration compares our scaled
 	   Y (SY) with the remainder (R); if SY <= R then X is divisible by
 	   Y's scalar (S) so add it to the quotient (Q).  */
 2:	addq	Q, S, t3
 	srl	S, 1, S
 	cmpule	SY, R, AT
 	subq	R, SY, t4
 	cmovne	AT, t3, Q
 	cmovne	AT, t4, R
 	srl	SY, 1, SY
 	bne	S, 2b
 	br	$q_low_ret
 	.align	4
 	cfi_restore_state
 $y_is_neg:
 	/* If we get here, Y is so big that bit 63 is set.  The results
 	   from the divide will be completely wrong.  Fortunately, the
 	   quotient must be either 0 or 1, so the remainder must be X
 	   or X-Y, so just compute it directly.  */
 	cmpult	Y, X, AT
 	subq	X, Y, RV
 	ldt	$f0, 0(sp)
 	cmoveq	AT, X, RV
 	lda	sp, FRAME(sp)
 	cfi_restore ($f0)
 	cfi_def_cfa_offset (0)
 	ret	$31, (RA), 1
 	cfi_endproc
 	.size	__remqu, .-__remqu
 	DO_DIVBYZERO