Updated from /src/gmp-1.937

1996-03-01 18:43:45 +00:00 · 1996-03-01 18:43:45 +00:00 · 3de9f02e92
parent f860256b2e
commit 3de9f02e92
34 changed files with 1393 additions and 419 deletions
--- a/sysdeps/alpha/addmul_1.s
+++ b/sysdeps/alpha/addmul_1.s
@ -26,16 +26,7 @@
 # size		r18
 # s2_limb	r19
- # This code runs at 42 cycles/limb on the 21064.
+ # This code runs at 42 cycles/limb on EV4 and 18 cycles/limb on EV5.
 # To improve performance for long multiplications, we would use
 # 'fetch' for S1 and 'fetch_m' for RES.  It's not obvious how to use
 # these instructions without slowing down the general code: 1. We can
 # only have two prefetches in operation at any time in the Alpha
 # architecture.  2. There will seldom be any special alignment
 # between RES_PTR and S1_PTR.  Maybe we can simply divide the current
 # loop into an inner and outer loop, having the inner loop handle
 # exactly one prefetch block?
 	.set	noreorder
 	.set	noat
@ -52,7 +43,7 @@ __mpn_addmul_1:
 	mulq	$2,$19,$3	# $3 = prod_low
 	ldq	$5,0($16)	# $5 = *res_ptr
 	umulh	$2,$19,$0	# $0 = prod_high
-	beq	$18,Lend1	# jump if size was == 1
+	beq	$18,.Lend1	# jump if size was == 1
 	ldq	$2,0($17)	# $2 = s1_limb
 	addq	$17,8,$17	# s1_ptr++
 	subq	$18,1,$18	# size--
@ -60,10 +51,10 @@ __mpn_addmul_1:
 	cmpult	$3,$5,$4
 	stq	$3,0($16)
 	addq	$16,8,$16	# res_ptr++
-	beq	$18,Lend2	# jump if size was == 2
+	beq	$18,.Lend2	# jump if size was == 2
 	.align	3
-Loop:	mulq	$2,$19,$3	# $3 = prod_low
+.Loop:	mulq	$2,$19,$3	# $3 = prod_low
 	ldq	$5,0($16)	# $5 = *res_ptr
 	addq	$4,$0,$0	# cy_limb = cy_limb + 'cy'
 	subq	$18,1,$18	# size--
@ -77,9 +68,9 @@ Loop:	mulq	$2,$19,$3	# $3 = prod_low
 	stq	$3,0($16)
 	addq	$16,8,$16	# res_ptr++
 	addq	$5,$0,$0	# combine carries
-	bne	$18,Loop
+	bne	$18,.Loop
-Lend2:	mulq	$2,$19,$3	# $3 = prod_low
+.Lend2:	mulq	$2,$19,$3	# $3 = prod_low
 	ldq	$5,0($16)	# $5 = *res_ptr
 	addq	$4,$0,$0	# cy_limb = cy_limb + 'cy'
 	umulh	$2,$19,$4	# $4 = cy_limb
@ -91,7 +82,7 @@ Lend2:	mulq	$2,$19,$3	# $3 = prod_low
 	addq	$5,$0,$0	# combine carries
 	addq	$4,$0,$0	# cy_limb = prod_high + cy
 	ret	$31,($26),1
-Lend1:	addq	$5,$3,$3
+.Lend1:	addq	$5,$3,$3
 	cmpult	$3,$5,$5
 	stq	$3,0($16)
 	addq	$0,$5,$0
--- a/sysdeps/alpha/alphaev5/add_n.s
+++ b/sysdeps/alpha/alphaev5/add_n.s
@ -35,84 +35,113 @@
 __mpn_add_n:
 	.frame	$30,0,$26,0
-	ldq	$3,0($17)
+	or	$31,$31,$25		# clear cy
-	ldq	$4,0($18)
+	subq	$19,4,$19		# decr loop cnt
-
+	blt	$19,.Lend2		# if less than 4 limbs, goto 2nd loop
-	subq	$19,1,$19
+ # Start software pipeline for 1st loop
-	and	$19,4-1,$2	# number of limbs in first loop
+	ldq	$0,0($18)
-	bis	$31,$31,$0
+	ldq	$1,8($18)
-	beq	$2,.L0		# if multiple of 4 limbs, skip first loop
+	ldq	$4,0($17)
 	subq	$19,$2,$19
 .Loop0:	subq	$2,1,$2
 	ldq	$5,8($17)
-	addq	$4,$0,$4
+	addq	$17,32,$17		# update s1_ptr
-	ldq	$6,8($18)
+	ldq	$2,16($18)
-	cmpult	$4,$0,$1
+	addq	$0,$4,$20		# 1st main add
-	addq	$3,$4,$4
+	ldq	$3,24($18)
-	cmpult	$4,$3,$0
+	subq	$19,4,$19		# decr loop cnt
-	stq	$4,0($16)
+	ldq	$6,-16($17)
-	or	$0,$1,$0
+	cmpult	$20,$0,$25		# compute cy from last add
-
+	ldq	$7,-8($17)
-	addq	$17,8,$17
+	addq	$1,$25,$28		# cy add
-	addq	$18,8,$18
+	addq	$18,32,$18		# update s2_ptr
-	bis	$5,$5,$3
+	addq	$5,$28,$21		# 2nd main add
-	bis	$6,$6,$4
+	cmpult	$28,$25,$8		# compute cy from last add
-	addq	$16,8,$16
+	blt	$19,.Lend1		# if less than 4 limbs remain, jump
-	bne	$2,.Loop0
+ # 1st loop handles groups of 4 limbs in a software pipeline
 .L0:	beq	$19,.Lend
 	.align	4
-.Loop:	subq	$19,4,$19
+.Loop:	cmpult	$21,$28,$25		# compute cy from last add
-	unop
+	ldq	$0,0($18)
-
+	or	$8,$25,$25		# combine cy from the two adds
-	ldq	$6,8($18)
+	ldq	$1,8($18)
-	addq	$4,$0,$0
+	addq	$2,$25,$28		# cy add
 	ldq	$4,0($17)
 	addq	$28,$6,$22		# 3rd main add
 	ldq	$5,8($17)
-	cmpult	$0,$4,$1
+	cmpult	$28,$25,$8		# compute cy from last add
-	ldq	$4,16($18)
+	cmpult	$22,$28,$25		# compute cy from last add
 	addq	$3,$0,$20
 	cmpult	$20,$3,$0
 	ldq	$3,16($17)
 	or	$0,$1,$0
 	addq	$6,$0,$0
 	cmpult	$0,$6,$1
 	ldq	$6,24($18)
 	addq	$5,$0,$21
 	cmpult	$21,$5,$0
 	ldq	$5,24($17)
 	or	$0,$1,$0
 	addq	$4,$0,$0
 	cmpult	$0,$4,$1
 	ldq	$4,32($18)
 	addq	$3,$0,$22
 	cmpult	$22,$3,$0
 	ldq	$3,32($17)
 	or	$0,$1,$0
 	addq	$6,$0,$0
 	cmpult	$0,$6,$1
 	addq	$5,$0,$23
 	cmpult	$23,$5,$0
 	or	$0,$1,$0
 	stq	$20,0($16)
 	or	$8,$25,$25		# combine cy from the two adds
 	stq	$21,8($16)
-	stq	$22,16($16)
+	addq	$3,$25,$28		# cy add
-	stq	$23,24($16)
+	addq	$28,$7,$23		# 4th main add
 	cmpult	$28,$25,$8		# compute cy from last add
 	cmpult	$23,$28,$25		# compute cy from last add
 	addq	$17,32,$17		# update s1_ptr
 	or	$8,$25,$25		# combine cy from the two adds
 	addq	$16,32,$16		# update res_ptr
 	addq	$0,$25,$28		# cy add
 	ldq	$2,16($18)
 	addq	$4,$28,$20		# 1st main add
 	ldq	$3,24($18)
 	cmpult	$28,$25,$8		# compute cy from last add
 	ldq	$6,-16($17)
 	cmpult	$20,$28,$25		# compute cy from last add
 	ldq	$7,-8($17)
 	or	$8,$25,$25		# combine cy from the two adds
 	subq	$19,4,$19		# decr loop cnt
 	stq	$22,-16($16)
 	addq	$1,$25,$28		# cy add
 	stq	$23,-8($16)
 	addq	$5,$28,$21		# 2nd main add
 	addq	$18,32,$18		# update s2_ptr
 	cmpult	$28,$25,$8		# compute cy from last add
 	bge	$19,.Loop
 # Finish software pipeline for 1st loop
 .Lend1:	cmpult	$21,$28,$25		# compute cy from last add
 	or	$8,$25,$25		# combine cy from the two adds
 	addq	$2,$25,$28		# cy add
 	addq	$28,$6,$22		# 3rd main add
 	cmpult	$28,$25,$8		# compute cy from last add
 	cmpult	$22,$28,$25		# compute cy from last add
 	stq	$20,0($16)
 	or	$8,$25,$25		# combine cy from the two adds
 	stq	$21,8($16)
 	addq	$3,$25,$28		# cy add
 	addq	$28,$7,$23		# 4th main add
 	cmpult	$28,$25,$8		# compute cy from last add
 	cmpult	$23,$28,$25		# compute cy from last add
 	or	$8,$25,$25		# combine cy from the two adds
 	addq	$16,32,$16		# update res_ptr
 	stq	$22,-16($16)
 	stq	$23,-8($16)
 .Lend2:	addq	$19,4,$19		# restore loop cnt
 	beq	$19,.Lret
 # Start software pipeline for 2nd loop
 	ldq	$0,0($18)
 	ldq	$4,0($17)
 	subq	$19,1,$19
 	beq	$19,.Lend0
 # 2nd loop handles remaining 1-3 limbs
 	.align	4
 .Loop0:	addq	$0,$25,$28		# cy add
 	ldq	$0,8($18)
 	addq	$4,$28,$20		# main add
 	ldq	$4,8($17)
 	addq	$18,8,$18
 	cmpult	$28,$25,$8		# compute cy from last add
 	addq	$17,8,$17
 	stq	$20,0($16)
 	cmpult	$20,$28,$25		# compute cy from last add
 	subq	$19,1,$19		# decr loop cnt
 	or	$8,$25,$25		# combine cy from the two adds
 	addq	$16,8,$16
 	bne	$19,.Loop0
 .Lend0:	addq	$0,$25,$28		# cy add
 	addq	$4,$28,$20		# main add
 	cmpult	$28,$25,$8		# compute cy from last add
 	cmpult	$20,$28,$25		# compute cy from last add
 	stq	$20,0($16)
 	or	$8,$25,$25		# combine cy from the two adds
-	addq	$17,32,$17
+.Lret:	or	$25,$31,$0		# return cy
 	addq	$18,32,$18
 	addq	$16,32,$16
 	bne	$19,.Loop
 .Lend:	addq	$4,$0,$4
 	cmpult	$4,$0,$1
 	addq	$3,$4,$4
 	cmpult	$4,$3,$0
 	stq	$4,0($16)
 	or	$0,$1,$0
 	ret	$31,($26),1
 	.end	__mpn_add_n
--- a/sysdeps/alpha/alphaev5/lshift.s
+++ b/sysdeps/alpha/alphaev5/lshift.s
@ -25,7 +25,7 @@
 # size		r18
 # cnt		r19
- # This code runs at 4.25 cycles/limb on the EV5.
+ # This code runs at 3.25 cycles/limb on the EV5.
 	.set	noreorder
 	.set	noat
@ -44,11 +44,11 @@ __mpn_lshift:
 	and	$18,4-1,$28	# number of limbs in first loop
 	srl	$4,$20,$0	# compute function result
-	beq	$28,L0
+	beq	$28,.L0
 	subq	$18,$28,$18
 	.align	3
-Loop0:	ldq	$3,-16($17)
+.Loop0:	ldq	$3,-16($17)
 	subq	$16,8,$16
 	sll	$4,$19,$5
 	subq	$17,8,$17
@ -57,17 +57,17 @@ Loop0:	ldq	$3,-16($17)
 	or	$3,$3,$4
 	or	$5,$6,$8
 	stq	$8,0($16)
-	bne	$28,Loop0
+	bne	$28,.Loop0
-L0:	sll	$4,$19,$24
+.L0:	sll	$4,$19,$24
-	beq	$18,Lend
+	beq	$18,.Lend
 # warm up phase 1
 	ldq	$1,-16($17)
 	subq	$18,4,$18
 	ldq	$2,-24($17)
 	ldq	$3,-32($17)
 	ldq	$4,-40($17)
-	beq	$18,Lcool1
+	beq	$18,.Lend1
 # warm up phase 2
 	srl	$1,$20,$7
 	sll	$1,$19,$21
@ -84,10 +84,10 @@ L0:	sll	$4,$19,$24
 	sll	$4,$19,$24
 	ldq	$4,-72($17)
 	subq	$18,4,$18
-	beq	$18,Lcool1
+	beq	$18,.Lend2
 	.align  4
 # main loop
-Loop:	stq	$7,-8($16)
+.Loop:	stq	$7,-8($16)
 	or	$5,$22,$5
 	stq	$8,-16($16)
 	or	$6,$23,$6
@ -113,16 +113,14 @@ Loop:	stq	$7,-8($16)
 	subq	$16,32,$16
 	srl	$4,$20,$6
-	ldq	$3,-96($17
+	ldq	$3,-96($17)
 	sll	$4,$19,$24
 	ldq	$4,-104($17)
 	subq	$17,32,$17
-	bne	$18,Loop
+	bne	$18,.Loop
 	unop
 	unop
 # cool down phase 2/1
-Lcool1:	stq	$7,-8($16)
+.Lend2:	stq	$7,-8($16)
 	or	$5,$22,$5
 	stq	$8,-16($16)
 	or	$6,$23,$6
@ -150,7 +148,7 @@ Lcool1:	stq	$7,-8($16)
 	ret	$31,($26),1
 # cool down phase 1/1
-Lcool1:	srl	$1,$20,$7
+.Lend1:	srl	$1,$20,$7
 	sll	$1,$19,$21
 	srl	$2,$20,$8
 	sll	$2,$19,$22
@ -170,6 +168,6 @@ Lcool1:	srl	$1,$20,$7
 	stq	$24,-40($16)
 	ret	$31,($26),1
-Lend	stq	$24,-8($16)
+.Lend:	stq	$24,-8($16)
 	ret	$31,($26),1
 	.end	__mpn_lshift
--- a/sysdeps/alpha/alphaev5/rshift.s
+++ b/sysdeps/alpha/alphaev5/rshift.s
@ -25,7 +25,7 @@
 # size		r18
 # cnt		r19
- # This code runs at 4.25 cycles/limb on the EV5.
+ # This code runs at 3.25 cycles/limb on the EV5.
 	.set	noreorder
 	.set	noat
@ -42,11 +42,11 @@ __mpn_rshift:
 	and	$18,4-1,$28	# number of limbs in first loop
 	sll	$4,$20,$0	# compute function result
-	beq	$28,L0
+	beq	$28,.L0
 	subq	$18,$28,$18
 	.align	3
-Loop0:	ldq	$3,8($17)
+.Loop0:	ldq	$3,8($17)
 	addq	$16,8,$16
 	srl	$4,$19,$5
 	addq	$17,8,$17
@ -55,17 +55,17 @@ Loop0:	ldq	$3,8($17)
 	or	$3,$3,$4
 	or	$5,$6,$8
 	stq	$8,-8($16)
-	bne	$28,Loop0
+	bne	$28,.Loop0
-L0:	srl	$4,$19,$24
+.L0:	srl	$4,$19,$24
-	beq	$18,Lend
+	beq	$18,.Lend
 # warm up phase 1
 	ldq	$1,8($17)
 	subq	$18,4,$18
 	ldq	$2,16($17)
 	ldq	$3,24($17)
 	ldq	$4,32($17)
-	beq	$18,Lcool1
+	beq	$18,.Lend1
 # warm up phase 2
 	sll	$1,$20,$7
 	srl	$1,$19,$21
@ -82,10 +82,10 @@ L0:	srl	$4,$19,$24
 	srl	$4,$19,$24
 	ldq	$4,64($17)
 	subq	$18,4,$18
-	beq	$18,Lcool2
+	beq	$18,.Lend2
 	.align  4
 # main loop
-Loop:	stq	$7,0($16)
+.Loop:	stq	$7,0($16)
 	or	$5,$22,$5
 	stq	$8,8($16)
 	or	$6,$23,$6
@ -116,11 +116,9 @@ Loop:	stq	$7,0($16)
 	ldq	$4,96($17)
 	addq	$17,32,$17
-	bne	$18,Loop
+	bne	$18,.Loop
 	unop
 	unop
 # cool down phase 2/1
-Lcool2:	stq	$7,0($16)
+.Lend2:	stq	$7,0($16)
 	or	$5,$22,$5
 	stq	$8,8($16)
 	or	$6,$23,$6
@ -148,7 +146,7 @@ Lcool2:	stq	$7,0($16)
 	ret	$31,($26),1
 # cool down phase 1/1
-Lcool1:	sll	$1,$20,$7
+.Lend1:	sll	$1,$20,$7
 	srl	$1,$19,$21
 	sll	$2,$20,$8
 	srl	$2,$19,$22
@ -168,6 +166,6 @@ Lcool1:	sll	$1,$20,$7
 	stq	$24,32($16)
 	ret	$31,($26),1
-Lend:	stq	$24,0($16)
+.Lend:	stq	$24,0($16)
 	ret	$31,($26),1
 	.end	__mpn_rshift
--- a/sysdeps/alpha/alphaev5/sub_n.s
+++ b/sysdeps/alpha/alphaev5/sub_n.s
@ -0,0 +1,148 @@
 # Alpha __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
 # store difference in a third limb vector.
 # Copyright (C) 1995 Free Software Foundation, Inc.
 # This file is part of the GNU MP Library.
 # The GNU MP Library is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Library General Public License as published by
 # the Free Software Foundation; either version 2 of the License, or (at your
 # option) any later version.
 # The GNU MP Library is distributed in the hope that it will be useful, but
 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
 # License for more details.
 # You should have received a copy of the GNU Library General Public License
 # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
 # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 # INPUT PARAMETERS
 # res_ptr	$16
 # s1_ptr	$17
 # s2_ptr	$18
 # size		$19
 	.set	noreorder
 	.set	noat
 .text
 	.align	3
 	.globl	__mpn_sub_n
 	.ent	__mpn_sub_n
 # __mpn_sub_n: limb-wise subtraction res = s1 - s2 with borrow propagation.
 # Register usage:
 #   $16 res_ptr, $17 s1_ptr, $18 s2_ptr, $19 size / loop counter
 #   $0-$3  s2 limbs, $4-$7 s1 limbs, $20-$23 result limbs
 #   $25    running cy (borrow), $28 = s2 limb + cy, $8 = carry out of that add
 # The final borrow is returned in $0.
 __mpn_sub_n:
 	.frame	$30,0,$26,0
 	or	$31,$31,$25		# clear cy
 	subq	$19,4,$19		# decr loop cnt
 	blt	$19,.Lend2		# if less than 4 limbs, goto 2nd loop
 # Start software pipeline for 1st loop
 	ldq	$0,0($18)
 	ldq	$1,8($18)
 	ldq	$4,0($17)
 	ldq	$5,8($17)
 	addq	$17,32,$17		# update s1_ptr
 	ldq	$2,16($18)
 	subq	$4,$0,$20		# 1st main sub
 	ldq	$3,24($18)
 	subq	$19,4,$19		# decr loop cnt
 	ldq	$6,-16($17)
 	cmpult	$4,$20,$25		# compute cy (borrow) from last sub
 	ldq	$7,-8($17)
 	addq	$1,$25,$28		# cy add
 	addq	$18,32,$18		# update s2_ptr
 	subq	$5,$28,$21		# 2nd main sub
 	cmpult	$28,$25,$8		# compute cy from last add
 	blt	$19,.Lend1		# if less than 4 limbs remain, jump
 # 1st loop handles groups of 4 limbs in a software pipeline
 	.align	4
 .Loop:	cmpult	$5,$21,$25		# compute cy (borrow) from last sub
 	ldq	$0,0($18)
 	or	$8,$25,$25		# combine cy from the cy add and the sub
 	ldq	$1,8($18)
 	addq	$2,$25,$28		# cy add
 	ldq	$4,0($17)
 	subq	$6,$28,$22		# 3rd main sub
 	ldq	$5,8($17)
 	cmpult	$28,$25,$8		# compute cy from last add
 	cmpult	$6,$22,$25		# compute cy (borrow) from last sub
 	stq	$20,0($16)
 	or	$8,$25,$25		# combine cy from the cy add and the sub
 	stq	$21,8($16)
 	addq	$3,$25,$28		# cy add
 	subq	$7,$28,$23		# 4th main sub
 	cmpult	$28,$25,$8		# compute cy from last add
 	cmpult	$7,$23,$25		# compute cy (borrow) from last sub
 	addq	$17,32,$17		# update s1_ptr
 	or	$8,$25,$25		# combine cy from the cy add and the sub
 	addq	$16,32,$16		# update res_ptr
 	addq	$0,$25,$28		# cy add
 	ldq	$2,16($18)
 	subq	$4,$28,$20		# 1st main sub
 	ldq	$3,24($18)
 	cmpult	$28,$25,$8		# compute cy from last add
 	ldq	$6,-16($17)
 	cmpult	$4,$20,$25		# compute cy (borrow) from last sub
 	ldq	$7,-8($17)
 	or	$8,$25,$25		# combine cy from the cy add and the sub
 	subq	$19,4,$19		# decr loop cnt
 	stq	$22,-16($16)
 	addq	$1,$25,$28		# cy add
 	stq	$23,-8($16)
 	subq	$5,$28,$21		# 2nd main sub
 	addq	$18,32,$18		# update s2_ptr
 	cmpult	$28,$25,$8		# compute cy from last add
 	bge	$19,.Loop
 # Finish software pipeline for 1st loop
 .Lend1:	cmpult	$5,$21,$25		# compute cy (borrow) from last sub
 	or	$8,$25,$25		# combine cy from the cy add and the sub
 	addq	$2,$25,$28		# cy add
 	subq	$6,$28,$22		# 3rd main sub
 	cmpult	$28,$25,$8		# compute cy from last add
 	cmpult	$6,$22,$25		# compute cy (borrow) from last sub
 	stq	$20,0($16)
 	or	$8,$25,$25		# combine cy from the cy add and the sub
 	stq	$21,8($16)
 	addq	$3,$25,$28		# cy add
 	subq	$7,$28,$23		# 4th main sub
 	cmpult	$28,$25,$8		# compute cy from last add
 	cmpult	$7,$23,$25		# compute cy (borrow) from last sub
 	or	$8,$25,$25		# combine cy from the cy add and the sub
 	addq	$16,32,$16		# update res_ptr
 	stq	$22,-16($16)
 	stq	$23,-8($16)
 .Lend2:	addq	$19,4,$19		# restore loop cnt
 	beq	$19,.Lret		# nothing left: just return cy
 # Start software pipeline for 2nd loop
 	ldq	$0,0($18)
 	ldq	$4,0($17)
 	subq	$19,1,$19
 	beq	$19,.Lend0		# exactly one limb left: skip the loop
 # 2nd loop handles remaining 1-3 limbs
 	.align	4
 .Loop0:	addq	$0,$25,$28		# cy add
 	ldq	$0,8($18)
 	subq	$4,$28,$20		# main sub
 	ldq	$1,8($17)		# next s1 limb goes to $1; $4 is still needed below
 	addq	$18,8,$18
 	cmpult	$28,$25,$8		# compute cy from last add
 	addq	$17,8,$17
 	stq	$20,0($16)
 	cmpult	$4,$20,$25		# compute cy (borrow) from last sub
 	subq	$19,1,$19		# decr loop cnt
 	or	$8,$25,$25		# combine cy from the cy add and the sub
 	addq	$16,8,$16
 	or	$1,$31,$4		# move next s1 limb into $4
 	bne	$19,.Loop0
 .Lend0:	addq	$0,$25,$28		# cy add
 	subq	$4,$28,$20		# main sub
 	cmpult	$28,$25,$8		# compute cy from last add
 	cmpult	$4,$20,$25		# compute cy (borrow) from last sub
 	stq	$20,0($16)
 	or	$8,$25,$25		# combine cy from the cy add and the sub
 .Lret:	or	$25,$31,$0		# return cy
 	ret	$31,($26),1
 	.end	__mpn_sub_n
--- a/sysdeps/alpha/lshift.s
+++ b/sysdeps/alpha/lshift.s
@ -53,11 +53,11 @@ __mpn_lshift:
 	and	$18,4-1,$20	# number of limbs in first loop
 	srl	$4,$7,$0	# compute function result
-	beq	$20,L0
+	beq	$20,.L0
 	subq	$18,$20,$18
 	.align	3
-Loop0:
+.Loop0:
 	ldq	$3,-8($17)
 	subq	$16,8,$16
 	subq	$17,8,$17
@ -67,12 +67,12 @@ Loop0:
 	bis	$3,$3,$4
 	bis	$5,$6,$8
 	stq	$8,0($16)
-	bne	$20,Loop0
+	bne	$20,.Loop0
-L0:	beq	$18,Lend
+.L0:	beq	$18,.Lend
 	.align	3
-Loop:	ldq	$3,-8($17)
+.Loop:	ldq	$3,-8($17)
 	subq	$16,32,$16
 	subq	$18,4,$18
 	sll	$4,$19,$5
@ -100,9 +100,9 @@ Loop:	ldq	$3,-8($17)
 	bis	$1,$2,$8
 	stq	$8,0($16)
-	bgt	$18,Loop
+	bgt	$18,.Loop
-Lend:	sll	$4,$19,$8
+.Lend:	sll	$4,$19,$8
 	stq	$8,-8($16)
 	ret	$31,($26),1
 	.end	__mpn_lshift
--- a/sysdeps/alpha/mul_1.s
+++ b/sysdeps/alpha/mul_1.s
@ -1,7 +1,7 @@
 # Alpha 21064 __mpn_mul_1 -- Multiply a limb vector with a limb and store
 # the result in a second limb vector.
- # Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+ # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
 # This file is part of the GNU MP Library.
--- a/sysdeps/alpha/rshift.s
+++ b/sysdeps/alpha/rshift.s
@ -51,11 +51,11 @@ __mpn_rshift:
 	and	$18,4-1,$20	# number of limbs in first loop
 	sll	$4,$7,$0	# compute function result
-	beq	$20,L0
+	beq	$20,.L0
 	subq	$18,$20,$18
 	.align	3
-Loop0:
+.Loop0:
 	ldq	$3,0($17)
 	addq	$16,8,$16
 	addq	$17,8,$17
@ -65,12 +65,12 @@ Loop0:
 	bis	$3,$3,$4
 	bis	$5,$6,$8
 	stq	$8,-8($16)
-	bne	$20,Loop0
+	bne	$20,.Loop0
-L0:	beq	$18,Lend
+.L0:	beq	$18,.Lend
 	.align	3
-Loop:	ldq	$3,0($17)
+.Loop:	ldq	$3,0($17)
 	addq	$16,32,$16
 	subq	$18,4,$18
 	srl	$4,$19,$5
@ -98,9 +98,9 @@ Loop:	ldq	$3,0($17)
 	bis	$1,$2,$8
 	stq	$8,-8($16)
-	bgt	$18,Loop
+	bgt	$18,.Loop
-Lend:	srl	$4,$19,$8
+.Lend:	srl	$4,$19,$8
 	stq	$8,0($16)
 	ret	$31,($26),1
 	.end	__mpn_rshift
--- a/sysdeps/alpha/submul_1.s
+++ b/sysdeps/alpha/submul_1.s
@ -26,16 +26,7 @@
 # size		r18
 # s2_limb	r19
- # This code runs at 42 cycles/limb on the 21064.
+ # This code runs at 42 cycles/limb on EV4 and 18 cycles/limb on EV5.
 # To improve performance for long multiplications, we would use
 # 'fetch' for S1 and 'fetch_m' for RES.  It's not obvious how to use
 # these instructions without slowing down the general code: 1. We can
 # only have two prefetches in operation at any time in the Alpha
 # architecture.  2. There will seldom be any special alignment
 # between RES_PTR and S1_PTR.  Maybe we can simply divide the current
 # loop into an inner and outer loop, having the inner loop handle
 # exactly one prefetch block?
 	.set	noreorder
 	.set	noat
@ -52,7 +43,7 @@ __mpn_submul_1:
 	mulq	$2,$19,$3	# $3 = prod_low
 	ldq	$5,0($16)	# $5 = *res_ptr
 	umulh	$2,$19,$0	# $0 = prod_high
-	beq	$18,Lend1	# jump if size was == 1
+	beq	$18,.Lend1	# jump if size was == 1
 	ldq	$2,0($17)	# $2 = s1_limb
 	addq	$17,8,$17	# s1_ptr++
 	subq	$18,1,$18	# size--
@ -60,10 +51,10 @@ __mpn_submul_1:
 	cmpult	$5,$3,$4
 	stq	$3,0($16)
 	addq	$16,8,$16	# res_ptr++
-	beq	$18,Lend2	# jump if size was == 2
+	beq	$18,.Lend2	# jump if size was == 2
 	.align	3
-Loop:	mulq	$2,$19,$3	# $3 = prod_low
+.Loop:	mulq	$2,$19,$3	# $3 = prod_low
 	ldq	$5,0($16)	# $5 = *res_ptr
 	addq	$4,$0,$0	# cy_limb = cy_limb + 'cy'
 	subq	$18,1,$18	# size--
@ -77,9 +68,9 @@ Loop:	mulq	$2,$19,$3	# $3 = prod_low
 	stq	$3,0($16)
 	addq	$16,8,$16	# res_ptr++
 	addq	$5,$0,$0	# combine carries
-	bne	$18,Loop
+	bne	$18,.Loop
-Lend2:	mulq	$2,$19,$3	# $3 = prod_low
+.Lend2:	mulq	$2,$19,$3	# $3 = prod_low
 	ldq	$5,0($16)	# $5 = *res_ptr
 	addq	$4,$0,$0	# cy_limb = cy_limb + 'cy'
 	umulh	$2,$19,$4	# $4 = cy_limb
@ -91,7 +82,7 @@ Lend2:	mulq	$2,$19,$3	# $3 = prod_low
 	addq	$5,$0,$0	# combine carries
 	addq	$4,$0,$0	# cy_limb = prod_high + cy
 	ret	$31,($26),1
-Lend1:	subq	$5,$3,$3
+.Lend1:	subq	$5,$3,$3
 	cmpult	$5,$3,$5
 	stq	$3,0($16)
 	addq	$0,$5,$0
--- a/sysdeps/alpha/udiv_qrnnd.S
+++ b/sysdeps/alpha/udiv_qrnnd.S
@ -1,6 +1,6 @@
 # Alpha 21064 __udiv_qrnnd
- # Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+ # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
 # This file is part of the GNU MP Library.
@ -21,13 +21,11 @@
        .set noreorder
        .set noat
 .text
-        .align 3
+        .align	3
-        .globl __udiv_qrnnd
+        .globl	__udiv_qrnnd
-        .ent __udiv_qrnnd 0
+        .ent	__udiv_qrnnd
 __udiv_qrnnd:
 __udiv_qrnnd..ng:
        .frame $30,0,$26,0
        .prologue 0
 #define cnt	$2
@ -39,9 +37,9 @@ __udiv_qrnnd..ng:
 #define qb	$20
 	ldiq	cnt,16
-	blt	d,Largedivisor
+	blt	d,.Largedivisor
-Loop1:	cmplt	n0,0,tmp
+.Loop1:	cmplt	n0,0,tmp
 	addq	n1,n1,n1
 	bis	n1,tmp,n1
 	addq	n0,n0,n0
@ -74,12 +72,12 @@ Loop1:	cmplt	n0,0,tmp
 	cmovne	qb,tmp,n1
 	bis	n0,qb,n0
 	subq	cnt,1,cnt
-	bgt	cnt,Loop1
+	bgt	cnt,.Loop1
 	stq	n1,0(rem_ptr)
 	bis	$31,n0,$0
 	ret	$31,($26),1
-Largedivisor:
+.Largedivisor:
 	and	n0,1,$4
 	srl	n0,1,n0
@ -91,7 +89,7 @@ Largedivisor:
 	srl	d,1,$5
 	addq	$5,$6,$5
-Loop2:	cmplt	n0,0,tmp
+.Loop2:	cmplt	n0,0,tmp
 	addq	n1,n1,n1
 	bis	n1,tmp,n1
 	addq	n0,n0,n0
@ -124,27 +122,27 @@ Loop2:	cmplt	n0,0,tmp
 	cmovne	qb,tmp,n1
 	bis	n0,qb,n0
 	subq	cnt,1,cnt
-	bgt	cnt,Loop2
+	bgt	cnt,.Loop2
 	addq	n1,n1,n1
 	addq	$4,n1,n1
-	bne	$6,Odd
+	bne	$6,.LOdd
 	stq	n1,0(rem_ptr)
 	bis	$31,n0,$0
 	ret	$31,($26),1
-Odd:
+.LOdd:
 	/* q' in n0. r' in n1 */
 	addq	n1,n0,n1
 	cmpult	n1,n0,tmp	# tmp := carry from addq
-	beq	tmp,LLp6
+	beq	tmp,.LLp6
 	addq	n0,1,n0
 	subq	n1,d,n1
-LLp6:	cmpult	n1,d,tmp
+.LLp6:	cmpult	n1,d,tmp
-	bne	tmp,LLp7
+	bne	tmp,.LLp7
 	addq	n0,1,n0
 	subq	n1,d,n1
-LLp7:
+.LLp7:
 	stq	n1,0(rem_ptr)
 	bis	$31,n0,$0
 	ret	$31,($26),1
--- a/sysdeps/m68k/add_n.S
+++ b/sysdeps/m68k/add_n.S
@ -1,7 +1,7 @@
 /* mc68020 __mpn_add_n -- Add two limb vectors of the same length > 0 and store
   sum in a third limb vector.
-Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc.
 This file is part of the GNU MP Library.
@ -27,50 +27,53 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
  size		(sp + 12)
 */
 #include "sysdep.h"
 #include "asm-syntax.h"
 	TEXT
 	ALIGN
-	GLOBL	___mpn_add_n
+	GLOBL	C_SYMBOL_NAME(__mpn_add_n)
-LAB(___mpn_add_n)
+C_SYMBOL_NAME(__mpn_add_n:)
 PROLOG(__mpn_add_n)
 /* Save used registers on the stack.  */
-	INSN2(move,l	,MEM_PREDEC(sp),d2)
+	movel	R(d2),MEM_PREDEC(sp)
-	INSN2(move,l	,MEM_PREDEC(sp),a2)
+	movel	R(a2),MEM_PREDEC(sp)
 /* Copy the arguments to registers.  Better use movem?  */
-	INSN2(move,l	,a2,MEM_DISP(sp,12))
+	movel	MEM_DISP(sp,12),R(a2)
-	INSN2(move,l	,a0,MEM_DISP(sp,16))
+	movel	MEM_DISP(sp,16),R(a0)
-	INSN2(move,l	,a1,MEM_DISP(sp,20))
+	movel	MEM_DISP(sp,20),R(a1)
-	INSN2(move,l	,d2,MEM_DISP(sp,24))
+	movel	MEM_DISP(sp,24),R(d2)
-	INSN2(eor,w	,d2,#1)
+	eorw	#1,R(d2)
-	INSN2(lsr,l	,d2,#1)
+	lsrl	#1,R(d2)
-	bcc L1
+	bcc	L(L1)
-	INSN2(subq,l	,d2,#1)		/* clears cy as side effect */
+	subql	#1,R(d2)	/* clears cy as side effect */
-LAB(Loop)
+L(Loop:)
-	INSN2(move,l	,d0,MEM_POSTINC(a0))
+	movel	MEM_POSTINC(a0),R(d0)
-	INSN2(move,l	,d1,MEM_POSTINC(a1))
+	movel	MEM_POSTINC(a1),R(d1)
-	INSN2(addx,l	,d0,d1)
+	addxl	R(d1),R(d0)
-	INSN2(move,l	,MEM_POSTINC(a2),d0)
+	movel	R(d0),MEM_POSTINC(a2)
-LAB(L1)	INSN2(move,l	,d0,MEM_POSTINC(a0))
+L(L1:)	movel	MEM_POSTINC(a0),R(d0)
-	INSN2(move,l	,d1,MEM_POSTINC(a1))
+	movel	MEM_POSTINC(a1),R(d1)
-	INSN2(addx,l	,d0,d1)
+	addxl	R(d1),R(d0)
-	INSN2(move,l	,MEM_POSTINC(a2),d0)
+	movel	R(d0),MEM_POSTINC(a2)
-	dbf d2,Loop			/* loop until 16 lsb of %4 == -1 */
+	dbf	R(d2),L(Loop)		/* loop until 16 lsb of %4 == -1 */
-	INSN2(subx,l	,d0,d0)		/* d0 <= -cy; save cy as 0 or -1 in d0 */
+	subxl	R(d0),R(d0)	/* d0 <= -cy; save cy as 0 or -1 in d0 */
-	INSN2(sub,l	,d2,#0x10000)
+	subl	#0x10000,R(d2)
-	bcs L2
+	bcs	L(L2)
-	INSN2(add,l	,d0,d0)		/* restore cy */
+	addl	R(d0),R(d0)	/* restore cy */
-	bra Loop
+	bra	L(Loop)
-LAB(L2)
+L(L2:)
-	INSN1(neg,l	,d0)
+	negl	R(d0)
 /* Restore used registers from stack frame.  */
-	INSN2(move,l	,a2,MEM_POSTINC(sp))
+	movel	MEM_POSTINC(sp),R(a2)
-	INSN2(move,l	,d2,MEM_POSTINC(sp))
+	movel	MEM_POSTINC(sp),R(d2)
 	rts
 EPILOG(__mpn_add_n)
--- a/sysdeps/m68k/lshift.S
+++ b/sysdeps/m68k/lshift.S
@ -0,0 +1,150 @@
 /* mc68020 __mpn_lshift -- Shift left a low-level natural-number integer.
 Copyright (C) 1996 Free Software Foundation, Inc.
 This file is part of the GNU MP Library.
 The GNU MP Library is free software; you can redistribute it and/or modify
 it under the terms of the GNU Library General Public License as published by
 the Free Software Foundation; either version 2 of the License, or (at your
 option) any later version.
 The GNU MP Library is distributed in the hope that it will be useful, but
 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
 License for more details.
 You should have received a copy of the GNU Library General Public License
 along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
 the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
 /*
  INPUT PARAMETERS
  res_ptr	(sp + 4)
  s_ptr		(sp + 8)
  s_size	(sp + 12)
  cnt		(sp + 16)
 */
 #include "sysdep.h"
 #include "asm-syntax.h"
 #define res_ptr a1
 #define s_ptr a0
 #define s_size d6
 #define cnt d4
 	TEXT
 	ALIGN
 	GLOBL	C_SYMBOL_NAME(__mpn_lshift)
 C_SYMBOL_NAME(__mpn_lshift:)
 PROLOG(__mpn_lshift)
 /* Shift the s_size-limb vector at s_ptr left by cnt bits and store the
    result at res_ptr.  d0 holds the bits shifted out of the most
    significant limb when the function returns.
    Two code paths:
      Lspecial - cnt == 1 and either res_ptr <= s_ptr or
		 res_ptr >= s_ptr + s_size: add-with-extend loop running
		 from the least significant limb upwards.
      Lnormal  - everything else: shift/or loop running from the most
		 significant limb downwards.  */
 /* Save used registers on the stack.  */
 	moveml	R(d2)-R(d6)/R(a2),MEM_PREDEC(sp)
 /* Copy the arguments to registers.  The six registers saved above take
    24 bytes, so the first argument is now at sp + 28.  */
 	movel	MEM_DISP(sp,28),R(res_ptr)
 	movel	MEM_DISP(sp,32),R(s_ptr)
 	movel	MEM_DISP(sp,36),R(s_size)
 	movel	MEM_DISP(sp,40),R(cnt)
 	moveql	#1,R(d5)
 	cmpl	R(d5),R(cnt)
 	bne	L(Lnormal)		/* only cnt == 1 can use Lspecial */
 	cmpl	R(s_ptr),R(res_ptr)
 	bls	L(Lspecial)		/* jump if s_ptr >= res_ptr */
 #if (defined (__mc68020__) || defined (__NeXT__) || defined(mc68020))
 	lea	MEM_INDX1(s_ptr,s_size,l,4),R(a2)
 #else /* not mc68020 */
 	movel	R(s_size),R(d0)
 	asll	#2,R(d0)		/* d0 = s_size * 4 = length in bytes */
 	lea	MEM_INDX(s_ptr,d0,l),R(a2)
 #endif
 	cmpl	R(res_ptr),R(a2)
 	bls	L(Lspecial)		/* jump if res_ptr >= s_ptr + s_size */
 L(Lnormal:)
 	moveql	#32,R(d5)
 	subl	R(cnt),R(d5)		/* d5 = 32 - cnt */
 /* Point s_ptr and res_ptr just past the most significant limb.  */
 #if (defined (__mc68020__) || defined (__NeXT__) || defined(mc68020))
 	lea	MEM_INDX1(s_ptr,s_size,l,4),R(s_ptr)
 	lea	MEM_INDX1(res_ptr,s_size,l,4),R(res_ptr)
 #else /* not mc68020 */
 	movel	R(s_size),R(d0)
 	asll	#2,R(d0)		/* d0 = s_size * 4 = length in bytes */
 	addl	R(d0),R(s_ptr)		/* advance by bytes, not by limb count */
 	addl	R(d0),R(res_ptr)
 #endif
 	movel	MEM_PREDEC(s_ptr),R(d2)
 	movel	R(d2),R(d0)
 	lsrl	R(d5),R(d0)		/* compute carry limb */
 	lsll	R(cnt),R(d2)
 	movel	R(d2),R(d1)
 	subql	#1,R(s_size)
 	beq	L(Lend)			/* single limb: only the store remains */
 	lsrl	#1,R(s_size)		/* loop is unrolled twice */
 	bcs	L(L1)			/* odd count: enter at the second half */
 	subql	#1,R(s_size)
 L(Loop:)
 	movel	MEM_PREDEC(s_ptr),R(d2)
 	movel	R(d2),R(d3)
 	lsrl	R(d5),R(d3)
 	orl	R(d3),R(d1)
 	movel	R(d1),MEM_PREDEC(res_ptr)
 	lsll	R(cnt),R(d2)
 L(L1:)
 	movel	MEM_PREDEC(s_ptr),R(d1)
 	movel	R(d1),R(d3)
 	lsrl	R(d5),R(d3)
 	orl	R(d3),R(d2)
 	movel	R(d2),MEM_PREDEC(res_ptr)
 	lsll	R(cnt),R(d1)
 	dbf	R(s_size),L(Loop)	/* dbf counts in the low 16 bits only */
 	subl	#0x10000,R(s_size)	/* step the high 16 bits of the count */
 	bcc	L(Loop)
 L(Lend:)
 	movel	R(d1),MEM_PREDEC(res_ptr) /* store least significant limb */
 /* Restore used registers from stack frame.  */
 	moveml	MEM_POSTINC(sp),R(d2)-R(d6)/R(a2)
 	rts
 /* Here we loop from the least significant end of the arrays.  Since the
    function is documented to work for overlapping source and destination,
    that is only permissible when the operands do not overlap or when
    res_ptr <= s_ptr; the checks before Lnormal guarantee this.  */
 L(Lspecial:)
 	clrl	R(d0)			/* initialize carry */
 	eorw	#1,R(s_size)
 	lsrl	#1,R(s_size)		/* loop is unrolled twice */
 	bcc	L(LL1)
 	subql	#1,R(s_size)
 L(LLoop:)
 	movel	MEM_POSTINC(s_ptr),R(d2)
 	addxl	R(d2),R(d2)		/* limb * 2 + incoming carry */
 	movel	R(d2),MEM_POSTINC(res_ptr)
 L(LL1:)
 	movel	MEM_POSTINC(s_ptr),R(d2)
 	addxl	R(d2),R(d2)		/* limb * 2 + incoming carry */
 	movel	R(d2),MEM_POSTINC(res_ptr)
 	dbf	R(s_size),L(LLoop)	/* dbf counts in the low 16 bits only */
 	addxl	R(d0),R(d0)		/* save cy in lsb */
 	subl	#0x10000,R(s_size)	/* step the high 16 bits of the count */
 	bcs	L(LLend)
 	lsrl	#1,R(d0)		/* restore cy */
 	bra	L(LLoop)
 L(LLend:)
 /* Restore used registers from stack frame.  */
 	moveml	MEM_POSTINC(sp),R(d2)-R(d6)/R(a2)
 	rts
 EPILOG(__mpn_lshift)
--- a/sysdeps/m68k/m68020/addmul_1.S
+++ b/sysdeps/m68k/m68020/addmul_1.S
@ -1,7 +1,7 @@
 /* mc68020 __mpn_addmul_1 -- Multiply a limb vector with a limb and add
   the result to a second limb vector.
-Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc.
 This file is part of the GNU MP Library.
@ -23,58 +23,61 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
  INPUT PARAMETERS
  res_ptr	(sp + 4)
  s1_ptr	(sp + 8)
-  size		(sp + 12)
+  s1_size	(sp + 12)
  s2_limb	(sp + 16)
 */
 #include "sysdep.h"
 #include "asm-syntax.h"
 	TEXT
 	ALIGN
-	GLOBL	___mpn_addmul_1
+	GLOBL	C_SYMBOL_NAME(__mpn_addmul_1)
-LAB(___mpn_addmul_1)
+C_SYMBOL_NAME(__mpn_addmul_1:)
 PROLOG(__mpn_addmul_1)
 #define res_ptr a0
 #define s1_ptr a1
-#define size d2
+#define s1_size d2
 #define s2_limb d4
 /* Save used registers on the stack.  */
-	INSN2(movem,l	,MEM_PREDEC(sp),d2-d5)
+	moveml	R(d2)-R(d5),MEM_PREDEC(sp)
 /* Copy the arguments to registers.  Better use movem?  */
-	INSN2(move,l	,res_ptr,MEM_DISP(sp,20))
+	movel	MEM_DISP(sp,20),R(res_ptr)
-	INSN2(move,l	,s1_ptr,MEM_DISP(sp,24))
+	movel	MEM_DISP(sp,24),R(s1_ptr)
-	INSN2(move,l	,size,MEM_DISP(sp,28))
+	movel	MEM_DISP(sp,28),R(s1_size)
-	INSN2(move,l	,s2_limb,MEM_DISP(sp,32))
+	movel	MEM_DISP(sp,32),R(s2_limb)
-	INSN2(eor,w	,size,#1)
+	eorw	#1,R(s1_size)
-	INSN1(clr,l	,d1)
+	clrl	R(d1)
-	INSN1(clr,l	,d5)
+	clrl	R(d5)
-	INSN2(lsr,l	,size,#1)
+	lsrl	#1,R(s1_size)
-	bcc	L1
+	bcc	L(L1)
-	INSN2(subq,l	,size,#1)
+	subql	#1,R(s1_size)
-	INSN2(sub,l	,d0,d0)		/* (d0,cy) <= (0,0) */
+	subl	R(d0),R(d0)		/* (d0,cy) <= (0,0) */
-LAB(Loop)
+L(Loop:)
-	INSN2(move,l	,d3,MEM_POSTINC(s1_ptr))
+	movel	MEM_POSTINC(s1_ptr),R(d3)
-	INSN2(mulu,l	,d1:d3,s2_limb)
+	mulul	R(s2_limb),R(d1):R(d3)
-	INSN2(addx,l	,d3,d0)
+	addxl	R(d0),R(d3)
-	INSN2(addx,l	,d1,d5)
+	addxl	R(d5),R(d1)
-	INSN2(add,l	,MEM_POSTINC(res_ptr),d3)
+	addl	R(d3),MEM_POSTINC(res_ptr)
-LAB(L1)	INSN2(move,l	,d3,MEM_POSTINC(s1_ptr))
+L(L1:)	movel	MEM_POSTINC(s1_ptr),R(d3)
-	INSN2(mulu,l	,d0:d3,s2_limb)
+	mulul	R(s2_limb),R(d0):R(d3)
-	INSN2(addx,l	,d3,d1)
+	addxl	R(d1),R(d3)
-	INSN2(addx,l	,d0,d5)
+	addxl	R(d5),R(d0)
-	INSN2(add,l	,MEM_POSTINC(res_ptr),d3)
+	addl	R(d3),MEM_POSTINC(res_ptr)
-	dbf	size,Loop
+	dbf	R(s1_size),L(Loop)
-	INSN2(addx,l	,d0,d5)
+	addxl	R(d5),R(d0)
-	INSN2(sub,l	,size,#0x10000)
+	subl	#0x10000,R(s1_size)
-	bcc	Loop
+	bcc	L(Loop)
 /* Restore used registers from stack frame.  */
-	INSN2(movem,l	,d2-d5,MEM_POSTINC(sp))
+	moveml	MEM_POSTINC(sp),R(d2)-R(d5)
 	rts
 EPILOG(__mpn_addmul_1)
--- a/sysdeps/m68k/m68020/mul_1.S
+++ b/sysdeps/m68k/m68020/mul_1.S
@ -1,7 +1,7 @@
 /* mc68020 __mpn_mul_1 -- Multiply a limb vector with a limb and store
   the result in a second limb vector.
-Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc.
 This file is part of the GNU MP Library.
@ -23,65 +23,68 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
  INPUT PARAMETERS
  res_ptr	(sp + 4)
  s1_ptr	(sp + 8)
-  size		(sp + 12)
+  s1_size	(sp + 12)
  s2_limb	(sp + 16)
 */
 #include "sysdep.h"
 #include "asm-syntax.h"
 	TEXT
 	ALIGN
-	GLOBL	___mpn_mul_1
+	GLOBL	C_SYMBOL_NAME(__mpn_mul_1)
-LAB(___mpn_mul_1)
+C_SYMBOL_NAME(__mpn_mul_1:)
 PROLOG(__mpn_mul_1)
 #define res_ptr a0
 #define s1_ptr a1
-#define size d2
+#define s1_size d2
 #define s2_limb d4
 /* Save used registers on the stack.  */
-	INSN2(movem,l	,MEM_PREDEC(sp),d2-d4)
+	moveml	R(d2)-R(d4),MEM_PREDEC(sp)
 #if 0
-	INSN2(move,l	,MEM_PREDEC(sp),d2)
+	movel	R(d2),MEM_PREDEC(sp)
-	INSN2(move,l	,MEM_PREDEC(sp),d3)
+	movel	R(d3),MEM_PREDEC(sp)
-	INSN2(move,l	,MEM_PREDEC(sp),d4)
+	movel	R(d4),MEM_PREDEC(sp)
 #endif
 /* Copy the arguments to registers.  Better use movem?  */
-	INSN2(move,l	,res_ptr,MEM_DISP(sp,16))
+	movel	MEM_DISP(sp,16),R(res_ptr)
-	INSN2(move,l	,s1_ptr,MEM_DISP(sp,20))
+	movel	MEM_DISP(sp,20),R(s1_ptr)
-	INSN2(move,l	,size,MEM_DISP(sp,24))
+	movel	MEM_DISP(sp,24),R(s1_size)
-	INSN2(move,l	,s2_limb,MEM_DISP(sp,28))
+	movel	MEM_DISP(sp,28),R(s2_limb)
-	INSN2(eor,w	,size,#1)
+	eorw	#1,R(s1_size)
-	INSN1(clr,l	,d1)
+	clrl	R(d1)
-	INSN2(lsr,l	,size,#1)
+	lsrl	#1,R(s1_size)
-	bcc	L1
+	bcc	L(L1)
-	INSN2(subq,l	,size,#1)
+	subql	#1,R(s1_size)
-	INSN2(sub,l	,d0,d0)		/* (d0,cy) <= (0,0) */
+	subl	R(d0),R(d0)	/* (d0,cy) <= (0,0) */
-LAB(Loop)
+L(Loop:)
-	INSN2(move,l	,d3,MEM_POSTINC(s1_ptr))
+	movel	MEM_POSTINC(s1_ptr),R(d3)
-	INSN2(mulu,l	,d1:d3,s2_limb)
+	mulul	R(s2_limb),R(d1):R(d3)
-	INSN2(addx,l	,d3,d0)
+	addxl	R(d0),R(d3)
-	INSN2(move,l	,MEM_POSTINC(res_ptr),d3)
+	movel	R(d3),MEM_POSTINC(res_ptr)
-LAB(L1)	INSN2(move,l	,d3,MEM_POSTINC(s1_ptr))
+L(L1:)	movel	MEM_POSTINC(s1_ptr),R(d3)
-	INSN2(mulu,l	,d0:d3,s2_limb)
+	mulul	R(s2_limb),R(d0):R(d3)
-	INSN2(addx,l	,d3,d1)
+	addxl	R(d1),R(d3)
-	INSN2(move,l	,MEM_POSTINC(res_ptr),d3)
+	movel	R(d3),MEM_POSTINC(res_ptr)
-	dbf	size,Loop
+	dbf	R(s1_size),L(Loop)
-	INSN1(clr,l	,d3)
+	clrl	R(d3)
-	INSN2(addx,l	,d0,d3)
+	addxl	R(d3),R(d0)
-	INSN2(sub,l	,size,#0x10000)
+	subl	#0x10000,R(s1_size)
-	bcc	Loop
+	bcc	L(Loop)
 /* Restore used registers from stack frame.  */
-	INSN2(movem,l	,d2-d4,MEM_POSTINC(sp))
+	moveml	MEM_POSTINC(sp),R(d2)-R(d4)
 #if 0
-	INSN2(move,l	,d4,MEM_POSTINC(sp))
+	movel	MEM_POSTINC(sp),R(d4)
-	INSN2(move,l	,d3,MEM_POSTINC(sp))
+	movel	MEM_POSTINC(sp),R(d3)
-	INSN2(move,l	,d2,MEM_POSTINC(sp))
+	movel	MEM_POSTINC(sp),R(d2)
 #endif
 	rts
 EPILOG(__mpn_mul_1)
--- a/sysdeps/m68k/m68020/submul_1.S
+++ b/sysdeps/m68k/m68020/submul_1.S
@ -1,7 +1,7 @@
 /* mc68020 __mpn_submul_1 -- Multiply a limb vector with a limb and subtract
   the result from a second limb vector.
-Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc.
 This file is part of the GNU MP Library.
@ -23,58 +23,61 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
  INPUT PARAMETERS
  res_ptr	(sp + 4)
  s1_ptr	(sp + 8)
-  size		(sp + 12)
+  s1_size	(sp + 12)
  s2_limb	(sp + 16)
 */
 #include "sysdep.h"
 #include "asm-syntax.h"
 	TEXT
 	ALIGN
-	GLOBL	___mpn_submul_1
+	GLOBL	C_SYMBOL_NAME(__mpn_submul_1)
-LAB(___mpn_submul_1)
+C_SYMBOL_NAME(__mpn_submul_1:)
 PROLOG(__mpn_submul_1)
 #define res_ptr a0
 #define s1_ptr a1
-#define size d2
+#define s1_size d2
 #define s2_limb d4
 /* Save used registers on the stack.  */
-	INSN2(movem,l	,MEM_PREDEC(sp),d2-d5)
+	moveml	R(d2)-R(d5),MEM_PREDEC(sp)
 /* Copy the arguments to registers.  Better use movem?  */
-	INSN2(move,l	,res_ptr,MEM_DISP(sp,20))
+	movel	MEM_DISP(sp,20),R(res_ptr)
-	INSN2(move,l	,s1_ptr,MEM_DISP(sp,24))
+	movel	MEM_DISP(sp,24),R(s1_ptr)
-	INSN2(move,l	,size,MEM_DISP(sp,28))
+	movel	MEM_DISP(sp,28),R(s1_size)
-	INSN2(move,l	,s2_limb,MEM_DISP(sp,32))
+	movel	MEM_DISP(sp,32),R(s2_limb)
-	INSN2(eor,w	,size,#1)
+	eorw	#1,R(s1_size)
-	INSN1(clr,l	,d1)
+	clrl	R(d1)
-	INSN1(clr,l	,d5)
+	clrl	R(d5)
-	INSN2(lsr,l	,size,#1)
+	lsrl	#1,R(s1_size)
-	bcc	L1
+	bcc	L(L1)
-	INSN2(subq,l	,size,#1)
+	subql	#1,R(s1_size)
-	INSN2(sub,l	,d0,d0)		/* (d0,cy) <= (0,0) */
+	subl	R(d0),R(d0)	/* (d0,cy) <= (0,0) */
-LAB(Loop)
+L(Loop:)
-	INSN2(move,l	,d3,MEM_POSTINC(s1_ptr))
+	movel	MEM_POSTINC(s1_ptr),R(d3)
-	INSN2(mulu,l	,d1:d3,s2_limb)
+	mulul	R(s2_limb),R(d1):R(d3)
-	INSN2(addx,l	,d3,d0)
+	addxl	R(d0),R(d3)
-	INSN2(addx,l	,d1,d5)
+	addxl	R(d5),R(d1)
-	INSN2(sub,l	,MEM_POSTINC(res_ptr),d3)
+	subl	R(d3),MEM_POSTINC(res_ptr)
-LAB(L1)	INSN2(move,l	,d3,MEM_POSTINC(s1_ptr))
+L(L1:)	movel	MEM_POSTINC(s1_ptr),R(d3)
-	INSN2(mulu,l	,d0:d3,s2_limb)
+	mulul	R(s2_limb),R(d0):R(d3)
-	INSN2(addx,l	,d3,d1)
+	addxl	R(d1),R(d3)
-	INSN2(addx,l	,d0,d5)
+	addxl	R(d5),R(d0)
-	INSN2(sub,l	,MEM_POSTINC(res_ptr),d3)
+	subl	R(d3),MEM_POSTINC(res_ptr)
-	dbf	size,Loop
+	dbf	R(s1_size),L(Loop)
-	INSN2(addx,l	,d0,d5)
+	addxl	R(d5),R(d0)
-	INSN2(sub,l	,size,#0x10000)
+	subl	#0x10000,R(s1_size)
-	bcc	Loop
+	bcc	L(Loop)
 /* Restore used registers from stack frame.  */
-	INSN2(movem,l	,d2-d5,MEM_POSTINC(sp))
+	moveml	MEM_POSTINC(sp),R(d2)-R(d5)
 	rts
 EPILOG(__mpn_submul_1)
--- a/sysdeps/m68k/rshift.S
+++ b/sysdeps/m68k/rshift.S
@ -0,0 +1,149 @@
 /* mc68020 __mpn_rshift -- Shift right a low-level natural-number integer.
 Copyright (C) 1996 Free Software Foundation, Inc.
 This file is part of the GNU MP Library.
 The GNU MP Library is free software; you can redistribute it and/or modify
 it under the terms of the GNU Library General Public License as published by
 the Free Software Foundation; either version 2 of the License, or (at your
 option) any later version.
 The GNU MP Library is distributed in the hope that it will be useful, but
 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
 License for more details.
 You should have received a copy of the GNU Library General Public License
 along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
 the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
 /*
  INPUT PARAMETERS
  res_ptr	(sp + 4)
  s_ptr		(sp + 8)
   s_size	(sp + 12)
   cnt		(sp + 16)
 */
 #include "sysdep.h"
 #include "asm-syntax.h"
 /* Register roles.  d0 holds the carry limb when the function returns;
    d1-d3 and d5 are scratch, a2 is a scratch address register.  */
 #define res_ptr a1
 #define s_ptr a0
 #define s_size d6
 #define cnt d4
 	TEXT
 	ALIGN
 	GLOBL	C_SYMBOL_NAME(__mpn_rshift)
 C_SYMBOL_NAME(__mpn_rshift:)
 PROLOG(__mpn_rshift)
 /* Shift the s_size-limb vector at s_ptr right by cnt bits and store the
    result at res_ptr.  The bits shifted out of the least significant limb
    are left in the most significant bits of d0.

    A one-bit shift whose operands may be walked from the most significant
    limb downwards is handled by L(Lspecial) using rotate-through-X; every
    other case uses the general two-shift loop at L(Lnormal), which walks
    from the least significant limb upwards.  */
 /* Save used registers on the stack.  */
 	moveml	R(d2)-R(d6)/R(a2),MEM_PREDEC(sp)
 /* Copy the arguments to registers.  The six saved registers occupy 24
    bytes, so the first argument is now found at sp + 28.  */
 	movel	MEM_DISP(sp,28),R(res_ptr)
 	movel	MEM_DISP(sp,32),R(s_ptr)
 	movel	MEM_DISP(sp,36),R(s_size)
 	movel	MEM_DISP(sp,40),R(cnt)
 /* Only cnt == 1 is a candidate for the special loop.  */
 	moveql	#1,R(d5)
 	cmpl	R(d5),R(cnt)
 	bne	L(Lnormal)
 	cmpl	R(res_ptr),R(s_ptr)
 	bls	L(Lspecial)		/* jump if res_ptr >= s_ptr */
 /* a2 = res_ptr + 4 * s_size, the first address past the destination.  */
 #if (defined (__mc68020__) || defined (__NeXT__) || defined(mc68020))
 	lea	MEM_INDX1(res_ptr,s_size,l,4),R(a2)
 #else /* not mc68020 */
 	movel	R(s_size),R(d0)
 	asll	#2,R(d0)
 	lea	MEM_INDX(res_ptr,d0,l),R(a2)
 #endif
 	cmpl	R(s_ptr),R(a2)
 	bls	L(Lspecial)		/* jump if s_ptr >= res_ptr + s_size */
 L(Lnormal:)
 /* d5 = 32 - cnt is the left-shift count that extracts the bits moving
    down into the next lower limb.  */
 	moveql	#32,R(d5)
 	subl	R(cnt),R(d5)
 	movel	MEM_POSTINC(s_ptr),R(d2)
 	movel	R(d2),R(d0)
 	lsll	R(d5),R(d0)		/* compute carry limb */
 	lsrl	R(cnt),R(d2)
 	movel	R(d2),R(d1)		/* pending partial limb in both d1 and d2 */
 	subql	#1,R(s_size)
 	beq	L(Lend)			/* single limb: just store it */
 /* The loop is unrolled twice; enter at L(L1) when an odd number of limbs
    remains, otherwise at L(Loop) with the dbf count pre-decremented.  */
 	lsrl	#1,R(s_size)
 	bcs	L(L1)
 	subql	#1,R(s_size)
 L(Loop:)
 	movel	MEM_POSTINC(s_ptr),R(d2)
 	movel	R(d2),R(d3)
 	lsll	R(d5),R(d3)
 	orl	R(d3),R(d1)
 	movel	R(d1),MEM_POSTINC(res_ptr)
 	lsrl	R(cnt),R(d2)
 L(L1:)
 	movel	MEM_POSTINC(s_ptr),R(d1)
 	movel	R(d1),R(d3)
 	lsll	R(d5),R(d3)
 	orl	R(d3),R(d2)
 	movel	R(d2),MEM_POSTINC(res_ptr)
 	lsrl	R(cnt),R(d1)
 	dbf	R(s_size),L(Loop)
 /* dbf only counts in the low 16 bits; step the high 16 bits by hand and
    keep looping until they borrow.  */
 	subl	#0x10000,R(s_size)
 	bcc	L(Loop)
 L(Lend:)
 	movel	R(d1),MEM(res_ptr) /* store most significant limb */
 /* Restore used registers from stack frame.  */
 	moveml	MEM_POSTINC(sp),R(d2)-R(d6)/R(a2)
 	rts
 /* One-bit shift.  We loop from the most significant end of the arrays.
    That is only permissible when no destination limb is written before the
    source limb at the same or a lower address has been read, and the
    function is documented to work for overlapping source and destination;
    hence the pointer tests made before branching here.  */
 L(Lspecial:)
 /* Advance both pointers past the end of their vectors.  */
 #if (defined (__mc68020__) || defined (__NeXT__) || defined(mc68020))
 	lea	MEM_INDX1(s_ptr,s_size,l,4),R(s_ptr)
 	lea	MEM_INDX1(res_ptr,s_size,l,4),R(res_ptr)
 #else /* not mc68020 */
 	movel	R(s_size),R(d0)
 	asll	#2,R(d0)		/* d0 = length of the vectors in bytes */
 	addl	R(d0),R(s_ptr)		/* add the scaled length, not the limb count */
 	addl	R(d0),R(res_ptr)
 #endif
 	clrl	R(d0)			/* initialize carry */
 /* Same twice-unrolled entry logic as the general loop.  */
 	eorw	#1,R(s_size)
 	lsrl	#1,R(s_size)
 	bcc	L(LL1)
 	subql	#1,R(s_size)
 L(LLoop:)
 	movel	MEM_PREDEC(s_ptr),R(d2)
 	roxrl	#1,R(d2)		/* X enters at the top, low bit leaves into X */
 	movel	R(d2),MEM_PREDEC(res_ptr)
 L(LL1:)
 	movel	MEM_PREDEC(s_ptr),R(d2)
 	roxrl	#1,R(d2)
 	movel	R(d2),MEM_PREDEC(res_ptr)
 	dbf	R(s_size),L(LLoop)
 	roxrl	#1,R(d0)		/* save cy in msb */
 	subl	#0x10000,R(s_size)
 	bcs	L(LLend)
 	addl	R(d0),R(d0)		/* restore cy */
 	bra	L(LLoop)
 L(LLend:)
 /* Restore used registers from stack frame.  */
 	moveml	MEM_POSTINC(sp),R(d2)-R(d6)/R(a2)
 	rts
 EPILOG(__mpn_rshift)
--- a/sysdeps/m68k/sub_n.S
+++ b/sysdeps/m68k/sub_n.S
@ -1,7 +1,7 @@
 /* mc68020 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
   store difference in a third limb vector.
-Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc.
 This file is part of the GNU MP Library.
@ -27,50 +27,53 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
  size		(sp + 12)
 */
 #include "sysdep.h"
 #include "asm-syntax.h"
 	TEXT
 	ALIGN
-	GLOBL	___mpn_sub_n
+	GLOBL	C_SYMBOL_NAME(__mpn_sub_n)
-LAB(___mpn_sub_n)
+C_SYMBOL_NAME(__mpn_sub_n:)
 PROLOG(__mpn_sub_n)
 /* Save used registers on the stack.  */
-	INSN2(move,l	,MEM_PREDEC(sp),d2)
+	movel	R(d2),MEM_PREDEC(sp)
-	INSN2(move,l	,MEM_PREDEC(sp),a2)
+	movel	R(a2),MEM_PREDEC(sp)
 /* Copy the arguments to registers.  Better use movem?  */
-	INSN2(move,l	,a2,MEM_DISP(sp,12))
+	movel	MEM_DISP(sp,12),R(a2)
-	INSN2(move,l	,a0,MEM_DISP(sp,16))
+	movel	MEM_DISP(sp,16),R(a0)
-	INSN2(move,l	,a1,MEM_DISP(sp,20))
+	movel	MEM_DISP(sp,20),R(a1)
-	INSN2(move,l	,d2,MEM_DISP(sp,24))
+	movel	MEM_DISP(sp,24),R(d2)
-	INSN2(eor,w	,d2,#1)
+	eorw	#1,R(d2)
-	INSN2(lsr,l	,d2,#1)
+	lsrl	#1,R(d2)
-	bcc L1
+	bcc	L(L1)
-	INSN2(subq,l	,d2,#1)		/* clears cy as side effect */
+	subql	#1,R(d2)	/* clears cy as side effect */
-LAB(Loop)
+L(Loop:)
-	INSN2(move,l	,d0,MEM_POSTINC(a0))
+	movel	MEM_POSTINC(a0),R(d0)
-	INSN2(move,l	,d1,MEM_POSTINC(a1))
+	movel	MEM_POSTINC(a1),R(d1)
-	INSN2(subx,l	,d0,d1)
+	subxl	R(d1),R(d0)
-	INSN2(move,l	,MEM_POSTINC(a2),d0)
+	movel	R(d0),MEM_POSTINC(a2)
-LAB(L1)	INSN2(move,l	,d0,MEM_POSTINC(a0))
+L(L1:)	movel	MEM_POSTINC(a0),R(d0)
-	INSN2(move,l	,d1,MEM_POSTINC(a1))
+	movel	MEM_POSTINC(a1),R(d1)
-	INSN2(subx,l	,d0,d1)
+	subxl	R(d1),R(d0)
-	INSN2(move,l	,MEM_POSTINC(a2),d0)
+	movel	R(d0),MEM_POSTINC(a2)
-	dbf d2,Loop			/* loop until 16 lsb of %4 == -1 */
+	dbf	R(d2),L(Loop)		/* loop until 16 lsb of %4 == -1 */
-	INSN2(subx,l	,d0,d0)		/* d0 <= -cy; save cy as 0 or -1 in d0 */
+	subxl	R(d0),R(d0)	/* d0 <= -cy; save cy as 0 or -1 in d0 */
-	INSN2(sub,l	,d2,#0x10000)
+	subl	#0x10000,R(d2)
-	bcs L2
+	bcs	L(L2)
-	INSN2(add,l	,d0,d0)		/* restore cy */
+	addl	R(d0),R(d0)	/* restore cy */
-	bra Loop
+	bra	L(Loop)
-LAB(L2)
+L(L2:)
-	INSN1(neg,l	,d0)
+	negl	R(d0)
 /* Restore used registers from stack frame.  */
-	INSN2(move,l	,a2,MEM_POSTINC(sp))
+	movel	MEM_POSTINC(sp),R(a2)
-	INSN2(move,l	,d2,MEM_POSTINC(sp))
+	movel	MEM_POSTINC(sp),R(d2)
 	rts
 EPILOG(__mpn_sub_n)
--- a/sysdeps/m88k/add_n.s
+++ b/sysdeps/m88k/add_n.s
@ -1,7 +1,7 @@
 ; mc88100 __mpn_add -- Add two limb vectors of the same length > 0 and store
 ; sum in a third limb vector.
-; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+; Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
 ; This file is part of the GNU MP Library.
--- a/sysdeps/m88k/m88110/add_n.S
+++ b/sysdeps/m88k/m88110/add_n.S
@ -0,0 +1,199 @@
 ; mc88110 __mpn_add_n -- Add two limb vectors of the same length > 0 and store
 ; sum in a third limb vector.
 ; Copyright (C) 1995, 1996 Free Software Foundation, Inc.
 ; This file is part of the GNU MP Library.
 ; The GNU MP Library is free software; you can redistribute it and/or modify
 ; it under the terms of the GNU Library General Public License as published by
 ; the Free Software Foundation; either version 2 of the License, or (at your
 ; option) any later version.
 ; The GNU MP Library is distributed in the hope that it will be useful, but
 ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 ; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
 ; License for more details.
 ; You should have received a copy of the GNU Library General Public License
 ; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
 ; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 ; INPUT PARAMETERS
 #define res_ptr	r2
 #define s1_ptr	r3
 #define s2_ptr	r4
 #define size	r5
 #include "sysdep.h"
 /* __mpn_add_n: add the size-limb vectors at s1_ptr and s2_ptr, store the
    sum at res_ptr and return the carry-out of the most significant limb
    in r2.  Limbs are 4 bytes.

    Bit 2 of a pointer tells whether it is on an 8-byte boundary (assuming
    limb-aligned pointers).  The code picks one of three variants so that
    the double-word accesses (ld.d/st.d) are only applied to pointers that
    have been brought to bit 2 clear:
      V1a  s2_ptr and res_ptr agree in bit 2: ld.d from s2_ptr, st.d to
           res_ptr, single-word loads from s1_ptr.
      V1b  s1_ptr and res_ptr agree in bit 2: swap s1_ptr and s2_ptr and
           reuse V1a.
      V2   otherwise s1_ptr and s2_ptr agree: ld.d from both sources,
           single-word stores to res_ptr.  */
 	text
 	align	16
 	global	C_SYMBOL_NAME(__mpn_add_n)
 C_SYMBOL_NAME(__mpn_add_n):
 	addu.co	 r0,r0,r0		; clear cy flag
 	xor	 r12,s2_ptr,res_ptr
 	bb1	 2,r12,L1		; s2_ptr and res_ptr differ in bit 2
 ; **  V1a  **
 L0:	bb0	 2,res_ptr,L_v1		; branch if res_ptr is aligned
 /* Add least significant limb separately to align res_ptr and s2_ptr */
 	ld	 r10,s1_ptr,0
 	addu	 s1_ptr,s1_ptr,4
 	ld	 r8,s2_ptr,0
 	addu	 s2_ptr,s2_ptr,4
 	subu	 size,size,1
 	addu.co	 r6,r10,r8
 	st	 r6,res_ptr,0
 	addu	 res_ptr,res_ptr,4
 L_v1:	cmp	 r12,size,2
 	bb1	 lt,r12,Lend2		; fewer than 2 limbs left: 0 or 1 via Lend2
 /* Preload the next two limbs of each source; the loops below always keep
    one such pair in flight and Lend1 consumes the last pair.  */
 	ld	 r10,s1_ptr,0
 	ld	 r12,s1_ptr,4
 	ld.d	 r8,s2_ptr,0
 	subu	 size,size,10		; Loop1 runs only while at least 8+2 limbs remain
 	bcnd	 lt0,size,Lfin1
 /* Add blocks of 8 limbs until less than 8 limbs remain */
 	align	 8
 Loop1:	subu	 size,size,8
 	addu.cio r6,r10,r8
 	ld	 r10,s1_ptr,8
 	addu.cio r7,r12,r9
 	ld	 r12,s1_ptr,12
 	ld.d	 r8,s2_ptr,8
 	st.d	 r6,res_ptr,0
 	addu.cio r6,r10,r8
 	ld	 r10,s1_ptr,16
 	addu.cio r7,r12,r9
 	ld	 r12,s1_ptr,20
 	ld.d	 r8,s2_ptr,16
 	st.d	 r6,res_ptr,8
 	addu.cio r6,r10,r8
 	ld	 r10,s1_ptr,24
 	addu.cio r7,r12,r9
 	ld	 r12,s1_ptr,28
 	ld.d	 r8,s2_ptr,24
 	st.d	 r6,res_ptr,16
 	addu.cio r6,r10,r8
 	ld	 r10,s1_ptr,32
 	addu.cio r7,r12,r9
 	ld	 r12,s1_ptr,36
 	addu	 s1_ptr,s1_ptr,32
 	ld.d	 r8,s2_ptr,32
 	addu	 s2_ptr,s2_ptr,32
 	st.d	 r6,res_ptr,24
 	addu	 res_ptr,res_ptr,32
 	bcnd	 ge0,size,Loop1
 Lfin1:	addu	 size,size,8-2		; undo the bias of 8, keep 2 for the pair in flight
 	bcnd	 lt0,size,Lend1
 /* Add blocks of 2 limbs until less than 2 limbs remain */
 Loope1:	addu.cio r6,r10,r8
 	ld	 r10,s1_ptr,8
 	addu.cio r7,r12,r9
 	ld	 r12,s1_ptr,12
 	ld.d	 r8,s2_ptr,8
 	st.d	 r6,res_ptr,0
 	subu	 size,size,2
 	addu	 s1_ptr,s1_ptr,8
 	addu	 s2_ptr,s2_ptr,8
 	addu	 res_ptr,res_ptr,8
 	bcnd	 ge0,size,Loope1
 /* Add the pair that is still in flight; bit 0 of size then tells whether
    one more limb remains.  */
 Lend1:	addu.cio r6,r10,r8
 	addu.cio r7,r12,r9
 	st.d	 r6,res_ptr,0
 	bb0	 0,size,Lret1
 /* Add last limb */
 	ld	 r10,s1_ptr,8
 	ld	 r8,s2_ptr,8
 	addu.cio r6,r10,r8
 	st	 r6,res_ptr,8
 Lret1:	jmp.n	 r1
 	addu.ci	 r2,r0,r0		; return carry-out from most sign. limb
 L1:	xor	 r12,s1_ptr,res_ptr
 	bb1	 2,r12,L2		; s1_ptr and res_ptr differ in bit 2 as well
 ; **  V1b  **
 /* Addition is commutative, so exchange the two source pointers (r12 is
    the temporary) and run the V1a code.  */
 	or	 r12,r0,s2_ptr
 	or	 s2_ptr,r0,s1_ptr
 	or	 s1_ptr,r0,r12
 	br	 L0
 ; **  V2  **
 /* If we come here, the alignment of s1_ptr and res_ptr as well as the
    alignment of s2_ptr and res_ptr differ.  Since there are only two ways
    things can be aligned (that we care about) we now know that the alignment
    of s1_ptr and s2_ptr are the same.  */
 L2:	cmp	 r12,size,1
 	bb1	 eq,r12,Ljone		; a single limb needs no alignment step
 	bb0	 2,s1_ptr,L_v2		; branch if s1_ptr is aligned
 /* Add least significant limb separately to align s1_ptr and s2_ptr */
 	ld	 r10,s1_ptr,0
 	addu	 s1_ptr,s1_ptr,4
 	ld	 r8,s2_ptr,0
 	addu	 s2_ptr,s2_ptr,4
 	subu	 size,size,1
 	addu.co	 r6,r10,r8
 	st	 r6,res_ptr,0
 	addu	 res_ptr,res_ptr,4
 L_v2:	subu	 size,size,8
 	bcnd	 lt0,size,Lfin2
 /* Add blocks of 8 limbs until less than 8 limbs remain */
 	align	 8
 Loop2:	subu	 size,size,8
 	ld.d	 r8,s1_ptr,0
 	ld.d	 r6,s2_ptr,0
 	addu.cio r8,r8,r6
 	st	 r8,res_ptr,0
 	addu.cio r9,r9,r7
 	st	 r9,res_ptr,4
 	ld.d	 r8,s1_ptr,8
 	ld.d	 r6,s2_ptr,8
 	addu.cio r8,r8,r6
 	st	 r8,res_ptr,8
 	addu.cio r9,r9,r7
 	st	 r9,res_ptr,12
 	ld.d	 r8,s1_ptr,16
 	ld.d	 r6,s2_ptr,16
 	addu.cio r8,r8,r6
 	st	 r8,res_ptr,16
 	addu.cio r9,r9,r7
 	st	 r9,res_ptr,20
 	ld.d	 r8,s1_ptr,24
 	ld.d	 r6,s2_ptr,24
 	addu.cio r8,r8,r6
 	st	 r8,res_ptr,24
 	addu.cio r9,r9,r7
 	st	 r9,res_ptr,28
 	addu	 s1_ptr,s1_ptr,32
 	addu	 s2_ptr,s2_ptr,32
 	addu	 res_ptr,res_ptr,32
 	bcnd	 ge0,size,Loop2
 Lfin2:	addu	 size,size,8-2		; undo the bias of 8, bias by 2 for the pair loop
 	bcnd	 lt0,size,Lend2
 /* Add blocks of 2 limbs until less than 2 limbs remain */
 Loope2:	ld.d	 r8,s1_ptr,0
 	ld.d	 r6,s2_ptr,0
 	addu.cio r8,r8,r6
 	st	 r8,res_ptr,0
 	addu.cio r9,r9,r7
 	st	 r9,res_ptr,4
 	subu	 size,size,2
 	addu	 s1_ptr,s1_ptr,8
 	addu	 s2_ptr,s2_ptr,8
 	addu	 res_ptr,res_ptr,8
 	bcnd	 ge0,size,Loope2
 Lend2:	bb0	 0,size,Lret2		; bit 0 of size set means one limb is left
 /* Add last limb */
 Ljone:	ld	 r10,s1_ptr,0
 	ld	 r8,s2_ptr,0
 	addu.cio r6,r10,r8
 	st	 r6,res_ptr,0
 Lret2:	jmp.n	 r1
 	addu.ci	 r2,r0,r0		; return carry-out from most sign. limb
--- a/sysdeps/m88k/m88110/addmul_1.s
+++ b/sysdeps/m88k/m88110/addmul_1.s
@ -0,0 +1,60 @@
 ; mc88110 __mpn_addmul_1 -- Multiply a limb vector with a single limb and
 ; add the product to a second limb vector.
 ; Copyright (C) 1996 Free Software Foundation, Inc.
 ; This file is part of the GNU MP Library.
 ; The GNU MP Library is free software; you can redistribute it and/or modify
 ; it under the terms of the GNU Library General Public License as published by
 ; the Free Software Foundation; either version 2 of the License, or (at your
 ; option) any later version.
 ; The GNU MP Library is distributed in the hope that it will be useful, but
 ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 ; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
 ; License for more details.
 ; You should have received a copy of the GNU Library General Public License
 ; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
 ; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 ; INPUT PARAMETERS
 ; res_ptr	r2
 ; s1_ptr	r3
 ; size		r4
 ; s2_limb	r5
 ; Multiply the limbs at s1_ptr (r3) by s2_limb (r5), add the products to the
 ; limbs at res_ptr (r2) and return the final carry limb in r2.
 ; Register use inside the routine:
 ;   r3, r8	end-relative bases for S1 and RES, indexed by r4
 ;   r4		negative limb index, counts up to zero
 ;   r2		carry limb handed from one limb to the next
 ;   r10,r11	result pair of mulu.d (r11 goes into the stored limb,
 ;		r10 into the next carry limb)
 ;   r6, r7, r9	current S1 limb, current RES limb, limb being stored
 	text
 	align	16
 	global	___mpn_addmul_1
 ___mpn_addmul_1:
 	lda	 r3,r3[r4]		; r3 = end of S1 (scaled index)
 	lda	 r8,r2[r4]		; RES_PTR in r8 since r2 is retval
 	subu	 r4,r0,r4		; r4 = -size
 	addu.co	 r2,r0,r0		; r2 = cy = 0
 	ld	 r6,r3[r4]		; first S1 limb
 	addu	 r4,r4,1
 	subu	 r8,r8,4		; compensate for r4 already being incremented
 	bcnd.n	 eq0,r4,Lend		; size was 1: skip the loop
 	 mulu.d	 r10,r6,r5		; (delay slot) first product
 Loop:	ld	 r7,r8[r4]		; RES limb for the product in r10:r11
 	ld	 r6,r3[r4]		; next S1 limb
 	addu.cio r9,r11,r2		; low product + carry limb + cy of previous RES add
 	addu.ci	 r2,r10,r0		; new carry limb = high product + cy
 	addu.co	 r9,r9,r7		; add RES limb; its cy is consumed next time round
 	st	 r9,r8[r4]
 	addu	 r4,r4,1
 	mulu.d	 r10,r6,r5
 	bcnd	 ne0,r4,Loop
 ; Last limb: same carry chain, but no further S1 limb to fetch.
 Lend:	ld	 r7,r8,0
 	addu.cio r9,r11,r2
 	addu.ci	 r2,r10,r0
 	addu.co	 r9,r9,r7
 	st	 r9,r8,0
 	jmp.n	 r1
 	 addu.ci r2,r2,r0		; (delay slot) fold the last cy into the returned limb
--- a/sysdeps/m88k/m88110/mul_1.s
+++ b/sysdeps/m88k/m88110/mul_1.s
@ -1,7 +1,7 @@
 ; mc88110 __mpn_mul_1 -- Multiply a limb vector with a single limb and
 ; store the product in a second limb vector.
-; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+; Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
 ; This file is part of the GNU MP Library.
@ -56,29 +56,3 @@ Lend:	addu.cio r9,r11,r2
 	st	 r9,r8,4
 	jmp.n	 r1
 	 addu.ci r2,r10,r0
 ; This is the Right Way to do this on '110.  4 cycles / 64-bit limb.
 ;	ld.d	r10,
 ;	mulu.d
 ;	addu.cio
 ;	addu.cio
 ;	st.d
 ;	mulu.d	,r11,r5
 ;	ld.d	r12,
 ;	mulu.d	,r10,r5
 ;	addu.cio
 ;	addu.cio
 ;	st.d
 ;	mulu.d
 ;	ld.d	r10,
 ;	mulu.d
 ;	addu.cio
 ;	addu.cio
 ;	st.d
 ;	mulu.d
 ;	ld.d	r10,
 ;	mulu.d
 ;	addu.cio
 ;	addu.cio
 ;	st.d
 ;	mulu.d
--- a/sysdeps/m88k/m88110/sub_n.S
+++ b/sysdeps/m88k/m88110/sub_n.S
@ -0,0 +1,275 @@
 ; mc88110 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
 ; store difference in a third limb vector.
 ; Copyright (C) 1995, 1996 Free Software Foundation, Inc.
 ; This file is part of the GNU MP Library.
 ; The GNU MP Library is free software; you can redistribute it and/or modify
 ; it under the terms of the GNU Library General Public License as published by
 ; the Free Software Foundation; either version 2 of the License, or (at your
 ; option) any later version.
 ; The GNU MP Library is distributed in the hope that it will be useful, but
 ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 ; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
 ; License for more details.
 ; You should have received a copy of the GNU Library General Public License
 ; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
 ; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 ; INPUT PARAMETERS
 #define res_ptr	r2
 #define s1_ptr	r3
 #define s2_ptr	r4
 #define size	r5
 #include "sysdep.h"
 /* __mpn_sub_n: subtract the size-limb vector at s2_ptr from the one at
    s1_ptr and store the difference at res_ptr.  The value returned in r2
    is the inverted carry flag after the most significant limb (the entry
    code sets the flag before the first subtraction, so this is presumably
    the borrow-out).  Limbs are 4 bytes.

    Bit 2 of a pointer tells whether it is on an 8-byte boundary (assuming
    limb-aligned pointers).  The code picks one of three variants so that
    the double-word accesses (ld.d/st.d) are only applied to pointers that
    have been brought to bit 2 clear:
      V1a  s2_ptr and res_ptr agree in bit 2: ld.d from s2_ptr, st.d to
           res_ptr, single-word loads from s1_ptr.
      V1b  s1_ptr and res_ptr agree in bit 2: ld.d from s1_ptr, st.d to
           res_ptr, single-word loads from s2_ptr.  Unlike addition the
           operands cannot simply be swapped, so this is a separate copy
           of the loop.
      V2   otherwise s1_ptr and s2_ptr agree: ld.d from both sources,
           single-word stores to res_ptr.  */
 	text
 	align	16
 	global	C_SYMBOL_NAME(__mpn_sub_n)
 C_SYMBOL_NAME(__mpn_sub_n):
 	subu.co	 r0,r0,r0		; set cy flag
 	xor	 r12,s2_ptr,res_ptr
 	bb1	 2,r12,L1		; s2_ptr and res_ptr differ in bit 2
 ; **  V1a  **
 L0:	bb0	 2,res_ptr,L_v1		; branch if res_ptr is aligned
 /* Subtract least significant limb separately to align res_ptr and s2_ptr */
 	ld	 r10,s1_ptr,0
 	addu	 s1_ptr,s1_ptr,4
 	ld	 r8,s2_ptr,0
 	addu	 s2_ptr,s2_ptr,4
 	subu	 size,size,1
 	subu.co	 r6,r10,r8
 	st	 r6,res_ptr,0
 	addu	 res_ptr,res_ptr,4
 L_v1:	cmp	 r12,size,2
 	bb1	 lt,r12,Lend2		; fewer than 2 limbs left: 0 or 1 via Lend2
 /* Preload the next two limbs of each source; the loops below always keep
    one such pair in flight and Lend1 consumes the last pair.  */
 	ld	 r10,s1_ptr,0
 	ld	 r12,s1_ptr,4
 	ld.d	 r8,s2_ptr,0
 	subu	 size,size,10		; Loop1 runs only while at least 8+2 limbs remain
 	bcnd	 lt0,size,Lfin1
 /* Subtract blocks of 8 limbs until less than 8 limbs remain */
 	align	 8
 Loop1:	subu	 size,size,8
 	subu.cio r6,r10,r8
 	ld	 r10,s1_ptr,8
 	subu.cio r7,r12,r9
 	ld	 r12,s1_ptr,12
 	ld.d	 r8,s2_ptr,8
 	st.d	 r6,res_ptr,0
 	subu.cio r6,r10,r8
 	ld	 r10,s1_ptr,16
 	subu.cio r7,r12,r9
 	ld	 r12,s1_ptr,20
 	ld.d	 r8,s2_ptr,16
 	st.d	 r6,res_ptr,8
 	subu.cio r6,r10,r8
 	ld	 r10,s1_ptr,24
 	subu.cio r7,r12,r9
 	ld	 r12,s1_ptr,28
 	ld.d	 r8,s2_ptr,24
 	st.d	 r6,res_ptr,16
 	subu.cio r6,r10,r8
 	ld	 r10,s1_ptr,32
 	subu.cio r7,r12,r9
 	ld	 r12,s1_ptr,36
 	addu	 s1_ptr,s1_ptr,32
 	ld.d	 r8,s2_ptr,32
 	addu	 s2_ptr,s2_ptr,32
 	st.d	 r6,res_ptr,24
 	addu	 res_ptr,res_ptr,32
 	bcnd	 ge0,size,Loop1
 Lfin1:	addu	 size,size,8-2		; undo the bias of 8, keep 2 for the pair in flight
 	bcnd	 lt0,size,Lend1
 /* Subtract blocks of 2 limbs until less than 2 limbs remain */
 Loope1:	subu.cio r6,r10,r8
 	ld	 r10,s1_ptr,8
 	subu.cio r7,r12,r9
 	ld	 r12,s1_ptr,12
 	ld.d	 r8,s2_ptr,8
 	st.d	 r6,res_ptr,0
 	subu	 size,size,2
 	addu	 s1_ptr,s1_ptr,8
 	addu	 s2_ptr,s2_ptr,8
 	addu	 res_ptr,res_ptr,8
 	bcnd	 ge0,size,Loope1
 /* Subtract the pair that is still in flight; bit 0 of size then tells
    whether one more limb remains.  */
 Lend1:	subu.cio r6,r10,r8
 	subu.cio r7,r12,r9
 	st.d	 r6,res_ptr,0
 	bb0	 0,size,Lret1
 /* Subtract last limb */
 	ld	 r10,s1_ptr,8
 	ld	 r8,s2_ptr,8
 	subu.cio r6,r10,r8
 	st	 r6,res_ptr,8
 Lret1:	addu.ci r2,r0,r0		; r2 = cy flag after most sign. limb
 	jmp.n	 r1
 	 xor	r2,r2,1			; (delay slot) return the inverted flag
 L1:	xor	 r12,s1_ptr,res_ptr
 	bb1	 2,r12,L2		; s1_ptr and res_ptr differ in bit 2 as well
 ; **  V1b  **
 /* Here r8/r9 hold s1 limbs and r10/r12 hold s2 limbs, the reverse of V1a,
    so every subtraction below is written r8 - r10 and r9 - r12.  */
 	bb0	 2,res_ptr,L_v1b	; branch if res_ptr is aligned
 /* Subtract least significant limb separately to align res_ptr and s1_ptr */
 	ld	 r10,s2_ptr,0
 	addu	 s2_ptr,s2_ptr,4
 	ld	 r8,s1_ptr,0
 	addu	 s1_ptr,s1_ptr,4
 	subu	 size,size,1
 	subu.co	 r6,r8,r10
 	st	 r6,res_ptr,0
 	addu	 res_ptr,res_ptr,4
 L_v1b:	cmp	 r12,size,2
 	bb1	 lt,r12,Lend2		; fewer than 2 limbs left: 0 or 1 via Lend2
 	ld	 r10,s2_ptr,0
 	ld	 r12,s2_ptr,4
 	ld.d	 r8,s1_ptr,0
 	subu	 size,size,10		; Loop1b runs only while at least 8+2 limbs remain
 	bcnd	 lt0,size,Lfin1b
 /* Subtract blocks of 8 limbs until less than 8 limbs remain */
 	align	 8
 Loop1b:	subu	 size,size,8
 	subu.cio r6,r8,r10
 	ld	 r10,s2_ptr,8
 	subu.cio r7,r9,r12
 	ld	 r12,s2_ptr,12
 	ld.d	 r8,s1_ptr,8
 	st.d	 r6,res_ptr,0
 	subu.cio r6,r8,r10
 	ld	 r10,s2_ptr,16
 	subu.cio r7,r9,r12
 	ld	 r12,s2_ptr,20
 	ld.d	 r8,s1_ptr,16
 	st.d	 r6,res_ptr,8
 	subu.cio r6,r8,r10
 	ld	 r10,s2_ptr,24
 	subu.cio r7,r9,r12
 	ld	 r12,s2_ptr,28
 	ld.d	 r8,s1_ptr,24
 	st.d	 r6,res_ptr,16
 	subu.cio r6,r8,r10
 	ld	 r10,s2_ptr,32
 	subu.cio r7,r9,r12
 	ld	 r12,s2_ptr,36
 	addu	 s2_ptr,s2_ptr,32
 	ld.d	 r8,s1_ptr,32
 	addu	 s1_ptr,s1_ptr,32
 	st.d	 r6,res_ptr,24
 	addu	 res_ptr,res_ptr,32
 	bcnd	 ge0,size,Loop1b
 Lfin1b:	addu	 size,size,8-2		; undo the bias of 8, keep 2 for the pair in flight
 	bcnd	 lt0,size,Lend1b
 /* Subtract blocks of 2 limbs until less than 2 limbs remain */
 Loope1b:subu.cio r6,r8,r10
 	ld	 r10,s2_ptr,8
 	subu.cio r7,r9,r12
 	ld	 r12,s2_ptr,12
 	ld.d	 r8,s1_ptr,8
 	st.d	 r6,res_ptr,0
 	subu	 size,size,2
 	addu	 s1_ptr,s1_ptr,8
 	addu	 s2_ptr,s2_ptr,8
 	addu	 res_ptr,res_ptr,8
 	bcnd	 ge0,size,Loope1b
 Lend1b:	subu.cio r6,r8,r10
 	subu.cio r7,r9,r12
 	st.d	 r6,res_ptr,0
 	bb0	 0,size,Lret1b
 /* Subtract last limb */
 	ld	 r10,s2_ptr,8
 	ld	 r8,s1_ptr,8
 	subu.cio r6,r8,r10
 	st	 r6,res_ptr,8
 Lret1b:	addu.ci r2,r0,r0		; r2 = cy flag after most sign. limb
 	jmp.n	 r1
 	 xor	r2,r2,1			; (delay slot) return the inverted flag
 ; **  V2  **
 /* If we come here, the alignment of s1_ptr and res_ptr as well as the
    alignment of s2_ptr and res_ptr differ.  Since there are only two ways
    things can be aligned (that we care about) we now know that the alignment
    of s1_ptr and s2_ptr are the same.  */
 L2:	cmp	 r12,size,1
 	bb1	 eq,r12,Ljone		; a single limb needs no alignment step
 	bb0	 2,s1_ptr,L_v2		; branch if s1_ptr is aligned
 /* Subtract least significant limb separately to align s1_ptr and s2_ptr */
 	ld	 r10,s1_ptr,0
 	addu	 s1_ptr,s1_ptr,4
 	ld	 r8,s2_ptr,0
 	addu	 s2_ptr,s2_ptr,4
 	subu	 size,size,1
 	subu.co	 r6,r10,r8
 	st	 r6,res_ptr,0
 	addu	 res_ptr,res_ptr,4
 L_v2:	subu	 size,size,8
 	bcnd	 lt0,size,Lfin2
 /* Subtract blocks of 8 limbs until less than 8 limbs remain */
 	align	 8
 Loop2:	subu	 size,size,8
 	ld.d	 r8,s1_ptr,0
 	ld.d	 r6,s2_ptr,0
 	subu.cio r8,r8,r6
 	st	 r8,res_ptr,0
 	subu.cio r9,r9,r7
 	st	 r9,res_ptr,4
 	ld.d	 r8,s1_ptr,8
 	ld.d	 r6,s2_ptr,8
 	subu.cio r8,r8,r6
 	st	 r8,res_ptr,8
 	subu.cio r9,r9,r7
 	st	 r9,res_ptr,12
 	ld.d	 r8,s1_ptr,16
 	ld.d	 r6,s2_ptr,16
 	subu.cio r8,r8,r6
 	st	 r8,res_ptr,16
 	subu.cio r9,r9,r7
 	st	 r9,res_ptr,20
 	ld.d	 r8,s1_ptr,24
 	ld.d	 r6,s2_ptr,24
 	subu.cio r8,r8,r6
 	st	 r8,res_ptr,24
 	subu.cio r9,r9,r7
 	st	 r9,res_ptr,28
 	addu	 s1_ptr,s1_ptr,32
 	addu	 s2_ptr,s2_ptr,32
 	addu	 res_ptr,res_ptr,32
 	bcnd	 ge0,size,Loop2
 Lfin2:	addu	 size,size,8-2		; undo the bias of 8, bias by 2 for the pair loop
 	bcnd	 lt0,size,Lend2
 /* Subtract blocks of 2 limbs until less than 2 limbs remain */
 Loope2:	ld.d	 r8,s1_ptr,0
 	ld.d	 r6,s2_ptr,0
 	subu.cio r8,r8,r6
 	st	 r8,res_ptr,0
 	subu.cio r9,r9,r7
 	st	 r9,res_ptr,4
 	subu	 size,size,2
 	addu	 s1_ptr,s1_ptr,8
 	addu	 s2_ptr,s2_ptr,8
 	addu	 res_ptr,res_ptr,8
 	bcnd	 ge0,size,Loope2
 Lend2:	bb0	 0,size,Lret2		; bit 0 of size set means one limb is left
 /* Subtract last limb */
 Ljone:	ld	 r10,s1_ptr,0
 	ld	 r8,s2_ptr,0
 	subu.cio r6,r10,r8
 	st	 r6,res_ptr,0
 Lret2:	addu.ci r2,r0,r0		; r2 = cy flag after most sign. limb
 	jmp.n	 r1
 	 xor	r2,r2,1			; (delay slot) return the inverted flag
--- a/sysdeps/m88k/mul_1.s
+++ b/sysdeps/m88k/mul_1.s
@ -1,7 +1,7 @@
 ; mc88100 __mpn_mul_1 -- Multiply a limb vector with a single limb and
 ; store the product in a second limb vector.
-; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+; Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
 ; This file is part of the GNU MP Library.
@ -55,14 +55,14 @@ ___mpn_mul_1:
 	; Make S1_PTR and RES_PTR point at the end of their blocks
 	; and negate SIZE.
 	lda	 r3,r3[r4]
-	lda	 r6,r2[r4]		; RES_PTR in r6 since r2 is retval
+	lda	 r6,r2[r4]	; RES_PTR in r6 since r2 is retval
 	subu	 r4,r0,r4
-	addu.co	 r2,r0,r0		; r2 = cy = 0
+	addu.co	 r2,r0,r0	; r2 = cy = 0
 	ld	 r9,r3[r4]
-	mask	 r7,r5,0xffff		; r7 = lo(S2_LIMB)
+	mask	 r7,r5,0xffff	; r7 = lo(S2_LIMB)
-	extu	 r8,r5,16		; r8 = hi(S2_LIMB)
+	extu	 r8,r5,16	; r8 = hi(S2_LIMB)
-	bcnd.n	 eq0,r8,Lsmall		; jump if (hi(S2_LIMB) == 0)
+	bcnd.n	 eq0,r8,Lsmall	; jump if (hi(S2_LIMB) == 0)
 	 subu	 r6,r6,4
 ; General code for any value of S2_LIMB.
@ -75,28 +75,27 @@ ___mpn_mul_1:
 	br.n	L1
 	addu	 r4,r4,1
-Loop:
+Loop:	ld	 r9,r3[r4]
 	ld	 r9,r3[r4]
 	st	 r26,r6[r4]
-; bcnd	ne0,r0,0			; bubble
+; bcnd	ne0,r0,0		; bubble
 	addu	 r4,r4,1
-L1:	mul	 r26,r9,r5		; low word of product	mul_1	WB ld
+L1:	mul	 r26,r9,r5	; low word of product	mul_1	WB ld
-	mask	 r12,r9,0xffff		; r12 = lo(s1_limb)	mask_1
+	mask	 r12,r9,0xffff	; r12 = lo(s1_limb)	mask_1
-	mul	 r11,r12,r7		; r11 =  prod_0		mul_2	WB mask_1
+	mul	 r11,r12,r7	; r11 =  prod_0		mul_2	WB mask_1
-	mul	 r10,r12,r8		; r10 = prod_1a		mul_3
+	mul	 r10,r12,r8	; r10 = prod_1a		mul_3
-	extu	 r13,r9,16		; r13 = hi(s1_limb)	extu_1	WB mul_1
+	extu	 r13,r9,16	; r13 = hi(s1_limb)	extu_1	WB mul_1
-	mul	 r12,r13,r7		; r12 = prod_1b		mul_4	WB extu_1
+	mul	 r12,r13,r7	; r12 = prod_1b		mul_4	WB extu_1
-	mul	 r25,r13,r8		; r25  = prod_2		mul_5	WB mul_2
+	mul	 r25,r13,r8	; r25  = prod_2		mul_5	WB mul_2
-	extu	 r11,r11,16		; r11 = hi(prod_0)	extu_2	WB mul_3
+	extu	 r11,r11,16	; r11 = hi(prod_0)	extu_2	WB mul_3
-	addu	 r10,r10,r11		;			addu_1	WB extu_2
+	addu	 r10,r10,r11	;			addu_1	WB extu_2
-; bcnd	ne0,r0,0			; bubble			WB addu_1
+; bcnd	ne0,r0,0		; bubble			WB addu_1
-	addu.co	 r10,r10,r12		;				WB mul_4
+	addu.co	 r10,r10,r12	;				WB mul_4
-	mask.u	 r10,r10,0xffff		; move the 16 most significant bits...
+	mask.u	 r10,r10,0xffff	; move the 16 most significant bits...
-	addu.ci	 r10,r10,r0		; ...to the low half of the word...
+	addu.ci	 r10,r10,r0	; ...to the low half of the word...
-	rot	 r10,r10,16		; ...and put carry in pos 16.
+	rot	 r10,r10,16	; ...and put carry in pos 16.
-	addu.co	 r26,r26,r2		; add old carry limb
+	addu.co	 r26,r26,r2	; add old carry limb
 	bcnd.n	 ne0,r4,Loop
-	 addu.ci r2,r25,r10		; compute new carry limb
+	 addu.ci r2,r25,r10	; compute new carry limb
 	st	 r26,r6[r4]
 	ld.d	 r25,r31,8
@ -109,20 +108,19 @@ Lsmall:
 	br.n	SL1
 	addu	 r4,r4,1
-SLoop:
+SLoop:	ld	 r9,r3[r4]	;
-	ld	 r9,r3[r4]		;
+	st	 r8,r6[r4]	;
-	st	 r8,r6[r4]		;
+	addu	 r4,r4,1	;
-	addu	 r4,r4,1		;
+SL1:	mul	 r8,r9,r5	; low word of product
-SL1:	mul	 r8,r9,r5		; low word of product
+	mask	 r12,r9,0xffff	; r12 = lo(s1_limb)
-	mask	 r12,r9,0xffff		; r12 = lo(s1_limb)
+	extu	 r13,r9,16	; r13 = hi(s1_limb)
-	extu	 r13,r9,16		; r13 = hi(s1_limb)
+	mul	 r11,r12,r7	; r11 =  prod_0
-	mul	 r11,r12,r7		; r11 =  prod_0
+	mul	 r12,r13,r7	; r12 = prod_1b
-	mul	 r12,r13,r7		; r12 = prod_1b
+	addu.cio r8,r8,r2	; add old carry limb
-	addu.cio r8,r8,r2		; add old carry limb
+	extu	 r10,r11,16	; r11 = hi(prod_0)
-	extu	 r10,r11,16		; r11 = hi(prod_0)
+	addu	 r10,r10,r12	;
 	addu	 r10,r10,r12		;
 	bcnd.n	 ne0,r4,SLoop
-	extu	 r2,r10,16		; r2 = new carry limb
+	extu	 r2,r10,16	; r2 = new carry limb
 	jmp.n	 r1
 	st	 r8,r6[r4]
--- a/sysdeps/m88k/sub_n.s
+++ b/sysdeps/m88k/sub_n.s
@ -1,7 +1,7 @@
 ; mc88100 __mpn_sub -- Subtract two limb vectors of the same length > 0 and
 ; store difference in a third limb vector.
-; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+; Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc.
 ; This file is part of the GNU MP Library.
@ -41,9 +41,10 @@ ___mpn_sub_n:
 	extu	r10,r5,3
 	ld	r7,r4,0			; read first limb from s2_ptr
-	subu.co	r5,r0,r5		; (clear carry as side effect)
+	subu	r5,r0,r5
 	mak	r5,r5,3<4>
-	bcnd	eq0,r5,Lzero
+	bcnd.n	eq0,r5,Lzero
 	subu.co	r0,r0,r0		; initialize carry
 	or	r12,r0,lo16(Lbase)
 	or.u	r12,r12,hi16(Lbase)
--- a/sysdeps/mips/addmul_1.s
+++ b/sysdeps/mips/addmul_1.s
@ -1,7 +1,7 @@
 # MIPS __mpn_addmul_1 -- Multiply a limb vector with a single limb and
 # add the product to a second limb vector.
- # Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+ # Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc.
 # This file is part of the GNU MP Library.
@ -63,7 +63,7 @@ Loop:	lw	$10,0($4)
 	addu	$2,$2,$10
 	sw	$3,0($4)
 	addiu	$4,$4,4
-	bne	$6,$0,Loop	# should be "bnel"
+	bne	$6,$0,Loop
 	 addu	$2,$9,$2	# add high product limb and carry from addition
 # cool down phase 1
--- a/sysdeps/mips/mips3/addmul_1.s
+++ b/sysdeps/mips/mips3/addmul_1.s
@ -63,7 +63,7 @@ Loop:	ld	$10,0($4)
 	daddu	$2,$2,$10
 	sd	$3,0($4)
 	daddiu	$4,$4,8
-	bne	$6,$0,Loop	# should be "bnel"
+	bne	$6,$0,Loop
 	 daddu	$2,$9,$2	# add high product limb and carry from addition
 # cool down phase 1
--- a/sysdeps/mips/mips3/mul_1.s
+++ b/sysdeps/mips/mips3/mul_1.s
@ -59,7 +59,7 @@ Loop:	mflo	$10
 	sltu	$2,$10,$2	# carry from previous addition -> $2
 	sd	$10,0($4)
 	daddiu	$4,$4,8
-	bne	$6,$0,Loop	# should be "bnel"
+	bne	$6,$0,Loop
 	 daddu	$2,$9,$2	# add high product limb and carry from addition
 # cool down phase 1
--- a/sysdeps/mips/mips3/submul_1.s
+++ b/sysdeps/mips/mips3/submul_1.s
@ -63,7 +63,7 @@ Loop:	ld	$10,0($4)
 	daddu	$2,$2,$10
 	sd	$3,0($4)
 	daddiu	$4,$4,8
-	bne	$6,$0,Loop	# should be "bnel"
+	bne	$6,$0,Loop
 	 daddu	$2,$9,$2	# add high product limb and carry from addition
 # cool down phase 1
--- a/sysdeps/mips/mul_1.s
+++ b/sysdeps/mips/mul_1.s
@ -1,7 +1,7 @@
 # MIPS __mpn_mul_1 -- Multiply a limb vector with a single limb and
 # store the product in a second limb vector.
- # Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+ # Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc.
 # This file is part of the GNU MP Library.
@ -59,7 +59,7 @@ Loop:	mflo	$10
 	sltu	$2,$10,$2	# carry from previous addition -> $2
 	sw	$10,0($4)
 	addiu	$4,$4,4
-	bne	$6,$0,Loop	# should be "bnel"
+	bne	$6,$0,Loop
 	 addu	$2,$9,$2	# add high product limb and carry from addition
 # cool down phase 1
--- a/sysdeps/mips/submul_1.s
+++ b/sysdeps/mips/submul_1.s
@ -1,7 +1,7 @@
 # MIPS __mpn_submul_1 -- Multiply a limb vector with a single limb and
 # subtract the product from a second limb vector.
- # Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+ # Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc.
 # This file is part of the GNU MP Library.
@ -63,7 +63,7 @@ Loop:	lw	$10,0($4)
 	addu	$2,$2,$10
 	sw	$3,0($4)
 	addiu	$4,$4,4
-	bne	$6,$0,Loop	# should be "bnel"
+	bne	$6,$0,Loop
 	 addu	$2,$9,$2	# add high product limb and carry from addition
 # cool down phase 1
--- a/sysdeps/rs6000/add_n.s
+++ b/sysdeps/rs6000/add_n.s
@ -1,6 +1,6 @@
 # IBM POWER __mpn_add_n -- Add two limb vectors of equal, non-zero length.
-# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+# Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
 # This file is part of the GNU MP Library.
--- a/sysdeps/rs6000/sub_n.s
+++ b/sysdeps/rs6000/sub_n.s
@ -1,7 +1,7 @@
 # IBM POWER __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
 # store difference in a third limb vector.
-# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+# Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
 # This file is part of the GNU MP Library.
--- a/sysdeps/vax/gmp-mparam.h
+++ b/sysdeps/vax/gmp-mparam.h
@ -1,6 +1,6 @@
 /* gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc.
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
 This file is part of the GNU MP Library.
--- a/sysdeps/z8000/mul_1.s
+++ b/sysdeps/z8000/mul_1.s
@ -1,7 +1,7 @@
 ! Z8000 __mpn_mul_1 -- Multiply a limb vector with a limb and store
 ! the result in a second limb vector.
-! Copyright (C) 1993, 1994 Free Software Foundation, Inc.
+! Copyright (C) 1993, 1994, 1995 Free Software Foundation, Inc.
 ! This file is part of the GNU MP Library.