593 lines
		
	
	
		
			15 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
			
		
		
	
	
			593 lines
		
	
	
		
			15 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
| /*
 | |
|  *  Copyright (C) 2003-2013 Altera Corporation
 | |
|  *  All rights reserved.
 | |
|  *
 | |
|  * This program is free software; you can redistribute it and/or modify
 | |
|  * it under the terms of the GNU General Public License as published by
 | |
|  * the Free Software Foundation; either version 2 of the License, or
 | |
|  * (at your option) any later version.
 | |
|  *
 | |
|  * This program is distributed in the hope that it will be useful,
 | |
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | |
|  * GNU General Public License for more details.
 | |
|  *
 | |
|  * You should have received a copy of the GNU General Public License
 | |
|  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 | |
|  */
 | |
| 
 | |
| 
 | |
| #include <linux/linkage.h>
 | |
| #include <asm/entry.h>
 | |
| 
 | |
| .set noat
 | |
| .set nobreak
 | |
| 
 | |
| /*
 | |
| * Explicitly allow the use of r1 (the assembler temporary register)
 | |
| * within this code. This register is normally reserved for the use of
 | |
| * the compiler.
 | |
| */
 | |
| 
 | |
ENTRY(instruction_trap)
	/*
	 * Entry point for emulating an instruction that the processor core
	 * does not implement.
	 *
	 * On entry sp addresses a register save area that is read through the
	 * PT_* offsets and is PT_REGS_SIZE bytes long.  Reload the saved values
	 * into the live registers so that the emulation code below works on the
	 * register state of the interrupted code, then release the save area.
	 */
	ldw	r1, PT_R1(sp)		/* Restore registers */
	ldw	r2, PT_R2(sp)
	ldw	r3, PT_R3(sp)
	ldw	r4, PT_R4(sp)
	ldw	r5, PT_R5(sp)
	ldw	r6, PT_R6(sp)
	ldw	r7, PT_R7(sp)
	ldw	r8, PT_R8(sp)
	ldw	r9, PT_R9(sp)
	ldw	r10, PT_R10(sp)
	ldw	r11, PT_R11(sp)
	ldw	r12, PT_R12(sp)
	ldw	r13, PT_R13(sp)
	ldw	r14, PT_R14(sp)
	ldw	r15, PT_R15(sp)
	ldw	ra, PT_RA(sp)
	ldw	fp, PT_FP(sp)
	ldw	gp, PT_GP(sp)
	ldw	et, PT_ESTATUS(sp)	/* et is only a scratch register here */
	wrctl	estatus, et
	ldw	ea, PT_EA(sp)
	ldw	et, PT_SP(sp)		/* backup sp in et */

	addi	sp, sp, PT_REGS_SIZE

	/* INSTRUCTION EMULATION
	 *  ---------------------
	 *
	 * Nios II processors generate exceptions for unimplemented instructions.
	 * The routines below emulate these instructions.  Depending on the
	 * processor core, the only instructions that might need to be emulated
	 * are div, divu, mul, muli, mulxss, mulxsu, and mulxuu.
	 *
	 * The emulations match the instructions, except for the following
	 * limitations:
	 *
	 * 1) The emulation routines do not emulate the use of the exception
	 *    temporary register (et) as a source operand because the exception
	 *    handler already has modified it.
	 *
	 * 2) The routines do not emulate the use of the stack pointer (sp) or
	 *    the exception return address register (ea) as a destination because
	 *    modifying these registers crashes the exception handler or the
	 *    interrupted routine.
	 *
	 * Detailed Design
	 * ---------------
	 *
	 * The emulation routines expect the contents of integer registers r0-r31
	 * to be on the stack at addresses sp, 4(sp), 8(sp), ... 124(sp), i.e.
	 * register rN is kept at offset 4*N.  The routines retrieve source
	 * operands from the stack and modify the destination register's value
	 * on the stack prior to the end of the exception handler.  Then all
	 * registers except the destination register are restored to their
	 * previous values.
	 *
	 * Three slots of that frame do not hold their register's value:
	 * 96(sp) (et, r24) is never written, 100(sp) (bt, r25) holds the
	 * estatus control register instead, and 120(sp) (ba, r30) is never
	 * written.
	 *
	 * The instruction that causes the exception is found at address -4(ea).
	 * The instruction's OP and OPX fields identify the operation to be
	 * performed.
	 *
	 * One instruction, muli, is an I-type instruction that is identified by
	 * an OP field of 0x24.
	 *
	 * muli   AAAAA,BBBBB,IIIIIIIIIIIIIIII,-0x24-
	 *           27    22                6      0    <-- LSB of field
	 *
	 * The remaining emulated instructions are R-type and have an OP field
	 * of 0x3a.  Their OPX fields identify them.
	 *
	 * R-type AAAAA,BBBBB,CCCCC,XXXXXX,NNNNN,-0x3a-
	 *           27    22    17     11     6      0  <-- LSB of field
	 *
	 *
	 * Opcode Encoding.  muli is identified by its OP value.  Then OPX & 0x02
	 * is used to differentiate between the division opcodes and the
	 * remaining multiplication opcodes.
	 *
	 * Instruction   OP      OPX    OPX & 0x02
	 * -----------   ----    ----   ----------
	 * muli          0x24
	 * divu          0x3a    0x24         0
	 * div           0x3a    0x25         0
	 * mul           0x3a    0x27      != 0
	 * mulxuu        0x3a    0x07      != 0
	 * mulxsu        0x3a    0x17      != 0
	 * mulxss        0x3a    0x1f      != 0
	 */


	/*
	 * Save everything on the stack to make it easy for the emulation
	 * routines to retrieve the source register operands.
	 */

	addi sp, sp, -128
	stw zero, 0(sp)	/* Save zero on stack to avoid special case for r0. */
	stw r1, 4(sp)
	stw r2,  8(sp)
	stw r3, 12(sp)
	stw r4, 16(sp)
	stw r5, 20(sp)
	stw r6, 24(sp)
	stw r7, 28(sp)
	stw r8, 32(sp)
	stw r9, 36(sp)
	stw r10, 40(sp)
	stw r11, 44(sp)
	stw r12, 48(sp)
	stw r13, 52(sp)
	stw r14, 56(sp)
	stw r15, 60(sp)
	stw r16, 64(sp)
	stw r17, 68(sp)
	stw r18, 72(sp)
	stw r19, 76(sp)
	stw r20, 80(sp)
	stw r21, 84(sp)
	stw r22, 88(sp)
	stw r23, 92(sp)
		/* Don't bother to save et.  It's already been changed. */
	rdctl r5, estatus	/* r5 is free: its value was saved at 20(sp). */
	stw r5,  100(sp)

	stw gp, 104(sp)
	stw et, 108(sp)	/* et contains previous sp value. */
	stw fp, 112(sp)
	stw ea, 116(sp)
	/*
	 * ra is r31, so it belongs at 4 * 31 = 124(sp).  The operand fetch and
	 * the result store below index this frame by 4 * register number, so
	 * ra must live in its own slot for it to work as a source or as a
	 * destination of the emulated instruction.
	 */
	stw ra, 124(sp)


	/*
	 * Split the instruction into its fields.  We need 4*A, 4*B, and 4*C as
	 * offsets to the stack pointer for access to the stored register values.
	 */
	ldw r2,-4(ea)	/* r2 = AAAAA,BBBBB,IIIIIIIIIIIIIIII,PPPPPP */
	roli r3, r2, 7	/* r3 = BBB,IIIIIIIIIIIIIIII,PPPPPP,AAAAA,BB */
	roli r4, r3, 3	/* r4 = IIIIIIIIIIIIIIII,PPPPPP,AAAAA,BBBBB */
	roli r5, r4, 2	/* r5 = IIIIIIIIIIIIII,PPPPPP,AAAAA,BBBBB,II */
	srai r4, r4, 16	/* r4 = (sign-extended) IMM16 */
	roli r6, r5, 5	/* r6 = XXXX,NNNNN,PPPPPP,AAAAA,BBBBB,CCCCC,XX */
	andi r2, r2, 0x3f	/* r2 = 00000000000000000000000000,PPPPPP */
	andi r3, r3, 0x7c	/* r3 = 0000000000000000000000000,AAAAA,00 */
	andi r5, r5, 0x7c	/* r5 = 0000000000000000000000000,BBBBB,00 */
	andi r6, r6, 0x7c	/* r6 = 0000000000000000000000000,CCCCC,00 */

	/* Now
	 * r2 = OP
	 * r3 = 4*A
	 * r4 = IMM16 (sign extended)
	 * r5 = 4*B
	 * r6 = 4*C
	 */

	/*
	 * Get the operands.
	 *
	 * It is necessary to check for muli because it uses an I-type
	 * instruction format, while the other instructions have an R-type
	 * format.
	 *
	 *  Prepare for either multiplication or division loop.
	 *  They both loop 32 times.
	 */
	movi r14, 32

	add  r3, r3, sp		/* r3 = address of A-operand. */
	ldw  r3, 0(r3)		/* r3 = A-operand. */
	movi r7, 0x24		/* muli opcode (I-type instruction format) */
	beq r2, r7, mul_immed /* muli doesn't use the B register as a source */

	add  r5, r5, sp		/* r5 = address of B-operand. */
	ldw  r5, 0(r5)		/* r5 = B-operand. */
				/* r4 = SSSSSSSSSSSSSSSS,-----IMM16------ */
				/* IMM16 not needed, align OPX portion */
				/* r4 = SSSSSSSSSSSSSSSS,CCCCC,-OPX--,NNNNN */
	srli r4, r4, 5		/* r4 = 00000,SSSSSSSSSSSSSSSS,CCCCC,-OPX-- */
	andi r4, r4, 0x3f	/* r4 = 00000000000000000000000000,-OPX-- */

	/* Now
	 * r2 = OP
	 * r3 = src1
	 * r5 = src2
	 * r4 = OPX (no longer can be muli)
	 * r6 = 4*C
	 */


	/*
	 *  Multiply or Divide?
	 */
	andi r7, r4, 0x02	/* For R-type multiply instructions,
				   OPX & 0x02 != 0 */
	bne r7, zero, multiply


	/* DIVISION
	 *
	 * Divide an unsigned dividend by an unsigned divisor using
	 * a shift-and-subtract algorithm.  The example below shows
	 * 43 div 7 = 6 for 8-bit integers.  This classic algorithm uses a
	 * single register to store both the dividend and the quotient,
	 * allowing both values to be shifted with a single instruction.
	 *
	 *                               remainder dividend:quotient
	 *                               --------- -----------------
	 *   initialize                   00000000     00101011:
	 *   shift                        00000000     0101011:_
	 *   remainder >= divisor? no     00000000     0101011:0
	 *   shift                        00000000     101011:0_
	 *   remainder >= divisor? no     00000000     101011:00
	 *   shift                        00000001     01011:00_
	 *   remainder >= divisor? no     00000001     01011:000
	 *   shift                        00000010     1011:000_
	 *   remainder >= divisor? no     00000010     1011:0000
	 *   shift                        00000101     011:0000_
	 *   remainder >= divisor? no     00000101     011:00000
	 *   shift                        00001010     11:00000_
	 *   remainder >= divisor? yes    00001010     11:000001
	 *       remainder -= divisor   - 00000111
	 *                              ----------
	 *                                00000011     11:000001
	 *   shift                        00000111     1:000001_
	 *   remainder >= divisor? yes    00000111     1:0000011
	 *       remainder -= divisor   - 00000111
	 *                              ----------
	 *                                00000000     1:0000011
	 *   shift                        00000001     :0000011_
	 *   remainder >= divisor? no     00000001     :00000110
	 *
	 * The quotient is 00000110.
	 */

divide:
	/*
	 *  Prepare for division by assuming the result
	 *  is unsigned, and storing its "sign" as 0.
	 */
	movi r17, 0


	/* Which division opcode? */
	xori r7, r4, 0x25		/* OPX of div */
	bne r7, zero, unsigned_division


	/*
	 *  OPX is div.  Determine and store the sign of the quotient.
	 *  Then take the absolute value of both operands.
	 */
	xor r17, r3, r5		/* MSB contains sign of quotient */
	bge r3,zero,dividend_is_nonnegative
	sub r3, zero, r3	/* -r3 */
dividend_is_nonnegative:
	bge r5, zero, divisor_is_nonnegative
	sub r5, zero, r5	/* -r5 */
divisor_is_nonnegative:


unsigned_division:
	/* Initialize the unsigned-division loop. */
	movi r13, 0	/* remainder = 0 */

	/* Now
	 * r3 = dividend : quotient
	 * r4 = 0x25 for div, 0x24 for divu
	 * r5 = divisor
	 * r13 = remainder
	 * r14 = loop counter (already initialized to 32)
	 * r17 = MSB contains sign of quotient
	 */


	/*
	 *   for (count = 32; count > 0; --count)
	 *   {
	 */
divide_loop:

	/*
	 *       Division:
	 *
	 *       (remainder:dividend:quotient) <<= 1;
	 */
	slli r13, r13, 1
	cmplt r7, r3, zero	/* r7 = MSB of r3 */
	or r13, r13, r7
	slli r3, r3, 1


	/*
	 *       if (remainder >= divisor)
	 *       {
	 *           set LSB of quotient
	 *           remainder -= divisor;
	 *       }
	 */
	bltu r13, r5, div_skip
	ori r3, r3, 1
	sub r13, r13, r5
div_skip:

	/*
	 *   }
	 */
	subi r14, r14, 1
	bne r14, zero, divide_loop


	/* Now
	 * r3 = quotient
	 * r4 = 0x25 for div, 0x24 for divu
	 * r6 = 4*C
	 * r17 = MSB contains sign of quotient
	 */


	/*
	 *  Conditionally negate signed quotient.  If quotient is unsigned,
	 *  the sign already is initialized to 0.
	 */
	bge r17, zero, quotient_is_nonnegative
	sub r3, zero, r3		/* -r3 */
quotient_is_nonnegative:


	/*
	 *  Final quotient is in r3.
	 */
	add r6, r6, sp
	stw r3, 0(r6)	/* write quotient to stack */
	br restore_registers




	/* MULTIPLICATION
	 *
	 * A "product" is the number that one gets by summing a "multiplicand"
	 * several times.  The "multiplier" specifies the number of copies of the
	 * multiplicand that are summed.
	 *
	 * Actual multiplication algorithms don't use repeated addition, however.
	 * Shift-and-add algorithms get the same answer as repeated addition, and
	 * they are faster.  To compute the lower half of a product (pppp below)
	 * one shifts the product left before adding in each of the partial
	 * products (a * mmmm) through (d * mmmm).
	 *
	 * To compute the upper half of a product (PPPP below), one adds in the
	 * partial products (d * mmmm) through (a * mmmm), each time following
	 * the add by a right shift of the product.
	 *
	 *     mmmm
	 *   * abcd
	 *   ------
	 *     ####  = d * mmmm
	 *    ####   = c * mmmm
	 *   ####    = b * mmmm
	 *  ####     = a * mmmm
	 * --------
	 * PPPPpppp
	 *
	 * The example above shows 4 partial products.  Computing actual Nios II
	 * products requires 32 partials.
	 *
	 * It is possible to compute the result of mulxsu from the result of
	 * mulxuu because the only difference between the results of these two
	 * opcodes is the value of the partial product associated with the sign
	 * bit of rA.
	 *
	 *   mulxsu = mulxuu - ((rA < 0) ? rB : 0);
	 *
	 * It is possible to compute the result of mulxss from the result of
	 * mulxsu because the only difference between the results of these two
	 * opcodes is the value of the partial product associated with the sign
	 * bit of rB.
	 *
	 *   mulxss = mulxsu - ((rB < 0) ? rA : 0);
	 *
	 */

mul_immed:
	/* Opcode is muli.  Change it into mul for remainder of algorithm. */
	mov r6, r5		/* Field B is dest register, not field C. */
	mov r5, r4		/* Field IMM16 is src2, not field B. */
	movi r4, 0x27		/* OPX of mul is 0x27 */

multiply:
	/* Initialize the multiplication loop. */
	movi r9, 0	/* mul_product    = 0 */
	movi r10, 0	/* mulxuu_product = 0 */
	mov r11, r5	/* save original multiplier for mulxsu and mulxss */
	mov r12, r5	/* mulxuu_multiplier (will be shifted) */
	movi r16, 1	/* used to create "rori B,A,1" from "ror B,A,r16" */

	/* Now
	 * r3 = multiplicand
	 * r5 = mul_multiplier
	 * r6 = 4 * dest_register (used later as offset to sp)
	 * r7 = temp
	 * r9 = mul_product
	 * r10 = mulxuu_product
	 * r11 = original multiplier
	 * r12 = mulxuu_multiplier
	 * r14 = loop counter (already initialized)
	 * r16 = 1
	 */


	/*
	 *   for (count = 32; count > 0; --count)
	 *   {
	 */
multiply_loop:

	/*
	 *       mul_product <<= 1;
	 *       lsb = mulxuu_multiplier & 1;
	 */
	slli r9, r9, 1
	andi r7, r12, 1

	/*
	 *       if (lsb == 1)
	 *       {
	 *           mulxuu_product += multiplicand;
	 *       }
	 *
	 * When lsb == 0 the branch is taken with r7 == 0, so no carry is
	 * ORed into mulxuu_product further down.
	 */
	beq r7, zero, mulx_skip
	add r10, r10, r3
	cmpltu r7, r10, r3 /* Save the carry from the MSB of mulxuu_product. */
	ror r7, r7, r16	/* r7 = 0x80000000 on carry, or else 0x00000000 */
mulx_skip:

	/*
	 *       if (MSB of mul_multiplier == 1)
	 *       {
	 *           mul_product += multiplicand;
	 *       }
	 */
	bge r5, zero, mul_skip
	add r9, r9, r3
mul_skip:

	/*
	 *       mulxuu_product >>= 1;           logical shift
	 *       mul_multiplier <<= 1;           done with MSB
	 *       mulxuu_multiplier >>= 1;        done with LSB
	 */
	srli r10, r10, 1
	or r10, r10, r7		/* OR in the saved carry bit. */
	slli r5, r5, 1
	srli r12, r12, 1


	/*
	 *   }
	 */
	subi r14, r14, 1
	bne r14, zero, multiply_loop


	/*
	 *  Multiply emulation loop done.
	 */

	/* Now
	 * r3 = multiplicand
	 * r4 = OPX
	 * r6 = 4 * dest_register (used later as offset to sp)
	 * r7 = temp
	 * r9 = mul_product
	 * r10 = mulxuu_product
	 * r11 = original multiplier
	 */


	/* Calculate address for result from 4 * dest_register */
	add r6, r6, sp


	/*
	 * Select/compute the result based on OPX.
	 */


	/* OPX == mul?  Then store. */
	xori r7, r4, 0x27
	beq r7, zero, store_product

	/* It's one of the mulx.. opcodes.  Move over the result. */
	mov r9, r10

	/* OPX == mulxuu?  Then store. */
	xori r7, r4, 0x07
	beq r7, zero, store_product

	/* Compute mulxsu
	 *
	 * mulxsu = mulxuu - ((rA < 0) ? rB : 0);
	 */
	bge r3, zero, mulxsu_skip
	sub r9, r9, r11
mulxsu_skip:

	/* OPX == mulxsu?  Then store. */
	xori r7, r4, 0x17
	beq r7, zero, store_product

	/* Compute mulxss
	 *
	 * mulxss = mulxsu - ((rB < 0) ? rA : 0);
	 */
	bge r11,zero,mulxss_skip
	sub r9, r9, r3
mulxss_skip:
	/* At this point, assume that OPX is mulxss, so store. */


store_product:
	stw r9, 0(r6)


restore_registers:
			/* No need to restore r0. */
	ldw r5, 100(sp)		/* estatus was parked in the 100(sp) slot */
	wrctl estatus, r5

	ldw r1, 4(sp)
	ldw r2, 8(sp)
	ldw r3, 12(sp)
	ldw r4, 16(sp)
	ldw r5, 20(sp)
	ldw r6, 24(sp)
	ldw r7, 28(sp)
	ldw r8, 32(sp)
	ldw r9, 36(sp)
	ldw r10, 40(sp)
	ldw r11, 44(sp)
	ldw r12, 48(sp)
	ldw r13, 52(sp)
	ldw r14, 56(sp)
	ldw r15, 60(sp)
	ldw r16, 64(sp)
	ldw r17, 68(sp)
	ldw r18, 72(sp)
	ldw r19, 76(sp)
	ldw r20, 80(sp)
	ldw r21, 84(sp)
	ldw r22, 88(sp)
	ldw r23, 92(sp)
			/* Does not need to restore et */
	ldw gp, 104(sp)

	ldw fp, 112(sp)
	ldw ea, 116(sp)
	ldw ra, 124(sp)	/* r31 slot; picks up the result if ra was the dest */
	ldw sp, 108(sp)	/* last restore sp */
	eret
 | |
| 
 | |
| .set at
 | |
| .set break
 | 
