90 lines
		
	
	
		
			2.1 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
			
		
		
	
	
			90 lines
		
	
	
		
			2.1 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
| /*
 | |
|    Copyright 2003 Richard Curnow, SuperH (UK) Ltd.
 | |
| 
 | |
|    This file is subject to the terms and conditions of the GNU General Public
 | |
|    License.  See the file "COPYING" in the main directory of this archive
 | |
|    for more details.
 | |
| 
 | |
|    Tight version of memcpy for the case of just copying a page.
 | |
|    Prefetch strategy empirically optimised against RTL simulations
 | |
|    of SH5-101 cut2 eval chip with Cayman board DDR memory.
 | |
| 
 | |
|    Parameters:
 | |
|    r2 : destination effective address (start of page)
 | |
|    r3 : source effective address (start of page)
 | |
| 
 | |
|    Always copies 4096 bytes.
 | |
| 
 | |
|    Points to review.
 | |
|    * Currently the prefetch is 4 lines ahead and the alloco is 2 lines ahead.
 | |
|      It seems like the prefetch needs to be at least 4 lines ahead to get
 | |
|      the data into the cache in time, and the allocos contend with outstanding
 | |
|      prefetches for the same cache set, so it's better to have the numbers
 | |
|      different.
 | |
|    */
 | |
| 
 | |
| 	.section .text..SHmedia32,"ax"
 | |
| 	.little
 | |
| 
 | |
| 	.balign 8
 | |
| 	.global copy_page
 | |
| copy_page:
 | |
| 
 | |
| 	/* Copy 4096 bytes worth of data from r3 to r2.
 | |
| 	   Do prefetches 4 lines ahead (currently compiled out by the
 | |
| 	   "#if 0" blocks below).
 | |
| 	   Do alloco 2 lines ahead.
 | |
| 
 | |
| 	   Register use in this routine:
 | |
| 	     r2             running destination pointer, +32 bytes per pass
 | |
| 	     r6, r7, r8     initial r2 + 3968, + 4032, + 4096
 | |
| 	     r60, r61, r62, r23
 | |
| 	                    (r3 - r2) + 0, 8, 16, 24: index operands that turn
 | |
| 	                    the destination pointer into a source address
 | |
| 	     r22            (r3 - r2) + 0x80, read only by the disabled prefetch
 | |
| 	     r36..r39       the four quadwords moved on each pass
 | |
| 	     tr0            return target taken from r18
 | |
| 	     tr1, tr2, tr3  branch targets for local labels 1, 2 and 3
 | |
| 
 | |
| 	   r2 is modified and r3 is left untouched; nothing is saved or
 | |
| 	   restored here.
 | |
| 	   NOTE(review): whether r22, r23 and r60-r62 may be clobbered freely
 | |
| 	   depends on the platform ABI - confirm. */
 | |
| 
 | |
| 	pta 1f, tr1	/* tr1 -> label 1, top of the copy loop */
 | |
| 	pta 2f, tr2	/* tr2 -> label 2, only branched to by the disabled prefetch code */
 | |
| 	pta 3f, tr3	/* tr3 -> label 3, start of the load/store group */
 | |
| 	ptabs r18, tr0	/* tr0 = return target from r18, consumed by the final blink */
 | |
| 
 | |
| #if 0
 | |
| 	/* TAKum03020 */
 | |
| 	ld.q r3, 0x00, r63
 | |
| 	ld.q r3, 0x20, r63
 | |
| 	ld.q r3, 0x40, r63
 | |
| 	ld.q r3, 0x60, r63
 | |
| #endif
 | |
| 	alloco r2, 0x00	/* alloco the first two destination lines (offsets 0x00 and 0x20) up front */
 | |
| 	synco		! TAKum03020
 | |
| 	alloco r2, 0x20
 | |
| 	synco		! TAKum03020
 | |
| 
 | |
| 	movi 3968, r6	/* 3968 = 4096 - 4*32 */
 | |
| 	add  r2, r6, r6	/* r6 = r2 + 3968: bound for the (disabled) prefetch */
 | |
| 	addi r6, 64, r7	/* r7 = r2 + 4032: from here on the alloco is skipped */
 | |
| 	addi r7, 64, r8	/* r8 = r2 + 4096: loop end bound */
 | |
| 	sub r3, r2, r60	/* r60 = r3 - r2, so r2 + r60 is the matching source address */
 | |
| 	addi r60, 8, r61	/* r61 = r60 + 8 */
 | |
| 	addi r61, 8, r62	/* r62 = r60 + 16 */
 | |
| 	addi r62, 8, r23	/* r23 = r60 + 24 */
 | |
| 	addi r60, 0x80, r22	/* r22 = r60 + 0x80 (4 lines of 32 bytes ahead) */
 | |
| 
 | |
| /* Minimal code size.  The extra branches inside the loop don't cost much
 | |
|    because they overlap with the time spent waiting for prefetches to
 | |
|    complete. */
 | |
| 1:
 | |
| #if 0
 | |
| 	/* TAKum03020 */
 | |
| 	bge/u r2, r6, tr2  ! skip prefetch for last 4 lines
 | |
| 	ldx.q r2, r22, r63 ! prefetch 4 lines hence
 | |
| #endif
 | |
| 2:
 | |
| 	bge/u r2, r7, tr3  ! skip alloco for last 2 lines
 | |
| 	alloco r2, 0x40    ! alloc destination line 2 lines ahead
 | |
| 	synco		! TAKum03020
 | |
| 3:
 | |
| 	ldx.q r2, r60, r36	/* all four loads of the 32-byte line are issued before the first store */
 | |
| 	ldx.q r2, r61, r37
 | |
| 	ldx.q r2, r62, r38
 | |
| 	ldx.q r2, r23, r39
 | |
| 	st.q  r2,   0, r36	/* store the four quadwords at destination offsets 0, 8, 16, 24 */
 | |
| 	st.q  r2,   8, r37
 | |
| 	st.q  r2,  16, r38
 | |
| 	st.q  r2,  24, r39
 | |
| 	addi r2, 32, r2	/* step the destination pointer by one 32-byte line */
 | |
| 	bgt/l r8, r2, tr1	/* repeat while r2 is still below r8: 4096 / 32 = 128 passes */
 | |
| 
 | |
| 	blink tr0, r63	   ! return
 | 
