/*
   Copyright 2003 Richard Curnow, SuperH (UK) Ltd.

   This file is subject to the terms and conditions of the GNU General Public
   License.  See the file "COPYING" in the main directory of this archive
   for more details.

   Tight version of memcpy for the case of just copying a page.
   Prefetch strategy empirically optimised against RTL simulations
   of SH5-101 cut2 eval chip with Cayman board DDR memory.

   Parameters:
   r2  : destination effective address (start of page)
   r3  : source effective address (start of page)
   r18 : return address (loaded into tr0 and used by the final blink)

   Always copies 4096 bytes, 32 bytes (four quadwords) per loop iteration.
   The look-ahead distances below treat 32 bytes as one line
   (0x40 = 2 lines, 0x80 = 4 lines).

   Register usage (everything listed here is overwritten):
   r2      : running destination pointer; advanced by 32 per iteration and
             equal to destination + 4096 on return
   r6      : destination + 3968 (start of the last 4 lines); only read by
             the disabled prefetch code
   r7      : destination + 4032 (start of the last 2 lines)
   r8      : destination + 4096 (end of page, loop limit)
   r22     : (source - destination) + 0x80; only read by the disabled
             prefetch code
   r60, r61, r62, r23 :
             (source - destination) + 0, 8, 16, 24, so that an ldx.q
             indexed from r2 reads the source at the matching offset
   r36-r39 : the four quadwords of the line being copied
   tr0     : return target taken from r18
   tr1-tr3 : local labels 1, 2 and 3

   Points to review.
   * Currently the prefetch is 4 lines ahead and the alloco is 2 lines ahead.
     It seems like the prefetch needs to be at least 4 lines ahead to get
     the data into the cache in time, and the allocos contend with outstanding
     prefetches for the same cache set, so it's better to have the numbers
     different.
   * Note: both prefetch sequences are currently compiled out with "#if 0"
     (tagged TAKum03020), so only the alloco look-ahead is active.  The
     TAKum03020 tag is not explained in this file.
   */

	.section .text..SHmedia32,"ax"
	.little

	.balign 8
	.global copy_page
copy_page:

	/* Copy 4096 bytes worth of data from r3 to r2.
	   Do prefetches 4 lines ahead.
	   Do alloco 2 lines ahead */

	/* Set up the branch targets: the three loop labels and the return. */
	pta 1f, tr1
	pta 2f, tr2
	pta 3f, tr3
	ptabs r18, tr0

#if 0
	/* TAKum03020 */
	/* Disabled: initial loads of the first 4 source lines into r63. */
	ld.q r3, 0x00, r63
	ld.q r3, 0x20, r63
	ld.q r3, 0x40, r63
	ld.q r3, 0x60, r63
#endif
	/* Allocate the first two destination lines up front; the loop below
	   only ever allocates the line two ahead of the one being written. */
	alloco r2, 0x00
	synco		! TAKum03020
	alloco r2, 0x20
	synco		! TAKum03020

	/* Loop thresholds, all relative to the start of the destination. */
	movi 3968, r6		! 4096 - 4 lines
	add  r2, r6, r6		! r6 = destination + 3968
	addi r6, 64, r7		! r7 = destination + 4032
	addi r7, 64, r8		! r8 = destination + 4096 (end of page)

	/* Source-minus-destination offsets for the four quadwords of a line,
	   so that only r2 needs to be advanced inside the loop. */
	sub r3, r2, r60
	addi r60, 8, r61
	addi r61, 8, r62
	addi r62, 8, r23
	addi r60, 0x80, r22	! source offset 4 lines ahead (disabled prefetch)

/* Minimal code size.  The extra branches inside the loop don't cost much
   because they overlap with the time spent waiting for prefetches to
   complete. */
1:
#if 0
	/* TAKum03020 */
	bge/u r2, r6, tr2  ! skip prefetch for last 4 lines
	ldx.q r2, r22, r63 ! prefetch 4 lines hence
#endif
2:
	/* Skipping here keeps alloco from reaching past destination + 4096. */
	bge/u r2, r7, tr3  ! skip alloco for last 2 lines
	alloco r2, 0x40    ! alloc destination line 2 lines ahead
	synco		! TAKum03020
3:
	/* Copy one 32-byte line: four quadword loads, then four stores. */
	ldx.q r2, r60, r36
	ldx.q r2, r61, r37
	ldx.q r2, r62, r38
	ldx.q r2, r23, r39
	st.q  r2,   0, r36
	st.q  r2,   8, r37
	st.q  r2,  16, r38
	st.q  r2,  24, r39
	addi r2, 32, r2
	bgt/l r8, r2, tr1  ! loop while r2 is below the end of the page

	blink tr0, r63	   ! return