!   Copyright (C) 2008-2012 Imagination Technologies Ltd.

	.text
	.global	_memcpy
	.type	_memcpy,function
!-----------------------------------------------------------------------
! _memcpy - copy cnt bytes from src to dst and return dst.
!
! Arguments / result:
!   D1Ar1  dst   destination pointer
!   D0Ar2  src   source pointer
!   D1Ar3  cnt   number of bytes to copy
!   D0Re0        return value (the original dst)
!
! Working registers used by this routine:
!   A1.2   running source pointer
!   A0.2   running destination pointer
!   A0.3   copy of dst kept for the return value
!   D0Ar4  source mis-alignment in bytes (unaligned path only)
!   TXRPT  loop count for the BR-terminated loops
!   D0Re0, D1Re0, D0Ar6, D1Ar5, D0FrT, and (once the arguments have been
!   moved out) D0Ar2 and D1Ar1 are used as scratch.
!
! Strategy:
!   1. Fewer than 16 bytes: plain byte loop.
!   2. Otherwise copy single bytes until the destination is 8 byte aligned.
!   3. If the source is then 8 byte aligned too, copy 32 bytes per loop
!      iteration and finish any tail with the byte loop.
!   4. If the source is not aligned, round the source pointer down to an
!      8 byte boundary, read aligned 8 byte blocks and shift/merge them
!      into aligned 8 byte stores; finish any tail with the byte loop.
!
! NOTE(review): every loop here loads (iterations - 1) into TXRPT and ends
! with BR, i.e. BR is taken to be the hardware repeat branch driven by
! TXRPT - confirm the exact semantics against the Meta ISA documentation.
!-----------------------------------------------------------------------
_memcpy:
	CMP	D1Ar3, #16		! flags are consumed by the BGE below
	MOV	A1.2, D0Ar2		! source pointer
	MOV	A0.2, D1Ar1		! destination pointer
	MOV	A0.3, D1Ar1		! for return value
! If there are fewer than 16 bytes to copy use the byte copy loop
	BGE	$Llong_copy

$Lbyte_copy:
! Simply copy a byte at a time. D1Ar3 holds the number of bytes left; when
! it is zero the subtraction goes negative and the loop is skipped entirely.
	SUBS	TXRPT, D1Ar3, #1
	BLT	$Lend
$Lloop_byte:
	GETB	D1Re0, [A1.2++]
	SETB	[A0.2++], D1Re0
	BR	$Lloop_byte

$Lend:
! Finally set return value and return
	MOV	D0Re0, A0.3
	MOV	PC, D1RtP

$Llong_copy:
	ANDS	D1Ar5, D1Ar1, #7	! test destination alignment
	BZ	$Laligned_dst

! The destination address is not 8 byte aligned. We will copy bytes from
! the source to the destination until the remaining data has an 8 byte
! destination address alignment (i.e. we should never copy more than 7
! bytes here). D1Ar5 starts at (dst & 7) and counts up to 8.
$Lalign_dst:
	GETB	D0Re0, [A1.2++]
	ADD	D1Ar5, D1Ar5, #1	! dest is aligned when D1Ar5 reaches #8
	SUB	D1Ar3, D1Ar3, #1	! decrement count of remaining bytes
	SETB	[A0.2++], D0Re0
	CMP	D1Ar5, #8
	BNE	$Lalign_dst

! We have at least (16 - 7) = 9 bytes to copy - calculate the number of 8 byte
! blocks, then jump to the unaligned copy loop or fall through to the aligned
! copy loop as appropriate.
$Laligned_dst:
	MOV	D0Ar4, A1.2
	LSR	D1Ar5, D1Ar3, #3	! D1Ar5 = number of 8 byte blocks
	ANDS	D0Ar4, D0Ar4, #7	! test source alignment
	BNZ	$Lunaligned_copy	! if unaligned, use unaligned copy loop

! Both source and destination are 8 byte aligned - the easy case.
! If fewer than 32 bytes remain, everything is handed to the byte loop.
$Laligned_copy:
	LSRS	D1Ar5, D1Ar3, #5	! D1Ar5 = number of 32 byte blocks
	BZ	$Lbyte_copy
	SUB	TXRPT, D1Ar5, #1

! Each iteration moves 32 bytes as four 8 byte loads/stores. D1Ar5 is reused
! as a data register inside the loop; the block count already lives in TXRPT.
$Laligned_32:
	GETL	D0Re0, D1Re0, [A1.2++]
	GETL	D0Ar6, D1Ar5, [A1.2++]
	SETL	[A0.2++], D0Re0, D1Re0
	SETL	[A0.2++], D0Ar6, D1Ar5
	GETL	D0Re0, D1Re0, [A1.2++]
	GETL	D0Ar6, D1Ar5, [A1.2++]
	SETL	[A0.2++], D0Re0, D1Re0
	SETL	[A0.2++], D0Ar6, D1Ar5
	BR	$Laligned_32

! If there are any remaining bytes use the byte copy loop, otherwise we are done
	ANDS	D1Ar3, D1Ar3, #0x1f	! D1Ar3 = bytes left over (0..31)
	BNZ	$Lbyte_copy
	B	$Lend

! The destination is 8 byte aligned but the source is not, and there are 8
! or more bytes to be copied. On entry D1Ar5 = number of 8 byte blocks.
$Lunaligned_copy:
! Adjust the source pointer (A1.2) to the 8 byte boundary before its
! current value
	MOV	D0Ar4, A1.2
	MOV	D0Ar6, A1.2
	ANDMB	D0Ar4, D0Ar4, #0xfff8	! NOTE(review): presumably masks only the
					! low 16 bits, clearing bits 0-2 - confirm
	MOV	A1.2, D0Ar4
! Save the number of bytes of mis-alignment in D0Ar4 for use later
	SUBS	D0Ar6, D0Ar6, D0Ar4	! D0Ar6 = src - rounded-down src
	MOV	D0Ar4, D0Ar6
! if there is no mis-alignment after all, use the aligned copy loop
! (the BZ consumes the flags from the SUBS above; the only visible branch
! to $Lunaligned_copy is taken when (src & 7) != 0, so this looks like a
! defensive check)
	BZ	$Laligned_copy

! prefetch 8 bytes: the aligned block that contains the first source byte
	GETL	D0Re0, D1Re0, [A1.2]

! One loop iteration per 8 byte block. This must be done before D1Ar5 is
! reused below to hold a shift amount.
	SUB	TXRPT, D1Ar5, #1

! There are 3 mis-alignment cases to be considered. Less than 4 bytes, exactly
! 4 bytes, and more than 4 bytes.
	CMP	D0Ar6, #4
	BLT	$Lunaligned_1_2_3	! use 1-3 byte mis-alignment loop
	BZ	$Lunaligned_4		! use 4 byte mis-alignment loop

! The mis-alignment is more than 4 bytes
$Lunaligned_5_6_7:
	SUB	D0Ar6, D0Ar6, #4	! mis-alignment within the second word
! Calculate the bit offsets required for the shift operations necessary
! to align the data.
! D0Ar6 = bit offset, D1Ar5 = (32 - bit offset)
	MULW	D0Ar6, D0Ar6, #8
	MOV	D1Ar5, #32
	SUB	D1Ar5, D1Ar5, D0Ar6
! Move data 4 bytes before we enter the main loop: the first word of the
! prefetched block is not needed, so the second word becomes the carried
! word in D0Re0.
	MOV	D0Re0, D1Re0

! Loop invariant: D0Re0 holds the second word of the previous aligned block.
! NOTE(review): the shift directions here (and in $Lloop_1_2_3) imply
! little-endian byte order within each word - confirm for the target.
$Lloop_5_6_7:
	GETL	D0Ar2, D1Ar1, [++A1.2]	! advance, then load the next aligned block
! form 64-bit data in D0Re0, D1Re0
! first output word = (carried word >> offset) + (new first word << (32 - offset))
	LSR	D0Re0, D0Re0, D0Ar6
	MOV	D1Re0, D0Ar2
	LSL	D1Re0, D1Re0, D1Ar5
	ADD	D0Re0, D0Re0, D1Re0

! second output word = (new first word >> offset) + (new second word << (32 - offset))
	LSR	D0Ar2, D0Ar2, D0Ar6
	LSL	D1Re0, D1Ar1, D1Ar5
	ADD	D1Re0, D1Re0, D0Ar2

	SETL	[A0.2++], D0Re0, D1Re0
	MOV	D0Re0, D1Ar1		! new second word is carried to the next pass
	BR	$Lloop_5_6_7

	B	$Lunaligned_end

$Lunaligned_1_2_3:
! Calculate the bit offsets required for the shift operations necessary
! to align the data.
! D0Ar6 = bit offset, D1Ar5 = (32 - bit offset)
	MULW	D0Ar6, D0Ar6, #8
	MOV	D1Ar5, #32
	SUB	D1Ar5, D1Ar5, D0Ar6

! Loop invariant: D0Re0, D1Re0 hold the most recently loaded aligned block.
$Lloop_1_2_3:
! form 64-bit data in D0Re0,D1Re0
! first output word = (first word >> offset) + (second word << (32 - offset))
	LSR	D0Re0, D0Re0, D0Ar6
	LSL	D1Ar1, D1Re0, D1Ar5
	ADD	D0Re0, D0Re0, D1Ar1
! keep (second word >> offset) in D0FrT across the load of the next block
	MOV	D0Ar2, D1Re0
	LSR	D0FrT, D0Ar2, D0Ar6
	GETL	D0Ar2, D1Ar1, [++A1.2]	! advance, then load the next aligned block

! second output word = (next first word << (32 - offset)) + saved bits
	MOV	D1Re0, D0Ar2
	LSL	D1Re0, D1Re0, D1Ar5
	ADD	D1Re0, D1Re0, D0FrT

	SETL	[A0.2++], D0Re0, D1Re0
! the block just loaded becomes the current block for the next pass
	MOV	D0Re0, D0Ar2
	MOV	D1Re0, D1Ar1
	BR	$Lloop_1_2_3

	B	$Lunaligned_end

! The 4 byte mis-alignment case - this does not require any shifting, just a
! shuffling of registers.
$Lunaligned_4:
	MOV	D0Re0, D1Re0		! second prefetched word is the first output word
$Lloop_4:
	GETL	D0Ar2, D1Ar1, [++A1.2]	! advance, then load the next aligned block
	MOV	D1Re0, D0Ar2		! its first word completes the output pair
	SETL	[A0.2++], D0Re0, D1Re0
	MOV	D0Re0, D1Ar1		! its second word is carried to the next pass
	BR	$Lloop_4

$Lunaligned_end:
! If there are no remaining bytes to copy, we are done.
	ANDS	D1Ar3, D1Ar3, #7	! D1Ar3 = bytes left over (0..7)
	BZ	$Lend
! Re-adjust the source pointer (A1.2) back to the actual (unaligned) byte
! address of the remaining bytes, and fall through to the byte copy loop.
! A1.2 points at the last aligned block that was loaded; adding the saved
! mis-alignment (D0Ar4) gives the address of the first byte not yet copied.
	MOV	D0Ar6, A1.2
	ADD	D1Ar5, D0Ar4, D0Ar6
	MOV	A1.2, D1Ar5
	B	$Lbyte_copy

	.size _memcpy,.-_memcpy