/*
 * arch/alpha/lib/ev6-stxncpy.S
 * 21264 version contributed by Rick Gorton <rick.gorton@api-networks.com>
 *
 * Copy no more than COUNT bytes of the null-terminated string from
 * SRC to DST.
 *
 * This is an internal routine used by strncpy, stpncpy, and strncat.
 * As such, it uses special linkage conventions to make implementation
 * of these public functions more efficient.
 *
 * On input:
 *	t9 = return address
 *	a0 = DST
 *	a1 = SRC
 *	a2 = COUNT
 *
 * Furthermore, COUNT may not be zero.
 *
 * On output:
 *	t0  = last word written
 *	t10 = bitmask (with one bit set) indicating the byte position of
 *	      the end of the range specified by COUNT
 *	t12 = bitmask (with one bit set) indicating the last byte written
 *	a0  = unaligned address of the last *word* written
 *	a2  = the number of full words left in COUNT
 *
 * Furthermore, v0, a3-a5, t11, and $at are untouched.
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * Try not to change the actual algorithm if possible for consistency.
 */

#include <asm/regdef.h>

	.set noat
	.set noreorder

	.text

/* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that
   doesn't like putting the entry point for a procedure somewhere in the
   middle of the procedure descriptor.  Work around this by putting the
   aligned copy in its own procedure descriptor */


	.ent stxncpy_aligned
	.align 4
stxncpy_aligned:
	.frame sp, 0, t9, 0
	.prologue 0

	/* On entry to this basic block:
	   t0 == the first destination word for masking back in
	   t1 == the first source word.  */

	/* Create the 1st output word and detect 0's in the 1st input word.  */
	lda	t2, -1		# E : build a mask against false zero
	mskqh	t2, a1, t2	# U : detection in the src word (stall)
	mskqh	t1, a1, t3	# U :
	ornot	t1, t2, t2	# E : (stall)

	mskql	t0, a1, t0	# U : assemble the first output word
	cmpbge	zero, t2, t8	# E : bits set iff null found
	or	t0, t3, t0	# E : (stall)
	beq	a2, $a_eoc	# U :

	bne	t8, $a_eos	# U :
	nop
	nop
	nop

	/* On entry to this basic block:
	   t0 == a source word not containing a null.  */

	/*
	 * nops here to:
	 *	separate store quads from load quads
	 *	limit of 1 bcond/quad to permit training
	 */
$a_loop:
	stq_u	t0, 0(a0)	# L :
	addq	a0, 8, a0	# E :
	subq	a2, 1, a2	# E :
	nop

	ldq_u	t0, 0(a1)	# L :
	addq	a1, 8, a1	# E :
	cmpbge	zero, t0, t8	# E :
	beq	a2, $a_eoc	# U :

	beq	t8, $a_loop	# U :
	nop
	nop
	nop

	/* Take care of the final (partial) word store.  At this point
	   the end-of-count bit is set in t8 iff it applies.

	   On entry to this basic block we have:
	   t0 == the source word containing the null
	   t8 == the cmpbge mask that found it.  */

$a_eos:
	negq	t8, t12		# E : find low bit set
	and	t8, t12, t12	# E : (stall)
	/* For the sake of the cache, don't read a destination word
	   if we're not going to need it.  */
	and	t12, 0x80, t6	# E : (stall)
	bne	t6, 1f		# U : (stall)

	/* We're doing a partial word store and so need to combine
	   our source and original destination words.  */
	ldq_u	t1, 0(a0)	# L :
	subq	t12, 1, t6	# E :
	or	t12, t6, t8	# E : (stall)
	zapnot	t0, t8, t0	# U : clear src bytes > null (stall)

	zap	t1, t8, t1	# .. e1 : clear dst bytes <= null
	or	t0, t1, t0	# e1 : (stall)
	nop
	nop

1:	stq_u	t0, 0(a0)	# L :
	ret	(t9)		# L0 : Latency=3
	nop
	nop

	/* Add the end-of-count bit to the eos detection bitmask.  */
$a_eoc:
	or	t10, t8, t8	# E :
	br	$a_eos		# L0 : Latency=3
	nop
	nop

	.end stxncpy_aligned

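/* For reference: the null detection throughout this file hinges on
   cmpbge against the zero register, which tests all eight bytes of a
   word at once.  A hedged C model of that byte test follows; the
   function is purely illustrative and does not exist in the kernel
   sources (unsigned long is 64 bits on Alpha).

	static unsigned long cmpbge_zero(unsigned long word)
	{
		unsigned long mask = 0;
		int i;

		for (i = 0; i < 8; i++)
			if (((word >> (i * 8)) & 0xff) == 0)
				mask |= 1UL << i;	// bit i set iff byte i is zero
		return mask;			// nonzero iff the word holds a null byte
	}

   The low bit of the mask marks the first (lowest-addressed) zero byte,
   which is why the tail code isolates it with the negq/and (x & -x)
   trick.  */
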
	.align 4
	.ent __stxncpy
	.globl __stxncpy
__stxncpy:
	.frame sp, 0, t9, 0
	.prologue 0

	/* Are source and destination co-aligned?  */
	xor	a0, a1, t1	# E :
	and	a0, 7, t0	# E : find dest misalignment
	and	t1, 7, t1	# E : (stall)
	addq	a2, t0, a2	# E : bias count by dest misalignment (stall)

	subq	a2, 1, a2	# E :
	and	a2, 7, t2	# E : (stall)
	srl	a2, 3, a2	# U : a2 = loop counter = (count - 1)/8 (stall)
	addq	zero, 1, t10	# E :

	sll	t10, t2, t10	# U : t10 = bitmask of last count byte
	bne	t1, $unaligned	# U :
	/* We are co-aligned; take care of a partial first word.  */
	ldq_u	t1, 0(a1)	# L : load first src word
	addq	a1, 8, a1	# E :

	beq	t0, stxncpy_aligned	# U : avoid loading dest word if not needed
	ldq_u	t0, 0(a0)	# L :
	nop
	nop

	br	stxncpy_aligned	# .. e1 :
	nop
	nop
	nop



/* The source and destination are not co-aligned.  Align the destination
   and cope.  We have to be very careful about not reading too much and
   causing a SEGV.  */

	.align 4
$u_head:
	/* We know just enough now to be able to assemble the first
	   full source word.  We can still find a zero at the end of it
	   that prevents us from outputting the whole thing.

	   On entry to this basic block:
	   t0 == the first dest word, unmasked
	   t1 == the shifted low bits of the first source word
	   t6 == bytemask that is -1 in dest word bytes */

	ldq_u	t2, 8(a1)	# L : Latency=3 load second src word
	addq	a1, 8, a1	# E :
	mskql	t0, a0, t0	# U : mask trailing garbage in dst
	extqh	t2, a1, t4	# U : (3 cycle stall on t2)

	or	t1, t4, t1	# E : first aligned src word complete (stall)
	mskqh	t1, a0, t1	# U : mask leading garbage in src (stall)
	or	t0, t1, t0	# E : first output word complete (stall)
	or	t0, t6, t6	# E : mask original data for zero test (stall)

	cmpbge	zero, t6, t8	# E :
	beq	a2, $u_eocfin	# U :
	lda	t6, -1		# E :
	nop

	bne	t8, $u_final	# U :
	mskql	t6, a1, t6	# U : mask out bits already seen
	stq_u	t0, 0(a0)	# L : store first output word
	or	t6, t2, t2	# E : (stall)

	cmpbge	zero, t2, t8	# E : find nulls in second partial
	addq	a0, 8, a0	# E :
	subq	a2, 1, a2	# E :
	bne	t8, $u_late_head_exit	# U :

	/* Finally, we've got all the stupid leading edge cases taken care
	   of and we can set up to enter the main loop.  */
	extql	t2, a1, t1	# U : position hi-bits of lo word
	beq	a2, $u_eoc	# U :
	ldq_u	t2, 8(a1)	# L : read next high-order source word
	addq	a1, 8, a1	# E :

	extqh	t2, a1, t0	# U : position lo-bits of hi word (stall)
	cmpbge	zero, t2, t8	# E :
	nop
	bne	t8, $u_eos	# U :

	/* Unaligned copy main loop.  In order to avoid reading too much,
	   the loop is structured to detect zeros in aligned source words.
	   This has, unfortunately, effectively pulled half of a loop
	   iteration out into the head and half into the tail, but it does
	   prevent nastiness from accumulating in the very thing we want
	   to run as fast as possible.

	   On entry to this basic block:
	   t0 == the shifted low-order bits from the current source word
	   t1 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word

	   We further know that t2 does not contain a null terminator.  */

	.align 4
$u_loop:
	or	t0, t1, t0	# E : current dst word now complete
	subq	a2, 1, a2	# E : decrement word count
	extql	t2, a1, t1	# U : extract low bits for next time
	addq	a0, 8, a0	# E :

	stq_u	t0, -8(a0)	# U : save the current word
	beq	a2, $u_eoc	# U :
	ldq_u	t2, 8(a1)	# U : Latency=3 load high word for next time
	addq	a1, 8, a1	# E :

	extqh	t2, a1, t0	# U : extract low bits (2 cycle stall)
	cmpbge	zero, t2, t8	# E : test new word for eos
	nop
	beq	t8, $u_loop	# U :

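	/* For reference: each trip around $u_loop above assembles one
	   aligned output word from two adjacent aligned source words, in
	   the spirit of the classic Alpha unaligned-access idiom.  A
	   hedged C sketch of the extql/extqh/or combination (illustrative
	   only; little-endian, s is eight times the relative byte
	   misalignment and is assumed nonzero here):

		out = (src_prev >> s) | (src_next << (64 - s));

	   where src_prev and src_next are consecutive aligned source
	   words.  */
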
	/* We've found a zero somewhere in the source word we just read.
	   If it resides in the lower half, we have one (probably partial)
	   word to write out, and if it resides in the upper half, we
	   have one full and one partial word left to write out.

	   On entry to this basic block:
	   t0 == the shifted low-order bits from the current source word
	   t1 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word.  */
$u_eos:
	or	t0, t1, t0	# E : first (partial) source word complete
	nop
	cmpbge	zero, t0, t8	# E : is the null in this first bit? (stall)
	bne	t8, $u_final	# U : (stall)

	stq_u	t0, 0(a0)	# L : the null was in the high-order bits
	addq	a0, 8, a0	# E :
	subq	a2, 1, a2	# E :
	nop

$u_late_head_exit:
	extql	t2, a1, t0	# U :
	cmpbge	zero, t0, t8	# E :
	or	t8, t10, t6	# E : (stall)
	cmoveq	a2, t6, t8	# E : Latency=2, extra map slot (stall)

	/* Take care of a final (probably partial) result word.
	   On entry to this basic block:
	   t0 == assembled source word
	   t8 == cmpbge mask that found the null.  */
$u_final:
	negq	t8, t6		# E : isolate low bit set
	and	t6, t8, t12	# E : (stall)
	and	t12, 0x80, t6	# E : avoid dest word load if we can (stall)
	bne	t6, 1f		# U : (stall)

	ldq_u	t1, 0(a0)	# L :
	subq	t12, 1, t6	# E :
	or	t6, t12, t8	# E : (stall)
	zapnot	t0, t8, t0	# U : kill source bytes > null

	zap	t1, t8, t1	# U : kill dest bytes <= null
	or	t0, t1, t0	# E : (stall)
	nop
	nop

1:	stq_u	t0, 0(a0)	# L :
	ret	(t9)		# L0 : Latency=3

	/* Got to end-of-count before end of string.
	   On entry to this basic block:
	   t1 == the shifted high-order bits from the previous source word  */
$u_eoc:
	and	a1, 7, t6	# E : avoid final load if possible
	sll	t10, t6, t6	# U : (stall)
	and	t6, 0xff, t6	# E : (stall)
	bne	t6, 1f		# U : (stall)

	ldq_u	t2, 8(a1)	# L : load final src word
	nop
	extqh	t2, a1, t0	# U : extract low bits for last word (stall)
	or	t1, t0, t1	# E : (stall)

1:	cmpbge	zero, t1, t8	# E :
	mov	t1, t0		# E :

$u_eocfin:			# end-of-count, final word
	or	t10, t8, t8	# E :
	br	$u_final	# L0 : Latency=3

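	/* Note on the end-of-count handling above: t10 (set up at the
	   __stxncpy entry as 1 << ((biased count - 1) & 7)) carries a
	   single bit marking the cmpbge position of the last byte
	   permitted by COUNT, so OR-ing t10 into a cmpbge result makes the
	   count limit look like a null terminator to the shared
	   $u_final/$a_eos tail code.  */
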
	/* Unaligned copy entry point.  */
	.align 4
$unaligned:

	ldq_u	t1, 0(a1)	# L : load first source word
	and	a0, 7, t4	# E : find dest misalignment
	and	a1, 7, t5	# E : find src misalignment
	/* Conditionally load the first destination word and a bytemask
	   with 0xff indicating that the destination byte is sacrosanct.  */
	mov	zero, t0	# E :

	mov	zero, t6	# E :
	beq	t4, 1f		# U :
	ldq_u	t0, 0(a0)	# L :
	lda	t6, -1		# E :

	mskql	t6, a0, t6	# U :
	nop
	nop
	subq	a1, t4, a1	# E : sub dest misalignment from src addr

	/* If source misalignment is larger than dest misalignment, we need
	   extra startup checks to avoid SEGV.  */

1:	cmplt	t4, t5, t12	# E :
	extql	t1, a1, t1	# U : shift src into place
	lda	t2, -1		# E : for creating masks later
	beq	t12, $u_head	# U : (stall)

	extql	t2, a1, t2	# U :
	cmpbge	zero, t1, t8	# E : is there a zero?
	andnot	t2, t6, t2	# E : dest mask for a single word copy
	or	t8, t10, t5	# E : test for end-of-count too

	cmpbge	zero, t2, t3	# E :
	cmoveq	a2, t5, t8	# E : Latency=2, extra map slot
	nop			# E : keep with cmoveq
	andnot	t8, t3, t8	# E : (stall)

	beq	t8, $u_head	# U :
	/* At this point we've found a zero in the first partial word of
	   the source.  We need to isolate the valid source data and mask
	   it into the original destination data.  (Incidentally, we know
	   that we'll need at least one byte of that original dest word.) */
	ldq_u	t0, 0(a0)	# L :
	negq	t8, t6		# E : build bitmask of bytes <= zero
	mskqh	t1, t4, t1	# U :

	and	t6, t8, t12	# E :
	subq	t12, 1, t6	# E : (stall)
	or	t6, t12, t8	# E : (stall)
	zapnot	t2, t8, t2	# U : prepare source word; mirror changes (stall)

	zapnot	t1, t8, t1	# U : to source validity mask
	andnot	t0, t2, t0	# E : zero place for source to reside
	or	t0, t1, t0	# E : and put it there (stall both t0, t1)
	stq_u	t0, 0(a0)	# L : (stall)

	ret	(t9)		# L0 : Latency=3
	nop
	nop
	nop

	.end __stxncpy
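
/* For reference, the byte-level behaviour described in the header comment
   amounts roughly to the following hedged C sketch (the function name is
   hypothetical and the register-level interface documented above is not
   modelled here):

	static void stxncpy_ref(char *dst, const char *src, unsigned long count)
	{
		while (count--) {
			char c = *src++;
			*dst++ = c;		// the terminating null, if reached
			if (c == '\0')		// within COUNT, is copied as well
				break;
		}
	}

   The real routine additionally reports in registers where the copy
   stopped, so that strncpy, stpncpy, and strncat can finish their work
   without rescanning the string.  */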