1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30 #include <asm/export.h>
31 .set noat # the assembler may not use $at ($28) as a hidden temporary
32 .set noreorder # keep instructions in the hand-scheduled order written below
33 .text
34 .globl memset # alias of ___memset, assigned at the end of the file
35 .globl __memset # alias of ___memset, assigned at the end of the file
36 .globl ___memset # byte fill: replicates the low byte of $17
37 .globl __memset16 # 16-bit fill: replicates the low 16 bits of $17
38 .globl __constant_c_memset # fill with the 64-bit pattern in $17 as given
39
40 .ent ___memset
41 .align 5
42 ___memset:
43 .frame $30,0,$26,0
44 .prologue 0
45 # ___memset: fill $18 bytes starting at $16 with the low byte of $17.
46 # In: $16 = destination, $17 = fill value (bits 7:0 used), $18 = byte count.
47 # Out: $0 = original destination. A signed count <= 0 stores nothing.
48 # Scratch: $1-$7; $16, $17 and $18 are also overwritten along the way.
49 # Flow: replicate the byte across a quadword, patch a leading partial quad,
50 # store whole quads (64-byte unrolled with wh64 once 16 or more remain),
51 # then merge the trailing 1..7 bytes into the final quad.
52 # The E/U/L letters in the comments look like issue-slot tags -- TODO confirm.
53 and $17,255,$1 # E : 00000000000000ch
54 insbl $17,1,$2 # U : 000000000000ch00
55 bis $16,$16,$0 # E : return value
56 ble $18,end_b # U : zero (or negative) length requested?
57
58 addq $18,$16,$6 # E : max address to write to (dest + count, one past the last byte)
59 bis $1,$2,$17 # E : 000000000000chch
60 insbl $1,2,$3 # U : 0000000000ch0000
61 insbl $1,3,$4 # U : 00000000ch000000
62
63 or $3,$4,$3 # E : 00000000chch0000
64 inswl $17,4,$5 # U : 0000chch00000000
65 xor $16,$6,$1 # E : will complete write be within one quadword?
66 inswl $17,6,$2 # U : chch000000000000
67
68 or $17,$3,$17 # E : 00000000chchchch
69 or $2,$5,$2 # E : chchchch00000000
70 bic $1,7,$1 # E : fit within a single quadword?
71 and $16,7,$3 # E : Target addr misalignment
72
73 or $17,$2,$17 # E : chchchchchchchch
74 beq $1,within_quad_b # U : start and end address share one quadword
75 nop # E :
76 beq $3,aligned_b # U : target is 0mod8
77 # Misaligned start, and the fill runs at least to the end of the first quad:
78 # read-modify-write that quad, keeping the bytes below the start offset,
79 # then continue from the next 8-byte boundary.
80
81 ldq_u $4,0($16) # L : Fetch first partial
82 bis $16,$16,$5 # E : Save the address
83 insql $17,$16,$2 # U : Insert new bytes
84 subq $3,8,$3 # E : Invert (for addressing uses): $3 = misalignment - 8
85
86 addq $18,$3,$18 # E : $18 is new count ($3 is negative)
87 mskql $4,$16,$4 # U : clear relevant parts of the quad
88 subq $16,$3,$16 # E : $16 is new aligned destination
89 bis $2,$4,$1 # E : Final bytes
90
91 nop
92 stq_u $1,0($5) # L : Store result
93 nop
94 nop
95
96 .align 4
97 aligned_b:
98 # $16 is 8-byte aligned here and $18 holds the bytes still to fill.
99 # Split $18 into whole quads ($3) and trailing bytes (new $18);
100 # $5 becomes the running store pointer.
101
102
103 sra $18,3,$3 # U : Number of remaining quads to write
104 and $18,7,$18 # E : Number of trailing bytes to write
105 bis $16,$16,$5 # E : Save dest address
106 beq $3,no_quad_b # U : tail stuff only
107 # Fewer than 16 quads: use the simple one-quad-per-trip loop at loop_b.
108 # Otherwise head for the 64-byte unrolled loop, which needs $5 to be
109 # 64-byte aligned first.
110 # $1 = ($16 & 63) - 64 is a negative byte count that $alignmod64_b
111 # raises by 8 per stored quad until it reaches zero.
112
113
114
115
116
117
118
119 and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop)
120 subq $3, 16, $4 # E : Only try to unroll if > 128 bytes
121 subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64)
122 blt $4, loop_b # U : fewer than 16 quads, skip the unrolled path
123 # At least 16 quads remain, so at least one unrolled trip is left even
124 # after the alignment loop has used up to 8 of them.
125 # NOTE(review): $1 lies in -64..-1 here, so the beq below appears to be
126 # never taken; an already 64-byte-aligned $5 still makes 8 trips through
127 # $alignmod64_b. That loop is also the only place $4 gets its initial
128 # wh64 address -- confirm before changing the branch.
129
130 nop # E :
131 nop # E :
132 nop # E :
133 beq $1, $bigalign_b # U :
134
135 $alignmod64_b:
136 stq $17, 0($5) # L : store one quad
137 subq $3, 1, $3 # E : For consistency later
138 addq $1, 8, $1 # E : Increment towards zero for alignment
139 addq $5, 8, $4 # E : Initial wh64 address (filler instruction)
140
141 nop
142 nop
143 addq $5, 8, $5 # E : Inc address
144 blt $1, $alignmod64_b # U : until $5 reaches a 64-byte boundary
145
146 $bigalign_b:
147 # After the alignment loop: $5 is 64-byte aligned, $4 == $5 and $3 >= 8.
148 # Each trip of $do_wh64_b stores 8 quads (64 bytes) and issues one wh64.
149 # The wh64 address for the following trip is chosen during this trip:
150 #   $5 + 128 when at least 24 quads remain at the top of this trip
151 #            (the block one ahead of the one the following trip writes),
152 #   $5 + 64  otherwise (the block the following trip writes itself).
153 # Either way the hinted 64-byte block is one the loop goes on to store
154 # in full. Presumably that is what makes the hint safe -- check the
155 # wh64 description in the Alpha architecture manual.
156 # The loop repeats while at least 8 quads remain after the current trip.
157
158
159
160 $do_wh64_b:
161 wh64 ($4) # L1 : memory subsystem write hint
162 subq $3, 24, $2 # E : For determining future wh64 addresses
163 stq $17, 0($5) # L :
164 nop # E :
165
166 addq $5, 128, $4 # E : speculative target of next wh64
167 stq $17, 8($5) # L :
168 stq $17, 16($5) # L :
169 addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
170
171 stq $17, 24($5) # L :
172 stq $17, 32($5) # L :
173 cmovlt $2, $7, $4 # E : Latency 2, extra mapping cycle
174 nop
175
176 stq $17, 40($5) # L :
177 stq $17, 48($5) # L :
178 subq $3, 16, $2 # E : Repeat the loop at least once more?
179 nop
180
181 stq $17, 56($5) # L :
182 addq $5, 64, $5 # E : advance the store pointer by one 64-byte block
183 subq $3, 8, $3 # E : eight quads written this trip
184 bge $2, $do_wh64_b # U : loop while 16 or more quads were left at the top
185
186 nop
187 nop
188 nop
189 beq $3, no_quad_b # U : Might have finished already
190
191 .align 4
192 # One quad per trip; $3 is at least 1 whenever this loop is entered.
193
194
195
196 loop_b:
197 stq $17,0($5) # L :
198 subq $3,1,$3 # E : Decrement number quads left
199 addq $5,8,$5 # E : Inc address
200 bne $3,loop_b # U : more?
201
202 no_quad_b:
203 # Tail: $18 = bytes left (0..7), $5 = aligned address of the final quad,
204 # and the low 3 bits of the end address $6 equal $18. Keep the old bytes at
205 # and above that offset (mskqh) and place fill bytes below it (insqh).
206 nop # E :
207 beq $18,end_b # U : All done?
208 ldq $7,0($5) # L : current contents of the final quad
209 mskqh $7,$6,$2 # U : Mask final quad
210
211 insqh $17,$6,$4 # U : New bits
212 bis $2,$4,$1 # E : Put it all together
213 stq $1,0($5) # L : And back to memory
214 ret $31,($26),1 # L0 :
215
216 within_quad_b:
217 ldq_u $1,0($16) # L : load the quad that contains both start and end
218 insql $17,$16,$2 # U : New bits
219 mskql $1,$16,$4 # U : Clear old
220 bis $2,$4,$2 # E : New result
221
222 mskql $2,$6,$4 # U : keep only the bytes below the end offset
223 mskqh $1,$6,$2 # U : old bytes from the end offset upward
224 bis $2,$4,$1 # E : merge the two halves
225 stq_u $1,0($16) # L : store the patched quad, then fall into end_b
226
227 end_b:
228 nop
229 nop
230 nop
231 ret $31,($26),1 # L0 :
232 .end ___memset
233 EXPORT_SYMBOL(___memset)
234
235
236
237
238
239
240 .align 4
241 .ent __constant_c_memset
242 __constant_c_memset:
243 .frame $30,0,$26,0
244 .prologue 0
245 # __constant_c_memset: fill $18 bytes at $16 with the 64-bit pattern in $17, used as given. In: $16 dest, $17 pattern, $18 bytes. Out: $0 = original dest.
246 addq $18,$16,$6 # E : max address to write to (dest + count, one past the last byte)
247 bis $16,$16,$0 # E : return value
248 xor $16,$6,$1 # E : will complete write be within one quadword?
249 ble $18,end # U : zero (or negative) length requested?
250 # Past this point $18 > 0. Presumably callers pass $17 with the fill byte already replicated -- verify against callers. Scratch: $1-$7.
251 bic $1,7,$1 # E : fit within a single quadword
252 beq $1,within_one_quad # U : start and end address share one quadword
253 and $16,7,$3 # E : Target addr misalignment
254 beq $3,aligned # U : target is 0mod8
255 # Misaligned start, and the fill runs at least to the end of the first quad:
256 # read-modify-write that quad, keeping the bytes below the start offset,
257 # then continue from the next 8-byte boundary.
258
259 ldq_u $4,0($16) # L : Fetch first partial
260 bis $16,$16,$5 # E : Save the address
261 insql $17,$16,$2 # U : Insert new bytes
262 subq $3,8,$3 # E : Invert (for addressing uses): $3 = misalignment - 8
263
264 addq $18,$3,$18 # E : $18 is new count ($3 is negative)
265 mskql $4,$16,$4 # U : clear relevant parts of the quad
266 subq $16,$3,$16 # E : $16 is new aligned destination
267 bis $2,$4,$1 # E : Final bytes
268
269 nop
270 stq_u $1,0($5) # L : Store result
271 nop
272 nop
273
274 .align 4
275 aligned:
276 # $16 is 8-byte aligned here and $18 holds the bytes still to fill.
277 # Split $18 into whole quads ($3) and trailing bytes (new $18);
278 # $5 becomes the running store pointer.
279
280
281 sra $18,3,$3 # U : Number of remaining quads to write
282 and $18,7,$18 # E : Number of trailing bytes to write
283 bis $16,$16,$5 # E : Save dest address
284 beq $3,no_quad # U : tail stuff only
285 # Fewer than 16 quads: use the simple one-quad-per-trip loop at loop.
286 # Otherwise head for the 64-byte unrolled loop, which needs $5 to be
287 # 64-byte aligned first.
288 # $1 = ($16 & 63) - 64 is a negative byte count that $alignmod64
289 # raises by 8 per stored quad until it reaches zero.
290
291
292
293
294
295
296
297 and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop)
298 subq $3, 16, $4 # E : Only try to unroll if > 128 bytes
299 subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64)
300 blt $4, loop # U : fewer than 16 quads, skip the unrolled path
301 # At least 16 quads remain, so at least one unrolled trip is left even
302 # after the alignment loop has used up to 8 of them.
303 # NOTE(review): $1 lies in -64..-1 here, so the beq below appears to be
304 # never taken; an already 64-byte-aligned $5 still makes 8 trips through
305 # $alignmod64. That loop is also the only place $4 gets its initial
306 # wh64 address -- confirm before changing the branch.
307
308 nop # E :
309 nop # E :
310 nop # E :
311 beq $1, $bigalign # U :
312
313 $alignmod64:
314 stq $17, 0($5) # L : store one quad
315 subq $3, 1, $3 # E : For consistency later
316 addq $1, 8, $1 # E : Increment towards zero for alignment
317 addq $5, 8, $4 # E : Initial wh64 address (filler instruction)
318
319 nop
320 nop
321 addq $5, 8, $5 # E : Inc address
322 blt $1, $alignmod64 # U : until $5 reaches a 64-byte boundary
323
324 $bigalign:
325 # After the alignment loop: $5 is 64-byte aligned, $4 == $5 and $3 >= 8.
326 # Each trip of $do_wh64 stores 8 quads (64 bytes) and issues one wh64.
327 # The wh64 address for the following trip is chosen during this trip:
328 #   $5 + 128 when at least 24 quads remain at the top of this trip
329 #            (the block one ahead of the one the following trip writes),
330 #   $5 + 64  otherwise (the block the following trip writes itself).
331 # Either way the hinted 64-byte block is one the loop goes on to store
332 # in full. Presumably that is what makes the hint safe -- check the
333 # wh64 description in the Alpha architecture manual.
334 # The loop repeats while at least 8 quads remain after the current trip.
335
336
337
338 $do_wh64:
339 wh64 ($4) # L1 : memory subsystem write hint
340 subq $3, 24, $2 # E : For determining future wh64 addresses
341 stq $17, 0($5) # L :
342 nop # E :
343
344 addq $5, 128, $4 # E : speculative target of next wh64
345 stq $17, 8($5) # L :
346 stq $17, 16($5) # L :
347 addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
348
349 stq $17, 24($5) # L :
350 stq $17, 32($5) # L :
351 cmovlt $2, $7, $4 # E : Latency 2, extra mapping cycle
352 nop
353
354 stq $17, 40($5) # L :
355 stq $17, 48($5) # L :
356 subq $3, 16, $2 # E : Repeat the loop at least once more?
357 nop
358
359 stq $17, 56($5) # L :
360 addq $5, 64, $5 # E : advance the store pointer by one 64-byte block
361 subq $3, 8, $3 # E : eight quads written this trip
362 bge $2, $do_wh64 # U : loop while 16 or more quads were left at the top
363
364 nop
365 nop
366 nop
367 beq $3, no_quad # U : Might have finished already
368
369 .align 4
370 # One quad per trip; $3 is at least 1 whenever this loop is entered.
371
372
373
374 loop:
375 stq $17,0($5) # L :
376 subq $3,1,$3 # E : Decrement number quads left
377 addq $5,8,$5 # E : Inc address
378 bne $3,loop # U : more?
379
380 no_quad:
381 # Tail: $18 = bytes left (0..7), $5 = aligned address of the final quad,
382 # and the low 3 bits of the end address $6 equal $18. Keep the old bytes at
383 # and above that offset (mskqh) and place pattern bytes below it (insqh).
384 nop # E :
385 beq $18,end # U : All done?
386 ldq $7,0($5) # L : current contents of the final quad
387 mskqh $7,$6,$2 # U : Mask final quad
388
389 insqh $17,$6,$4 # U : New bits
390 bis $2,$4,$1 # E : Put it all together
391 stq $1,0($5) # L : And back to memory
392 ret $31,($26),1 # L0 :
393
394 within_one_quad:
395 ldq_u $1,0($16) # L : load the quad that contains both start and end
396 insql $17,$16,$2 # U : New bits
397 mskql $1,$16,$4 # U : Clear old
398 bis $2,$4,$2 # E : New result
399
400 mskql $2,$6,$4 # U : keep only the bytes below the end offset
401 mskqh $1,$6,$2 # U : old bytes from the end offset upward
402 bis $2,$4,$1 # E : merge the two halves
403 stq_u $1,0($16) # L : store the patched quad, then fall into end
404
405 end:
406 nop
407 nop
408 nop
409 ret $31,($26),1 # L0 :
410 .end __constant_c_memset
411 EXPORT_SYMBOL(__constant_c_memset)
412
413
414
415
416
417 .align 5
418 .ent __memset16
419 # __memset16: fill $18 bytes at $16 with the low 16 bits of $17 repeated. Out: $0 = original dest.
420 __memset16:
421 .frame $30,0,$26,0
422 .prologue 0
423 # In: $16 = dest, $17 = 16-bit value, $18 = count in bytes (it is added to $16 as a byte offset). Scratch: $1-$7.
424 inswl $17,0,$5 # U : 000000000000c1c2
425 inswl $17,2,$2 # U : 00000000c1c20000
426 bis $16,$16,$0 # E : return value
427 addq $18,$16,$6 # E : max address to write to (dest + count, one past the last byte)
428 # NOTE(review): the 16-bit pattern stays in phase only when $16 is 2-byte aligned (and the tail only when the count is even) -- presumably guaranteed by callers, TODO confirm.
429 ble $18, end_w # U : zero (or negative) length requested?
430 inswl $17,4,$3 # U : 0000c1c200000000
431 inswl $17,6,$4 # U : c1c2000000000000
432 xor $16,$6,$1 # E : will complete write be within one quadword?
433
434 or $2,$5,$2 # E : 00000000c1c2c1c2
435 or $3,$4,$17 # E : c1c2c1c200000000
436 bic $1,7,$1 # E : fit within a single quadword
437 and $16,7,$3 # E : Target addr misalignment
438
439 or $17,$2,$17 # E : c1c2c1c2c1c2c1c2
440 beq $1,within_quad_w # U : start and end address share one quadword
441 nop
442 beq $3,aligned_w # U : target is 0mod8
443 # Misaligned start, and the fill runs at least to the end of the first quad:
444 # read-modify-write that quad, keeping the bytes below the start offset,
445 # then continue from the next 8-byte boundary.
446
447 ldq_u $4,0($16) # L : Fetch first partial
448 bis $16,$16,$5 # E : Save the address
449 insql $17,$16,$2 # U : Insert new bytes
450 subq $3,8,$3 # E : Invert (for addressing uses): $3 = misalignment - 8
451
452 addq $18,$3,$18 # E : $18 is new count ($3 is negative)
453 mskql $4,$16,$4 # U : clear relevant parts of the quad
454 subq $16,$3,$16 # E : $16 is new aligned destination
455 bis $2,$4,$1 # E : Final bytes
456
457 nop
458 stq_u $1,0($5) # L : Store result
459 nop
460 nop
461
462 .align 4
463 aligned_w:
464 # $16 is 8-byte aligned here and $18 holds the bytes still to fill.
465 # Split $18 into whole quads ($3) and trailing bytes (new $18);
466 # $5 becomes the running store pointer.
467
468
469 sra $18,3,$3 # U : Number of remaining quads to write
470 and $18,7,$18 # E : Number of trailing bytes to write
471 bis $16,$16,$5 # E : Save dest address
472 beq $3,no_quad_w # U : tail stuff only
473 # Fewer than 16 quads: use the simple one-quad-per-trip loop at loop_w.
474 # Otherwise head for the 64-byte unrolled loop, which needs $5 to be
475 # 64-byte aligned first.
476 # $1 = ($16 & 63) - 64 is a negative byte count that $alignmod64_w
477 # raises by 8 per stored quad until it reaches zero.
478
479
480
481
482
483
484
485 and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop)
486 subq $3, 16, $4 # E : Only try to unroll if > 128 bytes
487 subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64)
488 blt $4, loop_w # U : fewer than 16 quads, skip the unrolled path
489 # At least 16 quads remain, so at least one unrolled trip is left even
490 # after the alignment loop has used up to 8 of them.
491 # NOTE(review): $1 lies in -64..-1 here, so the beq below appears to be
492 # never taken; an already 64-byte-aligned $5 still makes 8 trips through
493 # $alignmod64_w. That loop is also the only place $4 gets its initial
494 # wh64 address -- confirm before changing the branch.
495
496 nop # E :
497 nop # E :
498 nop # E :
499 beq $1, $bigalign_w # U :
500
501 $alignmod64_w:
502 stq $17, 0($5) # L : store one quad
503 subq $3, 1, $3 # E : For consistency later
504 addq $1, 8, $1 # E : Increment towards zero for alignment
505 addq $5, 8, $4 # E : Initial wh64 address (filler instruction)
506
507 nop
508 nop
509 addq $5, 8, $5 # E : Inc address
510 blt $1, $alignmod64_w # U : until $5 reaches a 64-byte boundary
511
512 $bigalign_w:
513 # After the alignment loop: $5 is 64-byte aligned, $4 == $5 and $3 >= 8.
514 # Each trip of $do_wh64_w stores 8 quads (64 bytes) and issues one wh64.
515 # The wh64 address for the following trip is chosen during this trip:
516 #   $5 + 128 when at least 24 quads remain at the top of this trip
517 #            (the block one ahead of the one the following trip writes),
518 #   $5 + 64  otherwise (the block the following trip writes itself).
519 # Either way the hinted 64-byte block is one the loop goes on to store
520 # in full. Presumably that is what makes the hint safe -- check the
521 # wh64 description in the Alpha architecture manual.
522 # The loop repeats while at least 8 quads remain after the current trip.
523
524
525
526 $do_wh64_w:
527 wh64 ($4) # L1 : memory subsystem write hint
528 subq $3, 24, $2 # E : For determining future wh64 addresses
529 stq $17, 0($5) # L :
530 nop # E :
531
532 addq $5, 128, $4 # E : speculative target of next wh64
533 stq $17, 8($5) # L :
534 stq $17, 16($5) # L :
535 addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
536
537 stq $17, 24($5) # L :
538 stq $17, 32($5) # L :
539 cmovlt $2, $7, $4 # E : Latency 2, extra mapping cycle
540 nop
541
542 stq $17, 40($5) # L :
543 stq $17, 48($5) # L :
544 subq $3, 16, $2 # E : Repeat the loop at least once more?
545 nop
546
547 stq $17, 56($5) # L :
548 addq $5, 64, $5 # E : advance the store pointer by one 64-byte block
549 subq $3, 8, $3 # E : eight quads written this trip
550 bge $2, $do_wh64_w # U : loop while 16 or more quads were left at the top
551
552 nop
553 nop
554 nop
555 beq $3, no_quad_w # U : Might have finished already
556
557 .align 4
558 # One quad per trip; $3 is at least 1 whenever this loop is entered.
559
560
561
562 loop_w:
563 stq $17,0($5) # L :
564 subq $3,1,$3 # E : Decrement number quads left
565 addq $5,8,$5 # E : Inc address
566 bne $3,loop_w # U : more?
567
568 no_quad_w:
569 # Tail: $18 = bytes left (0..7), $5 = aligned address of the final quad,
570 # and the low 3 bits of the end address $6 equal $18. Keep the old bytes at
571 # and above that offset (mskqh) and place pattern bytes below it (insqh).
572 nop # E :
573 beq $18,end_w # U : All done?
574 ldq $7,0($5) # L : current contents of the final quad
575 mskqh $7,$6,$2 # U : Mask final quad
576
577 insqh $17,$6,$4 # U : New bits
578 bis $2,$4,$1 # E : Put it all together
579 stq $1,0($5) # L : And back to memory
580 ret $31,($26),1 # L0 :
581
582 within_quad_w:
583 ldq_u $1,0($16) # L : load the quad that contains both start and end
584 insql $17,$16,$2 # U : New bits
585 mskql $1,$16,$4 # U : Clear old
586 bis $2,$4,$2 # E : New result
587
588 mskql $2,$6,$4 # U : keep only the bytes below the end offset
589 mskqh $1,$6,$2 # U : old bytes from the end offset upward
590 bis $2,$4,$1 # E : merge the two halves
591 stq_u $1,0($16) # L : store the patched quad, then fall into end_w
592
593 end_w:
594 nop
595 nop
596 nop
597 ret $31,($26),1 # L0 :
598
599 .end __memset16
600 EXPORT_SYMBOL(__memset16)
601
602 memset = ___memset # memset is a second name for the ___memset entry point
603 __memset = ___memset # and so is __memset; all three share one body
604 EXPORT_SYMBOL(memset)
605 EXPORT_SYMBOL(__memset)