1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
/*
 * Register and predicate aliases used by the memcpy routine in this file.
 *
 * Many aliases deliberately name the SAME physical register (for example
 * p0 has six names, R8 has three, R15 and R14 each have three counting the
 * halves of `inc`).  Two aliases of one register can never be live at the
 * same time, so check the sharing below before extending a value's lifetime.
 *
 * NOTE(review): every line in this listing starts with a decimal line
 * number; that looks like extraction residue that has to be stripped before
 * the file can be assembled -- confirm against the original source.
 */
137 #define ptr_out R0 /* destination pointer; advanced by the stores and rewound before each return */
138 #define ptr_in R1 /* source pointer; address operand of every data load */
139 #define len R2 /* byte count; also the amount subtracted from ptr_out to rewind it */
140
141 #define data70 R13:12 /* first (lower-address) source dword fed to valignb */
142 #define dataF8 R11:10 /* second (next) source dword fed to valignb */
143 #define ldata0 R7:6 /* realigned dword waiting to be stored */
144 #define ldata1 R25:24 /* second pending dword, used only in .Loword_loop_25to31; overlaps dalign (R25) */
145 #define data1 R7 /* upper word of ldata0; not referenced by the code in this file section */
146 #define data0 R6 /* lower word of ldata0; source of the byte/halfword/word stores */
147
148 #define ifbyte p0 /* tstbit result gating a single-byte store */
149 #define ifhword p0 /* tstbit result gating a halfword store */
150 #define ifword p0 /* tstbit result gating a word store */
151 #define noprolog p0 /* (prolog & 7) == 0: skip the sub-dword prolog stores */
152 #define nokernel p1 /* kernel == 0: no 32-byte block to run */
153 #define noepilog p0 /* epilog == 0: nothing left after the kernel */
154 #define align p2 /* predicate copy of offset, used as the byte-shift operand of valignb */
155 #define kernel1 p0 /* kernel > 1: more than one 32-byte block still to go */
156
157 #define dalign R25 /* (ptr_out - ptr_in) & 31; selects the kernel loop variant */
158 #define star3 R16 /* prolog & 7: sub-dword byte count of the prolog */
159 #define rest R8 /* countdown gating loads in the prolog loop; later the initial kernel count */
160 #define back R7 /* scratch: first cl0(len), then len + offset */
161 #define epilog R3 /* bytes left for the epilog, initially (len - prolog) & 31 */
162 #define inc R15:14 /* combine(#32, #-1): per-iteration increment applied to R5:4 by vaddw */
163 #define kernel R4 /* number of 32-byte blocks, (len - prolog) >> 5 */
164 #define ptr_in_p_128 R5 /* dcfetch (prefetch) pointer running ahead of ptr_in in 32-byte steps */
165 #define mask R8 /* constant 0x7fffffff used to bound the prolog size */
166 #define shift R8 /* prolog byte count << 3; its bits 3..5 are tested for byte/half/word */
167 #define shift2 R5 /* epilog byte count << 3; source of the epilog shift amounts */
168 #define prolog R15 /* prolog size: first in bytes, later converted to a dword loop count */
169 #define epilogdws R15 /* epilog >> 3: dword count of the epilog loop */
170 #define shiftb R14 /* bit count by which ldata0 is shifted right after a partial store */
171 #define offset R9 /* ptr_in & 7: byte offset of the source inside its dword */
172 #define ptr_out_p_32 R17 /* address handed to dczeroa for the destination */
173 #define align888 R14 /* ptr_in | ptr_out | len; low 3 bits clear means everything is 8-aligned */
174 #define len8 R9 /* len >> 3: iteration count of the aligned dword copy */
175 #define over R20 /* (ptr_in + len) & 7: bytes by which the source end passes a dword boundary */
176
177 #define ptr_in_p_128kernel R5:4 /* R5 (prefetch pointer) and R4 (kernel count) updated together by vaddw */
178
/*
 * memcpy -- copy `len` (R2) bytes from `ptr_in` (R1) to `ptr_out` (R0).
 *
 * Every exit either leaves ptr_out untouched or subtracts from it the amount
 * it was advanced, so R0 holds the original destination address on return.
 *
 * Paths, in the order they are tested:
 *   1. len == 0 or ptr_in == ptr_out        -> return immediately.
 *   2. ptr_in, ptr_out and len all multiples
 *      of 8 and len <= 95                   -> .Ldwordaligned (plain dword loop).
 *   3. len <= 23                            -> .Lbytes23orless (byte loop).
 *   4. everything else                      -> general path below:
 *        prolog : byte/halfword/word stores, then a dword loop;
 *        kernel : 32 bytes per iteration with dcfetch on the source and
 *                 dczeroa on the destination;
 *        epilog : dword loop, then word/halfword/byte stores.
 *      Only this path allocates a frame; it saves R17:16, R25:24 and R21:20
 *      and points r31 at .Lmemcpy_return so that every `jumpr r31` inside it
 *      lands on the restore sequence.
 *
 * Reading hints:
 *   - Instructions inside one { } packet all read the register values from
 *     BEFORE the packet; only operands written `.new` see a predicate
 *     produced in the same packet.
 *   - `p3 = sp1loop0(...)` starts hardware loop 0 with p3 false during the
 *     first pass and true afterwards, which is how the loops skip the store
 *     of a value that has not been produced yet.
 *   - Source data is always loaded as aligned dwords; valignb extracts the
 *     wanted 8 bytes from the pair data70/dataF8 using `align`.
 *
 * NOTE(review): every line in this listing starts with a decimal line
 * number; that looks like extraction residue that has to be stripped before
 * the file can be assembled -- confirm against the original source.
 */
179 .section .text
180 .p2align 4
181 .global memcpy
182 .type memcpy, @function
183 memcpy:
184 {
185 p2 = cmp.eq(len, #0); /* nothing to copy? */
186 align888 = or(ptr_in, ptr_out); /* start collecting alignment bits of src and dst */
187 p0 = cmp.gtu(len, #23); /* p0: long enough for the general path */
188 p1 = cmp.eq(ptr_in, ptr_out); /* source and destination are the same address? */
189 }
190 {
191 p1 = or(p2, p1); /* p1: len == 0 or src == dst, i.e. no work */
192 p3 = cmp.gtu(len, #95); /* p3: too long for the simple dword loop */
193 align888 = or(align888, len); /* now src | dst | len */
194 len8 = lsr(len, #3); /* dword count for .Ldwordaligned */
195 }
196 {
197 dcfetch(ptr_in); /* prefetch the start of the source */
198 p2 = bitsclr(align888, #7); /* p2: src, dst and len are all multiples of 8 */
199 if(p1) jumpr r31; /* no work: return with R0 untouched */
200 }
201 {
202 p2 = and(p2,!p3); /* fully 8-aligned AND len <= 95 */
203 if (p2.new) len = add(len, #-8); /* the last dword is stored without post-increment, so rewind by len - 8 */
204 if (p2.new) jump:NT .Ldwordaligned;
205 }
206 {
207 if(!p0) jump .Lbytes23orless; /* len <= 23: byte loop (it reads neither mask nor prolog) */
208 mask.l = #LO(0x7fffffff);
209
210 prolog = sub(#0, ptr_out); /* -dst: low bits give the distance to the next boundary */
211 }
212 {
213
214 allocframe(#24); /* general path only: room for three register pairs */
215 mask.h = #HI(0x7fffffff);
216 ptr_in_p_128 = add(ptr_in, #32); /* prefetch pointer starts one 32-byte step ahead */
217 back = cl0(len); /* leading-zero count of len */
218 }
219 {
220 memd(sp+#0) = R17:16; /* save the pair holding ptr_out_p_32 / star3 */
221 r31.l = #LO(.Lmemcpy_return); /* redirect every later `jumpr r31` to the restore code */
222 prolog &= lsr(mask, back); /* keep only the bits below len's top set bit, so prolog < len */
223 offset = and(ptr_in, #7); /* source byte offset inside its dword */
224 }
225 {
226 memd(sp+#8) = R25:24; /* store reads the old R25:24; dalign below overwrites R25 */
227 dalign = sub(ptr_out, ptr_in); /* relative alignment of dst to src (masked to 5 bits later) */
228 r31.h = #HI(.Lmemcpy_return);
229 }
230 {
231
232 over = add(len, ptr_in); /* source end address (masked to 3 bits later) */
233 back = add(len, offset); /* bytes spanned counting from the aligned source dword */
234 memd(sp+#16) = R21:20; /* save the pair holding over */
235 }
236 {
237 noprolog = bitsclr(prolog, #7); /* no sub-dword bytes in the prolog? */
238 prolog = and(prolog, #31); /* prolog byte count, at most 31 */
239 dcfetch(ptr_in_p_128);
240 ptr_in_p_128 = add(ptr_in_p_128, #32);
241 }
242 {
243 kernel = sub(len, prolog); /* bytes left after the prolog */
244 shift = asl(prolog, #3); /* prolog bytes * 8; bits 3..5 mirror bits 0..2 of the byte count */
245 star3 = and(prolog, #7); /* sub-dword part of the prolog */
246 ptr_in = and(ptr_in, #-8); /* round the source down: all loads are aligned dwords */
247 }
248 {
249 prolog = lsr(prolog, #3); /* prolog is now a dword count */
250 epilog = and(kernel, #31); /* bytes left after the 32-byte blocks */
251 ptr_out_p_32 = add(ptr_out, prolog); /* reads prolog as a BYTE count (value before this packet) */
252 over = and(over, #7); /* bytes by which the source end passes a dword boundary */
253 }
254 {
255 p3 = cmp.gtu(back, #8); /* data reaches into a second source dword? */
256 kernel = lsr(kernel, #5); /* number of 32-byte blocks */
257 dcfetch(ptr_in_p_128);
258 ptr_in_p_128 = add(ptr_in_p_128, #32);
259 }
260 {
261 p1 = cmp.eq(prolog, #0);
262 if(!p1.new) prolog = add(prolog, #1); /* one extra pass: the sp1loop0 below stores nothing on its first pass */
263 dcfetch(ptr_in_p_128);
264 ptr_in_p_128 = add(ptr_in_p_128, #32);
265 }
266 {
267 nokernel = cmp.eq(kernel,#0);
268 dcfetch(ptr_in_p_128);
269 ptr_in_p_128 = add(ptr_in_p_128, #32);
270 shiftb = and(shift, #8); /* 8 if the prolog stores a single byte, else 0 */
271 }
272 {
273 dcfetch(ptr_in_p_128);
274 ptr_in_p_128 = add(ptr_in_p_128, #32);
275 if(nokernel) jump .Lskip64; /* no 32-byte block: skip the dczeroa set-up */
276 p2 = cmp.eq(kernel, #1);
277 }
278 {
279 dczeroa(ptr_out_p_32); /* zero-allocate the first destination line of the kernel */
280
281 if(!p2) ptr_out_p_32 = add(ptr_out_p_32, #32); /* advance only when there is more than one block */
282 }
283 {
284 dalign = and(dalign, #31);
285 dczeroa(ptr_out_p_32); /* second line, or the same line again when kernel == 1 */
286 }
287 .Lskip64:
288 {
289 data70 = memd(ptr_in++#16); /* first source dword; ptr_in steps past both initial dwords */
290 if(p3) dataF8 = memd(ptr_in+#8); /* second source dword (address uses ptr_in from before this packet) */
291 if(noprolog) jump .Lnoprolog32; /* no byte/halfword/word part: go straight to the dword loop */
292 align = offset; /* load the valignb shift predicate from the source offset */
293 }
294
/* Sub-dword part of the prolog: up to one byte, one halfword and one word. */
295 {
296 ldata0 = valignb(dataF8, data70, align); /* first 8 source bytes, realigned */
297 ifbyte = tstbit(shift,#3); /* bit 0 of the prolog byte count */
298 offset = add(offset, star3); /* source offset once these bytes are consumed */
299 }
300 {
301 if(ifbyte) memb(ptr_out++#1) = data0;
302 ldata0 = lsr(ldata0, shiftb); /* drop the byte just stored (shiftb is 0 if none) */
303 shiftb = and(shift, #16); /* 16 if a halfword follows, else 0 */
304 ifhword = tstbit(shift,#4); /* bit 1 of the prolog byte count */
305 }
306 {
307 if(ifhword) memh(ptr_out++#2) = data0;
308 ldata0 = lsr(ldata0, shiftb); /* drop the halfword just stored */
309 ifword = tstbit(shift,#5); /* bit 2 of the prolog byte count */
310 p2 = cmp.gtu(offset, #7); /* did the new offset run into the next source dword? */
311 }
312 {
313 if(ifword) memw(ptr_out++#4) = data0;
314 if(p2) data70 = dataF8; /* if so, slide the two-dword window forward ... */
315 if(p2) dataF8 = memd(ptr_in++#8); /* ... and fetch the next source dword */
316 align = offset; /* new shift predicate; the if(p2) above still saw the compare result */
317 }
318 .Lnoprolog32:
/* Dword part of the prolog, software pipelined: the store lags the valignb by one pass. */
319 {
320 p3 = sp1loop0(.Ldword_loop_prolog, prolog) /* p3 is false during the first pass */
321 rest = sub(len, star3); /* byte countdown that gates further source loads */
322 p0 = cmp.gt(over, #0); /* source end not on a dword boundary? */
323 }
324 if(p0) rest = add(rest, #16); /* then allow for the extra partial source dword */
325 .Ldword_loop_prolog:
326 {
327 if(p3) memd(ptr_out++#8) = ldata0; /* store the dword produced by the previous pass */
328 ldata0 = valignb(dataF8, data70, align);
329 p0 = cmp.gt(rest, #16); /* enough data left to load another dword? */
330 }
331 {
332 data70 = dataF8; /* slide the window */
333 if(p0) dataF8 = memd(ptr_in++#8);
334 rest = add(rest, #-8);
335 }:endloop0
336 .Lkernel:
337 {
338
339 p3 = cmp.gtu(kernel, #0);
340
341 if(p3.new) kernel = add(kernel, #-1); /* hand the last 32-byte block over ... */
342
343 if(p3.new) epilog = add(epilog, #32); /* ... to the epilog */
344 }
345 {
346 nokernel = cmp.eq(kernel, #0); /* nothing left for the 32-byte loops? */
347 if(nokernel.new) jump:NT .Lepilog;
348 inc = combine(#32, #-1); /* R15 = 32, R14 = -1: step for prefetch pointer and block count */
349 p3 = cmp.gtu(dalign, #24);
350 }
351 {
/* NOTE(review): dalign > 24 branches to the block whose loop is named
 * _00to24, while the fall-through loop is named _25to31; the names look
 * inverted relative to this test -- confirm which is intended. */
352 if(p3) jump .Lodd_alignment;
353 }
354 {
355 loop0(.Loword_loop_25to31, kernel);
356 kernel1 = cmp.gtu(kernel, #1);
357 rest = kernel; /* remember the initial count to recognise the first pass */
358 }
359 .falign
/* 32 bytes per pass; two pending dwords (ldata0, ldata1) alternate, so each store lags its valignb. */
360 .Loword_loop_25to31:
361 {
362 dcfetch(ptr_in_p_128); /* prefetch source ahead of the loads */
363 if(kernel1) ptr_out_p_32 = add(ptr_out_p_32, #32); /* next destination line while more than one block remains */
364 }
365 {
366 dczeroa(ptr_out_p_32); /* zero-allocate that destination line */
367 p3 = cmp.eq(kernel, rest); /* true only on the first pass (kernel is decremented below) */
368 }
369 {
370
371 ptr_in_p_128kernel = vaddw(ptr_in_p_128kernel, inc); /* prefetch pointer += 32, kernel -= 1 */
372
373 if(!p3) memd(ptr_out++#8) = ldata1; /* ldata1 holds nothing valid on the first pass */
374 ldata1 = valignb(dataF8, data70, align);
375 data70 = memd(ptr_in++#8);
376 }
377 {
378 memd(ptr_out++#8) = ldata0;
379 ldata0 = valignb(data70, dataF8, align); /* operands swapped: data70 now holds the newer dword */
380 dataF8 = memd(ptr_in++#8);
381 }
382 {
383 memd(ptr_out++#8) = ldata1;
384 ldata1 = valignb(dataF8, data70, align);
385 data70 = memd(ptr_in++#8);
386 }
387 {
388 memd(ptr_out++#8) = ldata0;
389 ldata0 = valignb(data70, dataF8, align);
390 dataF8 = memd(ptr_in++#8);
391 kernel1 = cmp.gtu(kernel, #1); /* re-evaluate with the decremented count */
392 }:endloop0
393 {
394 memd(ptr_out++#8) = ldata1; /* flush the dword still pending in ldata1 */
395 jump .Lepilog;
396 }
397 .Lodd_alignment:
398 {
399 loop0(.Loword_loop_00to24, kernel);
400 kernel1 = cmp.gtu(kernel, #1);
401 rest = add(kernel, #-1); /* rest is written here but not read inside this loop */
402 }
403 .falign
/* 32 bytes per pass using ldata0 only: each packet stores the previous dword and produces the next. */
404 .Loword_loop_00to24:
405 {
406 dcfetch(ptr_in_p_128);
407 ptr_in_p_128kernel = vaddw(ptr_in_p_128kernel, inc); /* prefetch pointer += 32, kernel -= 1 */
408 if(kernel1) ptr_out_p_32 = add(ptr_out_p_32, #32);
409 }
410 {
411 dczeroa(ptr_out_p_32);
412 }
413 {
414 memd(ptr_out++#8) = ldata0;
415 ldata0 = valignb(dataF8, data70, align);
416 data70 = memd(ptr_in++#8);
417 }
418 {
419 memd(ptr_out++#8) = ldata0;
420 ldata0 = valignb(data70, dataF8, align);
421 dataF8 = memd(ptr_in++#8);
422 }
423 {
424 memd(ptr_out++#8) = ldata0;
425 ldata0 = valignb(dataF8, data70, align);
426 data70 = memd(ptr_in++#8);
427 }
428 {
429 memd(ptr_out++#8) = ldata0;
430 ldata0 = valignb(data70, dataF8, align);
431 dataF8 = memd(ptr_in++#8);
432 kernel1 = cmp.gtu(kernel, #1);
433 }:endloop0
434 .Lepilog:
435 {
436 noepilog = cmp.eq(epilog,#0);
437 epilogdws = lsr(epilog, #3); /* dwords in the epilog */
438 kernel = and(epilog, #7); /* kernel (R4) is reused for the sub-dword byte count */
439 }
440 {
441 if(noepilog) jumpr r31; /* done: r31 points at .Lmemcpy_return on this path */
442 if(noepilog) ptr_out = sub(ptr_out, len); /* rewind R0 to the original destination */
443 p3 = cmp.eq(epilogdws, #0);
444 shift2 = asl(epilog, #3); /* epilog bytes * 8; its bits 5 and 4 mirror bits 2 and 1 of the count */
445 }
446 {
447 shiftb = and(shift2, #32); /* 32 if a word will be stored, else 0 */
448 ifword = tstbit(epilog,#2);
449 if(p3) jump .Lepilog60; /* no whole dword: only word/halfword/byte remain */
450 if(!p3) epilog = add(epilog, #-16); /* bias the countdown; multiples of 8 leave bits 0..2 intact for the tstbits below */
451 }
452 {
453 loop0(.Ldword_loop_epilog, epilogdws);
454
455 p3 = cmp.eq(kernel, #0);
456 if(p3.new) kernel= #8; /* load threshold when the epilog has no sub-dword bytes */
457 p1 = cmp.gt(over, #0); /* source end not on a dword boundary? */
458 }
459
460 if(p1) kernel= #0; /* then use threshold 0 so the partial last source dword is loaded */
461 .Ldword_loop_epilog:
462 {
463 memd(ptr_out++#8) = ldata0; /* store the pending dword */
464 ldata0 = valignb(dataF8, data70, align);
465 p3 = cmp.gt(epilog, kernel); /* is another source dword still needed? */
466 }
467 {
468 data70 = dataF8; /* slide the window */
469 if(p3) dataF8 = memd(ptr_in++#8);
470 epilog = add(epilog, #-8);
471 }:endloop0
472
473 .Lepilog60:
/* Sub-dword tail: up to one word, one halfword and one byte taken from ldata0. */
474 {
475 if(ifword) memw(ptr_out++#4) = data0;
476 ldata0 = lsr(ldata0, shiftb); /* drop the word just stored (shiftb is 0 if none) */
477 ifhword = tstbit(epilog,#1);
478 shiftb = and(shift2, #16); /* 16 if a halfword will be stored, else 0 */
479 }
480 {
481 if(ifhword) memh(ptr_out++#2) = data0;
482 ldata0 = lsr(ldata0, shiftb); /* drop the halfword just stored */
483 ifbyte = tstbit(epilog,#0);
484 if(ifbyte.new) len = add(len, #-1); /* the byte store below does not advance ptr_out, so rewind one less */
485 }
486 {
487 if(ifbyte) memb(ptr_out) = data0;
488 ptr_out = sub(ptr_out, len); /* rewind R0 to the original destination */
489 jumpr r31; /* to .Lmemcpy_return */
490 }
491
492 .Lbytes23orless:
/* Byte-by-byte copy for len <= 23; no frame was allocated, so r31 is still the caller's return address. */
493 {
494 p3 = sp1loop0(.Lbyte_copy, len); /* p3 is false during the first pass */
495 len = add(len, #-1); /* only len - 1 stores post-increment ptr_out */
496 }
497 .Lbyte_copy:
498 {
499 data0 = memb(ptr_in++#1);
500 if(p3) memb(ptr_out++#1) = data0; /* stores the byte loaded by the PREVIOUS pass */
501 }:endloop0
502 {
503 memb(ptr_out) = data0; /* flush the last byte loaded */
504 ptr_out = sub(ptr_out, len); /* rewind R0 to the original destination */
505 jumpr r31;
506 }
507
508 .Ldwordaligned:
/* Plain dword copy for fully 8-aligned requests with len <= 95; len was already reduced by 8 and no frame was allocated. */
509 {
510 p3 = sp1loop0(.Ldword_copy, len8); /* p3 is false during the first pass */
511 }
512 .Ldword_copy:
513 {
514 if(p3) memd(ptr_out++#8) = ldata0; /* stores the dword loaded by the PREVIOUS pass */
515 ldata0 = memd(ptr_in++#8);
516 }:endloop0
517 {
518 memd(ptr_out) = ldata0; /* flush the last dword loaded */
519 ptr_out = sub(ptr_out, len); /* rewind R0 (len holds the original length minus 8) */
520 jumpr r31;
521 }
522 .Lmemcpy_return:
/* Common exit of the general path: reload the three saved pairs, drop the frame, return to the real caller. */
523 r21:20 = memd(sp+#16);
524 {
525 r25:24 = memd(sp+#8);
526 r17:16 = memd(sp+#0);
527 }
528 deallocframe; /* undoes allocframe(#24), bringing back the caller's r31 */
529 jumpr r31