/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/cpufeature.h>
#include <asm/dwarf2.h>
#include <asm/alternative-asm.h>

/*
 * We build a jump to memcpy_orig by default which gets NOPped out on
 * the majority of x86 CPUs which set REP_GOOD. In addition, on CPUs which
 * have the enhanced REP MOVSB/STOSB feature (ERMS), those NOPs are changed
 * to a jmp to memcpy_erms, which does the mem copy with REP; MOVSB.
 */

.weak memcpy

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 */
ENTRY(__memcpy)
ENTRY(memcpy)
	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memcpy_erms", X86_FEATURE_ERMS

	movq %rdi, %rax
	movq %rdx, %rcx
	shrq $3, %rcx
	andl $7, %edx
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
ENDPROC(memcpy)
ENDPROC(__memcpy)

/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. Use memcpy_erms when possible.
 */
ENTRY(memcpy_erms)
	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	ret
ENDPROC(memcpy_erms)

ENTRY(memcpy_orig)
	CFI_STARTPROC
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * We check whether a memory false dependence could occur,
	 * then jump to the corresponding copy mode.
	 */
	cmp %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx
.Lcopy_forward_loop:
	subq $0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae .Lcopy_forward_loop	/* flags are from the subq above; mov/lea do not touch them */
	addl $0x20, %edx
	jmp .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPs in the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 */
	addl $0x20, %edx
	subq %rdx, %rsi
	subq %rdx, %rdi
.Lhandle_tail:
	cmpl $16, %edx
	jb .Lless_16bytes

	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_16bytes:
	cmpl $8, %edx
	jb .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpl $4, %edx
	jb .Lless_3bytes

	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
	/*
	 * Move data from 1 byte to 3 bytes.
	 */
	movzbl (%rsi), %ecx
	jz .Lstore_1byte	/* ZF is still from the subl above (movzbl does not set flags): count was exactly 1 */
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	retq
	CFI_ENDPROC
ENDPROC(memcpy_orig)
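
/*
 * A minimal sketch of a hypothetical caller, using only the register
 * convention documented above (rdi = destination, rsi = source, rdx = count,
 * original destination returned in rax). The dst_buf/src_buf symbols and the
 * 64-byte count are assumptions made up for the illustration; the block is
 * kept inside this comment so it is never assembled.
 *
 *	leaq	dst_buf(%rip), %rdi	# destination buffer (hypothetical)
 *	leaq	src_buf(%rip), %rsi	# source buffer (hypothetical)
 *	movq	$64, %rdx		# number of bytes to copy
 *	call	memcpy			# on return, %rax holds the original destination
 */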