1 
   2 
   3 
   4 
   5 
   6 
   7 
   8 
   9 
  10 #include <linux/linkage.h>
  11 
/*
 * Register aliases used throughout this file.
 *
 * PASS0_SUMS..PASS3_SUMS: one accumulator per NH pass.  Each holds two
 * 64-bit partial sums that are only folded together at the very end of
 * nh_sse2.
 */
#define         PASS0_SUMS      %xmm0
#define         PASS1_SUMS      %xmm1
#define         PASS2_SUMS      %xmm2
#define         PASS3_SUMS      %xmm3
/*
 * K0..K3: sliding window of four consecutive 16-byte key strides.  The
 * roles of these registers rotate from one _nh_stride invocation to the
 * next, so the oldest stride is always the one that gets overwritten.
 */
#define         K0              %xmm4
#define         K1              %xmm5
#define         K2              %xmm6
#define         K3              %xmm7
/* T0..T7: scratch registers. */
#define         T0              %xmm8
#define         T1              %xmm9
#define         T2              %xmm10
#define         T3              %xmm11
#define         T4              %xmm12
#define         T5              %xmm13
#define         T6              %xmm14
#define         T7              %xmm15
/*
 * Function arguments of nh_sse2.  These are the first four integer
 * argument registers of the System V AMD64 calling convention.
 */
#define         KEY             %rdi
#define         MESSAGE         %rsi
#define         MESSAGE_LEN     %rdx
#define         HASH            %rcx
  32 
/*
 * _nh_stride k0, k1, k2, k3, offset
 *
 * Process one 16-byte message stride for all four NH passes at once.
 *
 *   k0, k1, k2: XMM registers that already hold the three key strides
 *               preceding the one loaded here
 *   k3:         XMM register that receives the 16 key bytes at offset(KEY)
 *   offset:     byte displacement applied to both MESSAGE and KEY
 *
 * Writing the 32-bit message words as m0..m3 and the 32-bit words of key
 * register ki as w0..w3, the two 64-bit lanes of PASSi_SUMS advance by
 *
 *   lane 0: (m0 + w0) * (m2 + w2)
 *   lane 1: (m1 + w1) * (m3 + w3)
 *
 * The additions wrap at 32 bits (paddd); the products are full 64-bit
 * values (pmuludq) and the accumulation wraps at 64 bits (paddq).
 *
 * Clobbers T1-T7 and k0.  k1, k2 and the freshly loaded k3 are only used
 * as sources and stay intact, so the caller can pass them as k0, k1, k2
 * of the next invocation.  KEY and MESSAGE are not modified.
 */
.macro _nh_stride       k0, k1, k2, k3, offset

        /* Load the next 16 bytes of message (no alignment assumed) */
        movdqu          \offset(MESSAGE), T1

        /* Load the next 16 bytes of key into the free slot of the window */
        movdqu          \offset(KEY), \k3

        /* Add the message words to the key words of each pass (mod 2^32) */
        movdqa          T1, T2
        movdqa          T1, T3
        paddd           T1, \k0    /* sum goes into k0 itself: k0 is dead after this stride, saves a copy */
        paddd           \k1, T1
        paddd           \k2, T2
        paddd           \k3, T3

        /*
         * Multiply 32x32 => 64 and accumulate.  pmuludq only multiplies
         * dwords 0 and 2 of its operands, so the sums are shuffled first:
         * $0x10 places words 0 and 1 into dword positions 0 and 2, and
         * $0x32 places words 2 and 3 there.  The multiply then yields
         * (word0 * word2, word1 * word3) as two 64-bit products.
         */
        pshufd          $0x10, \k0, T4
        pshufd          $0x32, \k0, \k0
        pshufd          $0x10, T1, T5
        pshufd          $0x32, T1, T1
        pshufd          $0x10, T2, T6
        pshufd          $0x32, T2, T2
        pshufd          $0x10, T3, T7
        pshufd          $0x32, T3, T3
        pmuludq         T4, \k0
        pmuludq         T5, T1
        pmuludq         T6, T2
        pmuludq         T7, T3
        paddq           \k0, PASS0_SUMS
        paddq           T1, PASS1_SUMS
        paddq           T2, PASS2_SUMS
        paddq           T3, PASS3_SUMS
.endm
  67 
  68 
  69 
  70 
  71 
  72 
  73 
  74 ENTRY(nh_sse2)
  75 
  76         movdqu          0x00(KEY), K0
  77         movdqu          0x10(KEY), K1
  78         movdqu          0x20(KEY), K2
  79         add             $0x30, KEY
  80         pxor            PASS0_SUMS, PASS0_SUMS
  81         pxor            PASS1_SUMS, PASS1_SUMS
  82         pxor            PASS2_SUMS, PASS2_SUMS
  83         pxor            PASS3_SUMS, PASS3_SUMS
  84 
  85         sub             $0x40, MESSAGE_LEN
  86         jl              .Lloop4_done
  87 .Lloop4:
  88         _nh_stride      K0, K1, K2, K3, 0x00
  89         _nh_stride      K1, K2, K3, K0, 0x10
  90         _nh_stride      K2, K3, K0, K1, 0x20
  91         _nh_stride      K3, K0, K1, K2, 0x30
  92         add             $0x40, KEY
  93         add             $0x40, MESSAGE
  94         sub             $0x40, MESSAGE_LEN
  95         jge             .Lloop4
  96 
  97 .Lloop4_done:
  98         and             $0x3f, MESSAGE_LEN
  99         jz              .Ldone
 100         _nh_stride      K0, K1, K2, K3, 0x00
 101 
 102         sub             $0x10, MESSAGE_LEN
 103         jz              .Ldone
 104         _nh_stride      K1, K2, K3, K0, 0x10
 105 
 106         sub             $0x10, MESSAGE_LEN
 107         jz              .Ldone
 108         _nh_stride      K2, K3, K0, K1, 0x20
 109 
 110 .Ldone:
 111         
 112         movdqa          PASS0_SUMS, T0
 113         movdqa          PASS2_SUMS, T1
 114         punpcklqdq      PASS1_SUMS, T0          
 115         punpcklqdq      PASS3_SUMS, T1          
 116         punpckhqdq      PASS1_SUMS, PASS0_SUMS  
 117         punpckhqdq      PASS3_SUMS, PASS2_SUMS  
 118         paddq           PASS0_SUMS, T0
 119         paddq           PASS2_SUMS, T1
 120         movdqu          T0, 0x00(HASH)
 121         movdqu          T1, 0x10(HASH)
 122         ret
 123 ENDPROC(nh_sse2)