1 
   2 
   3 
   4 
   5 
   6 
   7 
   8 
   9 
  10 #include <linux/linkage.h>
  11 
/*
 * Per-pass 64-bit accumulators.  Each is zeroed at function entry, updated
 * by _nh_2xstride with vpaddq, and folded into the final hash at .Ldone.
 */
#define         PASS0_SUMS      %ymm0
#define         PASS1_SUMS      %ymm1
#define         PASS2_SUMS      %ymm2
#define         PASS3_SUMS      %ymm3

/*
 * Key registers, loaded from KEY with vmovdqu.  The *_XMM names are the low
 * 128 bits of the same physical registers; they are used on the final
 * single-stride path.
 */
#define         K0              %ymm4
#define         K0_XMM          %xmm4
#define         K1              %ymm5
#define         K1_XMM          %xmm5
#define         K2              %ymm6
#define         K2_XMM          %xmm6
#define         K3              %ymm7
#define         K3_XMM          %xmm7

/*
 * Temporaries.  T3 holds the message data on entry to _nh_2xstride; T0-T7
 * are all overwritten by that macro.  T0-T5 are reused for the final
 * reduction at .Ldone.
 */
#define         T0              %ymm8
#define         T1              %ymm9
#define         T2              %ymm10
#define         T2_XMM          %xmm10
#define         T3              %ymm11
#define         T3_XMM          %xmm11
#define         T4              %ymm12
#define         T5              %ymm13
#define         T6              %ymm14
#define         T7              %ymm15

/*
 * Function arguments.  These are the first four integer argument registers
 * of the System V AMD64 calling convention, in order.
 */
#define         KEY             %rdi
#define         MESSAGE         %rsi
#define         MESSAGE_LEN     %rdx
#define         HASH            %rcx
  38 
  39 .macro _nh_2xstride     k0, k1, k2, k3
  40 
  41         
  42         vpaddd          \k0, T3, T0
  43         vpaddd          \k1, T3, T1
  44         vpaddd          \k2, T3, T2
  45         vpaddd          \k3, T3, T3
  46 
  47         
  48         vpshufd         $0x10, T0, T4
  49         vpshufd         $0x32, T0, T0
  50         vpshufd         $0x10, T1, T5
  51         vpshufd         $0x32, T1, T1
  52         vpshufd         $0x10, T2, T6
  53         vpshufd         $0x32, T2, T2
  54         vpshufd         $0x10, T3, T7
  55         vpshufd         $0x32, T3, T3
  56         vpmuludq        T4, T0, T0
  57         vpmuludq        T5, T1, T1
  58         vpmuludq        T6, T2, T2
  59         vpmuludq        T7, T3, T3
  60         vpaddq          T0, PASS0_SUMS, PASS0_SUMS
  61         vpaddq          T1, PASS1_SUMS, PASS1_SUMS
  62         vpaddq          T2, PASS2_SUMS, PASS2_SUMS
  63         vpaddq          T3, PASS3_SUMS, PASS3_SUMS
  64 .endm
  65 
  66 
  67 
  68 
  69 
  70 
  71 
  72 ENTRY(nh_avx2)
  73 
  74         vmovdqu         0x00(KEY), K0
  75         vmovdqu         0x10(KEY), K1
  76         add             $0x20, KEY
  77         vpxor           PASS0_SUMS, PASS0_SUMS, PASS0_SUMS
  78         vpxor           PASS1_SUMS, PASS1_SUMS, PASS1_SUMS
  79         vpxor           PASS2_SUMS, PASS2_SUMS, PASS2_SUMS
  80         vpxor           PASS3_SUMS, PASS3_SUMS, PASS3_SUMS
  81 
  82         sub             $0x40, MESSAGE_LEN
  83         jl              .Lloop4_done
  84 .Lloop4:
  85         vmovdqu         (MESSAGE), T3
  86         vmovdqu         0x00(KEY), K2
  87         vmovdqu         0x10(KEY), K3
  88         _nh_2xstride    K0, K1, K2, K3
  89 
  90         vmovdqu         0x20(MESSAGE), T3
  91         vmovdqu         0x20(KEY), K0
  92         vmovdqu         0x30(KEY), K1
  93         _nh_2xstride    K2, K3, K0, K1
  94 
  95         add             $0x40, MESSAGE
  96         add             $0x40, KEY
  97         sub             $0x40, MESSAGE_LEN
  98         jge             .Lloop4
  99 
 100 .Lloop4_done:
 101         and             $0x3f, MESSAGE_LEN
 102         jz              .Ldone
 103 
 104         cmp             $0x20, MESSAGE_LEN
 105         jl              .Llast
 106 
 107         
 108         vmovdqu         (MESSAGE), T3
 109         vmovdqu         0x00(KEY), K2
 110         vmovdqu         0x10(KEY), K3
 111         _nh_2xstride    K0, K1, K2, K3
 112         add             $0x20, MESSAGE
 113         add             $0x20, KEY
 114         sub             $0x20, MESSAGE_LEN
 115         jz              .Ldone
 116         vmovdqa         K2, K0
 117         vmovdqa         K3, K1
 118 .Llast:
 119         
 120         
 121         vmovdqu         (MESSAGE), T3_XMM
 122         vmovdqa         K0_XMM, K0_XMM
 123         vmovdqa         K1_XMM, K1_XMM
 124         vmovdqu         0x00(KEY), K2_XMM
 125         vmovdqu         0x10(KEY), K3_XMM
 126         _nh_2xstride    K0, K1, K2, K3
 127 
 128 .Ldone:
 129         
 130 
 131         
 132         
 133         
 134         
 135         
 136         
 137         
 138         
 139         
 140         
 141 
 142         vpunpcklqdq     PASS1_SUMS, PASS0_SUMS, T0      
 143         vpunpckhqdq     PASS1_SUMS, PASS0_SUMS, T1      
 144         vpunpcklqdq     PASS3_SUMS, PASS2_SUMS, T2      
 145         vpunpckhqdq     PASS3_SUMS, PASS2_SUMS, T3      
 146 
 147         vinserti128     $0x1, T2_XMM, T0, T4            
 148         vinserti128     $0x1, T3_XMM, T1, T5            
 149         vperm2i128      $0x31, T2, T0, T0               
 150         vperm2i128      $0x31, T3, T1, T1               
 151 
 152         vpaddq          T5, T4, T4
 153         vpaddq          T1, T0, T0
 154         vpaddq          T4, T0, T0
 155         vmovdqu         T0, (HASH)
 156         ret
 157 ENDPROC(nh_avx2)