root/arch/powerpc/crypto/aes-spe-core.S

/* [<][>][^][v][top][bottom][index][help] */
   1 /* SPDX-License-Identifier: GPL-2.0-or-later */
   2 /*
   3  * Fast AES implementation for SPE instruction set (PPC)
   4  *
   5  * This code makes use of the SPE SIMD instruction set as defined in
   6  * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
   7  * Implementation is based on optimization guide notes from
   8  * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
   9  *
  10  * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
  11  */
  12 
  13 #include <asm/ppc_asm.h>
  14 #include "aes-spe-regs.h"
  15 
  16 #define EAD(in, bpos)   /* rT0 bits 20..27 (bit 0 = MSB) = byte bpos of in, bpos 0 = least significant byte */ \
  17         rlwimi          rT0,in,28-((bpos+3)%4)*8,20,27; /* i.e. byte * 16 is inserted, matching the 16-byte entry stride used by the loads; other rT0 bits are kept */
  18 
  19 #define DAD(in, bpos)   /* rT1 bits 24..31 (bit 0 = MSB) = byte bpos of in, bpos 0 = least significant byte */ \
  20         rlwimi          rT1,in,24-((bpos+3)%4)*8,24,31; /* byte is inserted unscaled as the low 8 bits; other rT1 bits are kept */
  21 
  22 #define LWH(out, off)   /* evlwwsplat puts the word at off(rT0) into both halves of the 64-bit out */ \
  23         evlwwsplat      out,off(rT0);   /* load word high (low half is replaced by a later LWL) */
  24 
  25 #define LWL(out, off)   /* NOTE(review): pairs with LWH and assumes lwz leaves the upper half of out intact - confirm */ \
  26         lwz             out,off(rT0);   /* load word low                */
  27 
  28 #define LBZ(out, tab, off)      /* tab is the register holding the address, off a constant displacement */ \
  29         lbz             out,off(tab);   /* load byte                    */
  30 
  31 #define LAH(out, in, bpos, off) /* EAD + LWH: table word selected by byte bpos of in */ \
  32         EAD(in, bpos)                   /* calc addr + load word high   */ \
  33         LWH(out, off)
  34 
  35 #define LAL(out, in, bpos, off) /* EAD + LWL: table word selected by byte bpos of in */ \
  36         EAD(in, bpos)                   /* calc addr + load word low    */ \
  37         LWL(out, off)
  38 
  39 #define LAE(out, in, bpos)      /* EAD + byte at offset 8 of the selected rT0 entry */ \
  40         EAD(in, bpos)                   /* calc addr + load enc byte    */ \
  41         LBZ(out, rT0, 8)
  42 
  43 #define LBE(out)        /* uses the address left in rT0 by a preceding EAD */ \
  44         LBZ(out, rT0, 8)                /* load enc byte                */
  45 
  46 #define LAD(out, in, bpos)      /* DAD + byte at the resulting rT1 address */ \
  47         DAD(in, bpos)                   /* calc addr + load dec byte    */ \
  48         LBZ(out, rT1, 0)
  49 
  50 #define LBD(out)        /* uses the address left in rT1 by a preceding DAD */ \
  51         LBZ(out, rT1, 0)                /* load dec byte                */
  52 
  53 /*
  54  * ppc_encrypt_block: The central encryption function for a single 16-byte
  55  * block. It does no stack handling or register saving to support fast calls
  56  * via bl/blr. It expects that the caller has pre-xored the input data with the
  57  * first 4 words of the encryption key into rD0-rD3. Pointer/counter registers
  58  * (rT0, rKP, CTR) must also have been set up before. On return rW0-rW3 hold the
  59  * bytes assembled in the last step and rD0-rD3 the key words from 32..44(rKP);
  60  * the caller must xor them for the final output. All working registers rD0-rD3
  61  * and rW0-rW7 are overwritten; rT0 is modified and rKP grows by 32 per loop pass.
  62  */
  63 _GLOBAL(ppc_encrypt_block)
  64         LAH(rW4, rD1, 2, 4)             /* start 3 of the 16 lookups of  */
  65         LAH(rW6, rD0, 3, 0)             /* the first round; later rounds */
  66         LAH(rW3, rD0, 1, 8)             /* get them from the round before */
  67 ppc_encrypt_block_loop:                 /* one pass = two unrolled rounds */
  68         LAH(rW0, rD3, 0, 12)            /* first round of the pass: lookups */
  69         LAL(rW0, rD0, 0, 12)
  70         LAH(rW1, rD1, 0, 12)
  71         LAH(rW2, rD2, 1, 8)
  72         LAL(rW2, rD3, 1, 8)
  73         LAL(rW3, rD1, 1, 8)
  74         LAL(rW4, rD2, 2, 4)
  75         LAL(rW6, rD1, 3, 0)
  76         LAH(rW5, rD3, 2, 4)
  77         LAL(rW5, rD0, 2, 4)
  78         LAH(rW7, rD2, 3, 0)
  79         evldw           rD1,16(rKP)     /* 8 key bytes from rKP+16      */
  80         EAD(rD3, 3)                     /* remaining lookups are interleaved with the xors */
  81         evxor           rW2,rW2,rW4
  82         LWL(rW7, 0)
  83         evxor           rW2,rW2,rW6
  84         EAD(rD2, 0)
  85         evxor           rD1,rD1,rW2     /* key ^ combined lookups       */
  86         LWL(rW1, 12)
  87         evxor           rD1,rD1,rW0
  88         evldw           rD3,24(rKP)     /* 8 key bytes from rKP+24      */
  89         evmergehi       rD0,rD0,rD1     /* low(rD0) = high(rD1)         */
  90         EAD(rD1, 2)                     /* begin lookups of the next round (same three as at entry) */
  91         evxor           rW3,rW3,rW5
  92         LWH(rW4, 4)
  93         evxor           rW3,rW3,rW7
  94         EAD(rD0, 3)
  95         evxor           rD3,rD3,rW3     /* key ^ combined lookups       */
  96         LWH(rW6, 0)
  97         evxor           rD3,rD3,rW1
  98         EAD(rD0, 1)
  99         evmergehi       rD2,rD2,rD3     /* low(rD2) = high(rD3)         */
 100         LWH(rW3, 8)
 101         LAH(rW0, rD3, 0, 12)            /* second round of the pass: same schedule, next 16 key bytes */
 102         LAL(rW0, rD0, 0, 12)
 103         LAH(rW1, rD1, 0, 12)
 104         LAH(rW2, rD2, 1, 8)
 105         LAL(rW2, rD3, 1, 8)
 106         LAL(rW3, rD1, 1, 8)
 107         LAL(rW4, rD2, 2, 4)
 108         LAL(rW6, rD1, 3, 0)
 109         LAH(rW5, rD3, 2, 4)
 110         LAL(rW5, rD0, 2, 4)
 111         LAH(rW7, rD2, 3, 0)
 112         evldw           rD1,32(rKP)     /* 8 key bytes from rKP+32      */
 113         EAD(rD3, 3)
 114         evxor           rW2,rW2,rW4
 115         LWL(rW7, 0)
 116         evxor           rW2,rW2,rW6
 117         EAD(rD2, 0)
 118         evxor           rD1,rD1,rW2
 119         LWL(rW1, 12)
 120         evxor           rD1,rD1,rW0
 121         evldw           rD3,40(rKP)     /* 8 key bytes from rKP+40      */
 122         evmergehi       rD0,rD0,rD1     /* low(rD0) = high(rD1)         */
 123         EAD(rD1, 2)                     /* begin lookups of the next round */
 124         evxor           rW3,rW3,rW5
 125         LWH(rW4, 4)
 126         evxor           rW3,rW3,rW7
 127         EAD(rD0, 3)
 128         evxor           rD3,rD3,rW3
 129         LWH(rW6, 0)
 130         evxor           rD3,rD3,rW1
 131         EAD(rD0, 1)
 132         evmergehi       rD2,rD2,rD3     /* low(rD2) = high(rD3)         */
 133         LWH(rW3, 8)
 134         addi            rKP,rKP,32      /* 32 key bytes consumed per pass */
 135         bdnz            ppc_encrypt_block_loop  /* decrement CTR, loop while it is non-zero */
 136         LAH(rW0, rD3, 0, 12)            /* one more table round after the loop */
 137         LAL(rW0, rD0, 0, 12)
 138         LAH(rW1, rD1, 0, 12)
 139         LAH(rW2, rD2, 1, 8)
 140         LAL(rW2, rD3, 1, 8)
 141         LAL(rW3, rD1, 1, 8)
 142         LAL(rW4, rD2, 2, 4)
 143         LAH(rW5, rD3, 2, 4)
 144         LAL(rW6, rD1, 3, 0)
 145         LAL(rW5, rD0, 2, 4)
 146         LAH(rW7, rD2, 3, 0)
 147         evldw           rD1,16(rKP)     /* 8 key bytes from rKP+16      */
 148         EAD(rD3, 3)
 149         evxor           rW2,rW2,rW4
 150         LWL(rW7, 0)
 151         evxor           rW2,rW2,rW6
 152         EAD(rD2, 0)
 153         evxor           rD1,rD1,rW2
 154         LWL(rW1, 12)
 155         evxor           rD1,rD1,rW0
 156         evldw           rD3,24(rKP)     /* 8 key bytes from rKP+24      */
 157         evmergehi       rD0,rD0,rD1     /* low(rD0) = high(rD1)         */
 158         EAD(rD1, 0)                     /* last step: single-byte lookups at offset 8 of the table entries */
 159         evxor           rW3,rW3,rW5
 160         LBE(rW2)
 161         evxor           rW3,rW3,rW7
 162         EAD(rD0, 1)
 163         evxor           rD3,rD3,rW3
 164         LBE(rW6)
 165         evxor           rD3,rD3,rW1
 166         EAD(rD0, 0)
 167         evmergehi       rD2,rD2,rD3     /* low(rD2) = high(rD3)         */
 168         LBE(rW1)
 169         LAE(rW0, rD3, 0)
 170         LAE(rW1, rD0, 0)                /* NOTE(review): same byte as LBE(rW1) two lines up - looks redundant */
 171         LAE(rW4, rD2, 1)
 172         LAE(rW5, rD3, 1)
 173         LAE(rW3, rD2, 0)
 174         LAE(rW7, rD1, 1)
 175         rlwimi          rW0,rW4,8,16,23 /* insert looked-up bytes at bits 16..23 (bit 0 = MSB) */
 176         rlwimi          rW1,rW5,8,16,23
 177         LAE(rW4, rD1, 2)
 178         LAE(rW5, rD2, 2)
 179         rlwimi          rW2,rW6,8,16,23
 180         rlwimi          rW3,rW7,8,16,23
 181         LAE(rW6, rD3, 2)
 182         LAE(rW7, rD0, 2)
 183         rlwimi          rW0,rW4,16,8,15 /* insert looked-up bytes at bits 8..15 */
 184         rlwimi          rW1,rW5,16,8,15
 185         LAE(rW4, rD0, 3)
 186         LAE(rW5, rD1, 3)
 187         rlwimi          rW2,rW6,16,8,15
 188         lwz             rD0,32(rKP)     /* rD0-rD3 = key words for the final xor done by the caller */
 189         rlwimi          rW3,rW7,16,8,15
 190         lwz             rD1,36(rKP)
 191         LAE(rW6, rD2, 3)
 192         LAE(rW7, rD3, 3)
 193         rlwimi          rW0,rW4,24,0,7  /* insert looked-up bytes at bits 0..7 */
 194         lwz             rD2,40(rKP)
 195         rlwimi          rW1,rW5,24,0,7
 196         lwz             rD3,44(rKP)
 197         rlwimi          rW2,rW6,24,0,7
 198         rlwimi          rW3,rW7,24,0,7
 199         blr                             /* xor of rW0-rW3 with rD0-rD3 is left to the caller */
 200 
 201 /*
 202  * ppc_decrypt_block: The central decryption function for a single 16-byte
 203  * block. It does no stack handling or register saving to support fast calls
 204  * via bl/blr. It expects that the caller has pre-xored the input data with the
 205  * first 4 key words into rD0-rD3 (NOTE(review): this text used to say
 206  * "encryption key" - confirm which key schedule callers pass). rT0, rT1, rKP and
 207  * CTR must be set up before; rT1 is the base of the byte lookups in the last
 208  * step. On return rW0-rW3 hold the assembled bytes and rD0-rD3 the key words
 209  * from 32..44(rKP); the caller must xor them. rD0-rD3 and rW0-rW7 are overwritten.
 210  */
 211 _GLOBAL(ppc_decrypt_block)
 212         LAH(rW0, rD1, 0, 12)            /* start 3 of the 16 lookups of  */
 213         LAH(rW6, rD0, 3, 0)             /* the first round; later rounds */
 214         LAH(rW3, rD0, 1, 8)             /* get them from the round before */
 215 ppc_decrypt_block_loop:                 /* one pass = two unrolled rounds */
 216         LAH(rW1, rD3, 0, 12)            /* first round of the pass: lookups */
 217         LAL(rW0, rD2, 0, 12)
 218         LAH(rW2, rD2, 1, 8)
 219         LAL(rW2, rD3, 1, 8)
 220         LAH(rW4, rD3, 2, 4)
 221         LAL(rW4, rD0, 2, 4)
 222         LAL(rW6, rD1, 3, 0)
 223         LAH(rW5, rD1, 2, 4)
 224         LAH(rW7, rD2, 3, 0)
 225         LAL(rW7, rD3, 3, 0)
 226         LAL(rW3, rD1, 1, 8)
 227         evldw           rD1,16(rKP)     /* 8 key bytes from rKP+16      */
 228         EAD(rD0, 0)                     /* remaining lookups are interleaved with the xors */
 229         evxor           rW4,rW4,rW6
 230         LWL(rW1, 12)
 231         evxor           rW0,rW0,rW4
 232         EAD(rD2, 2)
 233         evxor           rW0,rW0,rW2
 234         LWL(rW5, 4)
 235         evxor           rD1,rD1,rW0     /* key ^ combined lookups       */
 236         evldw           rD3,24(rKP)     /* 8 key bytes from rKP+24      */
 237         evmergehi       rD0,rD0,rD1     /* low(rD0) = high(rD1)         */
 238         EAD(rD1, 0)                     /* begin lookups of the next round (same three as at entry) */
 239         evxor           rW3,rW3,rW7
 240         LWH(rW0, 12)
 241         evxor           rW3,rW3,rW1
 242         EAD(rD0, 3)
 243         evxor           rD3,rD3,rW3     /* key ^ combined lookups       */
 244         LWH(rW6, 0)
 245         evxor           rD3,rD3,rW5
 246         EAD(rD0, 1)
 247         evmergehi       rD2,rD2,rD3     /* low(rD2) = high(rD3)         */
 248         LWH(rW3, 8)
 249         LAH(rW1, rD3, 0, 12)            /* second round of the pass: same schedule, next 16 key bytes */
 250         LAL(rW0, rD2, 0, 12)
 251         LAH(rW2, rD2, 1, 8)
 252         LAL(rW2, rD3, 1, 8)
 253         LAH(rW4, rD3, 2, 4)
 254         LAL(rW4, rD0, 2, 4)
 255         LAL(rW6, rD1, 3, 0)
 256         LAH(rW5, rD1, 2, 4)
 257         LAH(rW7, rD2, 3, 0)
 258         LAL(rW7, rD3, 3, 0)
 259         LAL(rW3, rD1, 1, 8)
 260         evldw            rD1,32(rKP)    /* 8 key bytes from rKP+32      */
 261         EAD(rD0, 0)
 262         evxor           rW4,rW4,rW6
 263         LWL(rW1, 12)
 264         evxor           rW0,rW0,rW4
 265         EAD(rD2, 2)
 266         evxor           rW0,rW0,rW2
 267         LWL(rW5, 4)
 268         evxor           rD1,rD1,rW0
 269         evldw           rD3,40(rKP)     /* 8 key bytes from rKP+40      */
 270         evmergehi       rD0,rD0,rD1     /* low(rD0) = high(rD1)         */
 271         EAD(rD1, 0)                     /* begin lookups of the next round */
 272         evxor           rW3,rW3,rW7
 273         LWH(rW0, 12)
 274         evxor           rW3,rW3,rW1
 275         EAD(rD0, 3)
 276         evxor           rD3,rD3,rW3
 277         LWH(rW6, 0)
 278         evxor           rD3,rD3,rW5
 279         EAD(rD0, 1)
 280         evmergehi       rD2,rD2,rD3     /* low(rD2) = high(rD3)         */
 281         LWH(rW3, 8)
 282         addi            rKP,rKP,32      /* 32 key bytes consumed per pass */
 283         bdnz            ppc_decrypt_block_loop  /* decrement CTR, loop while it is non-zero */
 284         LAH(rW1, rD3, 0, 12)            /* one more table round after the loop */
 285         LAL(rW0, rD2, 0, 12)
 286         LAH(rW2, rD2, 1, 8)
 287         LAL(rW2, rD3, 1, 8)
 288         LAH(rW4, rD3, 2, 4)
 289         LAL(rW4, rD0, 2, 4)
 290         LAL(rW6, rD1, 3, 0)
 291         LAH(rW5, rD1, 2, 4)
 292         LAH(rW7, rD2, 3, 0)
 293         LAL(rW7, rD3, 3, 0)
 294         LAL(rW3, rD1, 1, 8)
 295         evldw            rD1,16(rKP)    /* 8 key bytes from rKP+16      */
 296         EAD(rD0, 0)
 297         evxor           rW4,rW4,rW6
 298         LWL(rW1, 12)
 299         evxor           rW0,rW0,rW4
 300         EAD(rD2, 2)
 301         evxor           rW0,rW0,rW2
 302         LWL(rW5, 4)
 303         evxor           rD1,rD1,rW0
 304         evldw           rD3,24(rKP)     /* 8 key bytes from rKP+24      */
 305         evmergehi       rD0,rD0,rD1     /* low(rD0) = high(rD1)         */
 306         DAD(rD1, 0)                     /* last step: single-byte lookups addressed through rT1 */
 307         evxor           rW3,rW3,rW7
 308         LBD(rW0)
 309         evxor           rW3,rW3,rW1
 310         DAD(rD0, 1)
 311         evxor           rD3,rD3,rW3
 312         LBD(rW6)
 313         evxor           rD3,rD3,rW5
 314         DAD(rD0, 0)
 315         evmergehi       rD2,rD2,rD3     /* low(rD2) = high(rD3)         */
 316         LBD(rW3)
 317         LAD(rW2, rD3, 0)
 318         LAD(rW1, rD2, 0)
 319         LAD(rW4, rD2, 1)
 320         LAD(rW5, rD3, 1)
 321         LAD(rW7, rD1, 1)
 322         rlwimi          rW0,rW4,8,16,23 /* insert looked-up bytes at bits 16..23 (bit 0 = MSB) */
 323         rlwimi          rW1,rW5,8,16,23
 324         LAD(rW4, rD3, 2)
 325         LAD(rW5, rD0, 2)
 326         rlwimi          rW2,rW6,8,16,23
 327         rlwimi          rW3,rW7,8,16,23
 328         LAD(rW6, rD1, 2)
 329         LAD(rW7, rD2, 2)
 330         rlwimi          rW0,rW4,16,8,15 /* insert looked-up bytes at bits 8..15 */
 331         rlwimi          rW1,rW5,16,8,15
 332         LAD(rW4, rD0, 3)
 333         LAD(rW5, rD1, 3)
 334         rlwimi          rW2,rW6,16,8,15
 335         lwz             rD0,32(rKP)     /* rD0-rD3 = key words for the final xor done by the caller */
 336         rlwimi          rW3,rW7,16,8,15
 337         lwz             rD1,36(rKP)
 338         LAD(rW6, rD2, 3)
 339         LAD(rW7, rD3, 3)
 340         rlwimi          rW0,rW4,24,0,7  /* insert looked-up bytes at bits 0..7 */
 341         lwz             rD2,40(rKP)
 342         rlwimi          rW1,rW5,24,0,7
 343         lwz             rD3,44(rKP)
 344         rlwimi          rW2,rW6,24,0,7
 345         rlwimi          rW3,rW7,24,0,7
 346         blr                             /* xor of rW0-rW3 with rD0-rD3 is left to the caller */

/* [<][>][^][v][top][bottom][index][help] */