root/arch/alpha/lib/ev67-strrchr.S

/* [<][>][^][v][top][bottom][index][help] */
   1 /* SPDX-License-Identifier: GPL-2.0 */
   2 /*
   3  * arch/alpha/lib/ev67-strrchr.S
   4  * 21264 version by Rick Gorton <rick.gorton@alpha-processor.com>
   5  *
   6  * Finds the last occurrence of a character in a 0-terminated
   7  * string.  Optimized for the Alpha architecture:
   8  *
   9  *      - memory accessed as aligned quadwords only
  10  *      - uses cmpbge to compare 8 bytes in parallel
  11  *
  12  * Much of the information about 21264 scheduling/coding comes from:
  13  *      Compiler Writer's Guide for the Alpha 21264
  14  *      abbreviated as 'CWG' in other comments here
  15  *      ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
  16  * Scheduling notation:
  17  *      E       - either cluster
  18  *      U       - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
  19  *      L       - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
  20  */
  21 
  22 #include <asm/export.h>
  23 #include <asm/regdef.h>
  24 
  25         .set noreorder
  26         .set noat
  27 
  28         .align 4
  29         .ent strrchr
  30         .globl strrchr
  31 strrchr:
  32         .frame sp, 0, ra
  33         .prologue 0
  34 
             /*
              * strrchr: a0 = string, a1 = character to find (only the low
              * byte is used).  Returns in v0 the address of the last byte
              * equal to that character, scanning up to and including the
              * terminating null, or 0 when there is no match.
              *
              * Register roles:
              *   a1 - rebuilt below as the search byte replicated into
              *        all eight byte lanes
              *   v0 - aligned address of the quadword being examined
              *   t0 - current quadword of string data
              *   t1 - cmpbge mask of null bytes in t0
              *   t3 - cmpbge mask of bytes in t0 equal to the character
              *   t6 - aligned address of the latest quadword with a match
              *   t8 - match mask belonging to the quadword in t6
              *   t2, t4, t5 - scratch
              *
              * The replication of the search byte is interleaved with the
              * first load and the register setup.
              */
  35         and     a1, 0xff, t2    # E : 00000000000000ch
  36         insbl   a1, 1, t4       # U : 000000000000ch00
  37         insbl   a1, 2, t5       # U : 0000000000ch0000
  38         ldq_u   t0, 0(a0)       # L : load first quadword Latency=3
  39 
  40         mov     zero, t6        # E : t6 is last match aligned addr
  41         or      t2, t4, a1      # E : 000000000000chch
  42         sll     t5, 8, t3       # U : 00000000ch000000
  43         mov     zero, t8        # E : t8 is last match byte compare mask
  44 
  45         andnot  a0, 7, v0       # E : align source addr
  46         or      t5, t3, t3      # E : 00000000chch0000
  47         sll     a1, 32, t2      # U : 0000chch00000000
  48         sll     a1, 48, t4      # U : chch000000000000
  49 
  50         or      t4, a1, a1      # E : chch00000000chch
  51         or      t2, t3, t2      # E : 0000chchchch0000
  52         or      a1, t2, a1      # E : chchchchchchchch
  53         lda     t5, -1          # E : build garbage mask
  54 
             /*
              * The first quadword is loaded whole, so bytes that lie below
              * the start of the string (offsets below a0 & 7) are garbage
              * and are removed from both the null mask (t1) and the match
              * mask (t3).
              */
  55         cmpbge  zero, t0, t1    # E : bits set iff byte == zero
  56         mskqh   t5, a0, t4      # E : Complete garbage mask
  57         xor     t0, a1, t2      # E : make bytes == c zero
  58         cmpbge  zero, t4, t4    # E : bits set iff byte is garbage
  59 
  60         cmpbge  zero, t2, t3    # E : bits set iff byte == c
  61         andnot  t1, t4, t1      # E : clear garbage from null test
  62         andnot  t3, t4, t3      # E : clear garbage from char test
  63         bne     t1, $eos        # U : did we already hit the terminator?
  64 
  65         /* Character search main loop */
             /*
              * At the top of each iteration t3 holds the match mask of the
              * quadword at v0.  When non-zero it is committed to t6/t8
              * before v0 advances and t1/t3 are recomputed for the next
              * quadword.
              */
  66 $loop:
  67         ldq     t0, 8(v0)       # L : load next quadword
  68         cmovne  t3, v0, t6      # E : save previous comparisons match
  69         nop                     #   : Latency=2, extra map slot (keep nop with cmov)
  70         nop
  71 
  72         cmovne  t3, t3, t8      # E : Latency=2, extra map slot
  73         nop                     #   : keep with cmovne
  74         addq    v0, 8, v0       # E :
  75         xor     t0, a1, t2      # E :
  76 
  77         cmpbge  zero, t0, t1    # E : bits set iff byte == zero
  78         cmpbge  zero, t2, t3    # E : bits set iff byte == c
  79         beq     t1, $loop       # U : if we have not seen a null, loop
  80         nop
  81 
  82         /* Mask out character matches after terminator */
  83 $eos:
  84         negq    t1, t4          # E : isolate first null byte match
  85         and     t1, t4, t4      # E :
  86         subq    t4, 1, t5       # E : build a mask of the bytes up to...
  87         or      t4, t5, t4      # E : ... and including the null
  88 
  89         and     t3, t4, t3      # E : mask out char matches after null
  90         cmovne  t3, t3, t8      # E : save it, if match found Latency=2, extra map slot
  91         nop                     #   : Keep with cmovne
  92         nop
  93 
  94         cmovne  t3, v0, t6      # E :
  95         nop                     #   : Keep with cmovne
  96         /* Locate the address of the last matched character */
             /*
              * 0x3f - ctlz(t8) is the index of the highest set bit of t8,
              * which is used as the byte offset of the match within the
              * quadword at t6.  With no match, t8 and t6 are both still
              * zero and t2 is forced to 0x3f, so the result is 0.
              */
  97         ctlz    t8, t2          # U0 : Latency=3 (0x40 for t8=0)
  98         nop
  99 
 100         cmoveq  t8, 0x3f, t2    # E : Compensate for case when no match is seen
 101         nop                     # E : hide the cmov latency (2) behind ctlz latency
 102         lda     t5, 0x3f($31)   # E :
 103         subq    t5, t2, t5      # E : Normalize leading zero count
 104 
 105         addq    t6, t5, v0      # E : and add to quadword address
 106         ret                     # L0 : Latency=3
 107         nop
 108         nop
 109 
 110         .end strrchr
 111         EXPORT_SYMBOL(strrchr)

/* [<][>][^][v][top][bottom][index][help] */