root/arch/sparc/lib/M7memset.S

/* [<][>][^][v][top][bottom][index][help] */
   1 /*
   2  * M7memset.S: SPARC M7 optimized memset.
   3  *
   4  * Copyright (c) 2016, Oracle and/or its affiliates.  All rights reserved.
   5  */
   6 
   7 /*
   8  * M7memset.S: M7 optimized memset.
   9  *
  10  * char *memset(sp, c, n)
  11  *
  12  * Set an array of n chars starting at sp to the character c.
  13  * Return sp.
  14  *
  15  * Fast assembler language version of the following C-program for memset
  16  * which represents the `standard' for the C-library.
  17  *
  18  *      void *
  19  *      memset(void *sp1, int c, size_t n)
  20  *      {
  21  *          if (n != 0) {
  22  *              char *sp = sp1;
  23  *              do {
  24  *                  *sp++ = (char)c;
  25  *              } while (--n != 0);
  26  *          }
  27  *          return (sp1);
  28  *      }
  29  *
  30  * The algorithm is as follows :
  31  *
   32  *      For stores of 7 or fewer bytes, individual bytes are stored.
  33  *
   34  *      For stores of fewer than 32 bytes, align the address on a 4-byte boundary.
   35  *      Then store as many 4-byte chunks as possible, followed by trailing bytes.
  36  *
   37  *      For sizes of 32 bytes or more, align the address on an 8-byte boundary.
  38  *      if (count >= 64) {
  39  *              store 8-bytes chunks to align the address on 64 byte boundary
  40  *              if (value to be set is zero && count >= MIN_ZERO) {
  41  *                      Using BIS stores, set the first long word of each
  42  *                      64-byte cache line to zero which will also clear the
  43  *                      other seven long words of the cache line.
  44  *              }
  45  *              else if (count >= MIN_LOOP) {
  46  *                      Using BIS stores, set the first long word of each of
  47  *                      ST_CHUNK cache lines (64 bytes each) before the main
  48  *                      loop is entered.
  49  *                      In the main loop, continue pre-setting the first long
  50  *                      word of each cache line ST_CHUNK lines in advance while
  51  *                      setting the other seven long words (56 bytes) of each
  52  *                      cache line until fewer than ST_CHUNK*64 bytes remain.
  53  *                      Then set the remaining seven long words of each cache
  54  *                      line that has already had its first long word set.
  55  *              }
  56  *              store remaining data in 64-byte chunks until less than
  57  *              64 bytes remain.
  58  *       }
  59  *       Store as many 8-byte chunks, followed by trailing bytes.
  60  *
  61  * BIS = Block Init Store
  62  *   Doing the advance store of the first element of the cache line
  63  *   initiates the displacement of a cache line while only using a single
  64  *   instruction in the pipeline. That avoids various pipeline delays,
  65  *   such as filling the miss buffer. The performance effect is
  66  *   similar to prefetching for normal stores.
  67  *   The special case for zero fills runs faster and uses fewer instruction
  68  *   cycles than the normal memset loop.
  69  *
   70  * We only use BIS when at least MIN_LOOP bytes of whole cache lines (MIN_ZERO for
   71  * zero fills) are to be set, because a sequence of BIS stores must be followed by a
   72  * membar #StoreStore. The benefit of the BIS store must be balanced against the membar cost.
  73  */
  74 
  75 /*
  76  * ASI_STBI_P marks the cache line as "least recently used"
  77  * which means if many threads are active, it has a high chance
  78  * of being pushed out of the cache between the first initializing
  79  * store and the final stores.
  80  * Thus, we use ASI_STBIMRU_P which marks the cache line as
  81  * "most recently used" for all but the last store to the cache line.
  82  */
  83 
  84 #include <asm/asi.h>
  85 #include <asm/page.h>
  86 
  87 #define ASI_STBI_P      ASI_BLK_INIT_QUAD_LDD_P
  88 #define ASI_STBIMRU_P   ASI_ST_BLKINIT_MRU_P
  89 
  90 
  91 #define ST_CHUNK        24   /* multiple of 4 due to loop unrolling */
  92 #define MIN_LOOP        16320
  93 #define MIN_ZERO        512
  94 
  95         .section        ".text"
  96         .align          32
  97 
   98 /*
   99  * Define clear_page(dest) as memset(dest, 0, PAGE_SIZE): load the byte count
  100  * and fall through into bzero. (Can create a more optimized version later.)
  101  */
  102         .globl          M7clear_page
  103         .globl          M7clear_user_page
  104 M7clear_page:           /* clear_page(dest) */
  105 M7clear_user_page:      /* same address as M7clear_page */
  106         set     PAGE_SIZE, %o1          ! byte count, in the bzero size argument
  107         /* fall through into bzero code */
  108 
  109         .size           M7clear_page,.-M7clear_page     ! spans only the set above
  110         .size           M7clear_user_page,.-M7clear_user_page
 111 
  112 /*
  113  * Define bzero(dest, n) as memset(dest, 0, n): move the arguments into
  114  * memset order and fall through. (Can create a more optimized version later.)
  115  */
  116         .globl          M7bzero
  117 M7bzero:                /* bzero(dest, size) */
  118         mov     %o1, %o2                ! size becomes the memset count n
  119         mov     0, %o1                  ! fill value c = 0
  120         /* fall through into memset code */
  121 
  122         .size           M7bzero,.-M7bzero       ! spans only the two moves above
 123 
  124         .global         M7memset
  125         .type           M7memset, #function
  126         .register       %g3, #scratch   ! g3: saved %asi, or offset in zero loop
  127 M7memset:               ! in: %o0 = sp1, %o1 = c, %o2 = n; %o0 is never written
  128         mov     %o0, %o5                ! copy sp1 before using it
  129         cmp     %o2, 7                  ! if small counts, just write bytes
  130         bleu,pn %xcc, .wrchar
  131          and     %o1, 0xff, %o1          ! o1 is (char)c
  132 
  133         sll     %o1, 8, %o3
  134         or      %o1, %o3, %o1           ! now o1 has 2 bytes of c
  135         sll     %o1, 16, %o3
  136         cmp     %o2, 32                 ! counts below 32 use 4-byte stores
  137         blu,pn  %xcc, .wdalign
  138          or      %o1, %o3, %o1           ! now o1 has 4 bytes of c
  139 
  140         sllx    %o1, 32, %o3
  141         or      %o1, %o3, %o1           ! now o1 has 8 bytes of c
  142 
  143 .dbalign:
  144         andcc   %o5, 7, %o3             ! is sp1 aligned on an 8 byte boundary?
  145         bz,pt   %xcc, .blkalign         ! already long word aligned
  146          sub     %o3, 8, %o3             ! -(bytes till long word aligned)
  147 
  148         add     %o2, %o3, %o2           ! update o2 with new count
  149         ! Set -(%o3) bytes till sp1 long word aligned
  150 1:      stb     %o1, [%o5]              ! there is at least 1 byte to set
  151         inccc   %o3                     ! byte clearing loop
  152         bl,pt   %xcc, 1b
  153          inc     %o5
  154 
  155         ! Now sp1 is long word aligned (sp1 is found in %o5)
  156 .blkalign:
  157         cmp     %o2, 64                 ! check if there are 64 bytes to set
  158         blu,pn  %xcc, .wrshort
  159          mov     %o2, %o3                ! o3 = byte count consumed by .wrshort
  160 
  161         andcc   %o5, 63, %o3            ! is sp1 block aligned?
  162         bz,pt   %xcc, .blkwr            ! now block aligned
  163          sub     %o3, 64, %o3            ! o3 is -(bytes till block aligned)
  164         add     %o2, %o3, %o2           ! o2 is the remainder
  165 
  166         ! Store -(%o3) bytes till dst is block (64 byte) aligned.
  167         ! Use long word stores.
  168         ! Recall that dst is already long word aligned
  169 1:
  170         addcc   %o3, 8, %o3
  171         stx     %o1, [%o5]
  172         bl,pt   %xcc, 1b
  173          add     %o5, 8, %o5
  174 
  175         ! Now sp1 is block aligned
  176 .blkwr:
  177         andn    %o2, 63, %o4            ! calculate size of blocks in bytes
  178         brz,pn  %o1, .wrzero            ! special case if c == 0
  179          and     %o2, 63, %o3            ! %o3 = bytes left after blk stores.
  180 
  181         set     MIN_LOOP, %g1
  182         cmp     %o4, %g1                ! check there are enough bytes to set
  183         blu,pn  %xcc, .short_set        ! to justify cost of membar
  184                                         ! must be > pre-cleared lines
  185          nop
  186 
  187         ! initial cache-clearing stores
  188         ! get store pipeline moving
  189         rd      %asi, %g3               ! save %asi to be restored later
  190         wr     %g0, ASI_STBIMRU_P, %asi
  191 
  192         ! Primary memset loop for large memsets
  193 .wr_loop:
  194         sub     %o5, 8, %o5             ! adjust %o5 for ASI store alignment
  195         mov     ST_CHUNK, %g1           ! g1 = cache lines to pre-set
  196 .wr_loop_start:                         ! first long word of 4 lines per pass
  197         stxa    %o1, [%o5+8]%asi
  198         subcc   %g1, 4, %g1
  199         stxa    %o1, [%o5+8+64]%asi
  200         add     %o5, 256, %o5
  201         stxa    %o1, [%o5+8-128]%asi
  202         bgu     %xcc, .wr_loop_start
  203          stxa    %o1, [%o5+8-64]%asi
  204 
  205         sub     %o5, ST_CHUNK*64, %o5   ! reset %o5
  206         mov     ST_CHUNK, %g1
  207 
  208 .wr_loop_rest:                          ! other 7 long words of one line per pass
  209         stxa    %o1, [%o5+8+8]%asi
  210         sub     %o4, 64, %o4            ! one more line accounted for
  211         stxa    %o1, [%o5+16+8]%asi
  212         subcc   %g1, 1, %g1
  213         stxa    %o1, [%o5+24+8]%asi
  214         stxa    %o1, [%o5+32+8]%asi
  215         stxa    %o1, [%o5+40+8]%asi
  216         add     %o5, 64, %o5
  217         stxa    %o1, [%o5-8]%asi
  218         bgu     %xcc, .wr_loop_rest
  219          stxa    %o1, [%o5]ASI_STBI_P    ! last store to the line uses STBI
  220 
  221         ! If at least ST_CHUNK*64 bytes remain to set, continue
  222         ! setting the first long word of each cache line in advance
  223         ! to keep the store pipeline moving.
  224 
  225         cmp     %o4, ST_CHUNK*64
  226         bge,pt  %xcc, .wr_loop_start
  227          mov     ST_CHUNK, %g1
  228 
  229         brz,a,pn %o4, .asi_done         ! annulled: delay slot runs only if taken
  230          add     %o5, 8, %o5             ! restore %o5 offset
  231 
  232 .wr_loop_small:                         ! all 8 long words of one line per pass
  233         stxa    %o1, [%o5+8]%asi
  234         stxa    %o1, [%o5+8+8]%asi
  235         stxa    %o1, [%o5+16+8]%asi
  236         stxa    %o1, [%o5+24+8]%asi
  237         stxa    %o1, [%o5+32+8]%asi
  238         subcc   %o4, 64, %o4
  239         stxa    %o1, [%o5+40+8]%asi
  240         add     %o5, 64, %o5
  241         stxa    %o1, [%o5-8]%asi
  242         bgu,pt  %xcc, .wr_loop_small
  243          stxa    %o1, [%o5]ASI_STBI_P
  244 
  245         ba      .asi_done
  246          add     %o5, 8, %o5             ! restore %o5 offset
  247 
  248         ! Special case loop for zero fill memsets
  249         ! For each 64 byte cache line, single STBI to first element
  250         ! clears line
  251 .wrzero:
  252         cmp     %o4, MIN_ZERO           ! check if enough bytes to set
  253                                         ! to pay %asi + membar cost
  254         blu     %xcc, .short_set
  255          nop
  256         sub     %o4, 256, %o4           ! bias count for the loop test below
  257 
  258 .wrzero_loop:                           ! 4 lines (256 bytes) per pass
  259         mov     64, %g3
  260         stxa    %o1, [%o5]ASI_STBI_P
  261         subcc   %o4, 256, %o4
  262         stxa    %o1, [%o5+%g3]ASI_STBI_P
  263         add     %o5, 256, %o5
  264         sub     %g3, 192, %g3           ! g3 = -128
  265         stxa    %o1, [%o5+%g3]ASI_STBI_P
  266         add %g3, 64, %g3                ! g3 = -64
  267         bge,pt  %xcc, .wrzero_loop
  268          stxa    %o1, [%o5+%g3]ASI_STBI_P
  269         add     %o4, 256, %o4           ! undo bias: o4 = block bytes left
  270 
  271         brz,pn  %o4, .bsi_done
  272          nop
  273 
  274 .wrzero_small:                          ! one line per pass
  275         stxa    %o1, [%o5]ASI_STBI_P
  276         subcc   %o4, 64, %o4
  277         bgu,pt  %xcc, .wrzero_small
  278          add     %o5, 64, %o5
  279         ba,a    .bsi_done               ! skip %asi restore (zero path never writes it)
  280 
  281 .asi_done:
  282         wr      %g3, 0x0, %asi          ! restore saved %asi
  283 .bsi_done:
  284         membar  #StoreStore             ! required by use of Block Store Init
  285 
  286 .short_set:                             ! o4 = block bytes still to set
  287         cmp     %o4, 64                 ! check if 64 bytes to set
  288         blu     %xcc, 5f
  289          nop
  290 4:                                      ! set final blocks of 64 bytes
  291         stx     %o1, [%o5]
  292         stx     %o1, [%o5+8]
  293         stx     %o1, [%o5+16]
  294         stx     %o1, [%o5+24]
  295         subcc   %o4, 64, %o4
  296         stx     %o1, [%o5+32]
  297         stx     %o1, [%o5+40]
  298         add     %o5, 64, %o5
  299         stx     %o1, [%o5-16]
  300         bgu,pt  %xcc, 4b
  301          stx     %o1, [%o5-8]
  302 
  303 5:
  304         ! Set the remaining long words
  305 .wrshort:
  306         subcc   %o3, 8, %o3             ! Can we store any long words?
  307         blu,pn  %xcc, .wrchars
  308          and     %o2, 7, %o2             ! calc bytes left after long words
  309 6:
  310         subcc   %o3, 8, %o3
  311         stx     %o1, [%o5]              ! store the long words
  312         bgeu,pt %xcc, 6b
  313          add     %o5, 8, %o5
  314 
  315 .wrchars:                               ! check for extra chars
  316         brnz    %o2, .wrfin
  317          nop
  318         retl                            ! nothing left; %o0 still holds sp1
  319          nop
  320 
  321 .wdalign:                               ! entered with 8 <= n < 32
  322         andcc   %o5, 3, %o3             ! is sp1 aligned on a word boundary
  323         bz,pn   %xcc, .wrword
  324          andn    %o2, 3, %o3             ! create word sized count in %o3
  325 
  326         dec     %o2                     ! decrement count
  327         stb     %o1, [%o5]              ! clear a byte
  328         b       .wdalign                ! re-check alignment
  329          inc     %o5                     ! next byte
  330 
  331 .wrword:
  332         subcc   %o3, 4, %o3
  333         st      %o1, [%o5]              ! 4-byte writing loop
  334         bnz,pt  %xcc, .wrword
  335          add     %o5, 4, %o5
  336 
  337         and     %o2, 3, %o2             ! leftover count, if any
  338 
  339 .wrchar:
  340         ! Set the remaining bytes, if any
  341         brz     %o2, .exit
  342          nop
  343 .wrfin:
  344         deccc   %o2
  345         stb     %o1, [%o5]
  346         bgu,pt  %xcc, .wrfin
  347          inc     %o5
  348 .exit:
  349         retl                            ! %o0 was preserved
  350          nop
  351 
  352         .size           M7memset,.-M7memset

/* [<][>][^][v][top][bottom][index][help] */