x86_64/string/strncpy.s

   1 /*
   2  * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. Please obtain a copy of the License at
  10  * http://www.opensource.apple.com/apsl/ and read it before using this
  11  * file.
  12  *
  13  * The Original Code and all software distributed under the License are
  14  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  15  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  16  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  18  * Please see the License for the specific language governing rights and
  19  * limitations under the License.
  20  *
  21  * @APPLE_LICENSE_HEADER_END@
  22  */
  23
  24 #include <machine/cpu_capabilities.h>
  25
  26
  27 // *****************
  28 // * S T R N C P Y *
  29 // *****************
  30 //
  31 // char  *strncpy(char *dst, const char *src, size_t n);
  32 //
  33 // We optimize the move by doing it vector parallel.  This introduces
  34 // a complication: if we blindly did vector load/stores until finding
  35 // a 0, we might get a spurious page fault by touching bytes past it.
  36 // To avoid this, we never do a load that crosses a page boundary,
  37 // and never store a byte we don't have to.
  38 //
  39 // We align the destination, because unaligned vector stores are slow.
  40 //
  41 // Recall that strncpy() zero fills the remainder of the dest buffer,
  42 // and does not terminate the string if its length is greater than or
  43 // equal to n.
  44
  45 #define kShort  31                      // with <= kShort bytes left, the byte loop finishes the job
  46
  47         .text
  48         .globl _strncpy
  49
  50 // -----------------------------------------------------------------------
  51 // char *strncpy(char *dst, const char *src, size_t n);
  52 //
  53 // Syntax:   AT&T (source operand first, destination last)
  54 // In:       %rdi = dst, %rsi = src, %rdx = n
  55 // Out:      %rax = dst as passed in
  56 // Modifies: %rax, %rcx, %rdx, %rsi, %rdi, %r8, %xmm0, %xmm1, flags, and
  57 //           whatever the commpage bzero() routine modifies
  58 // Stack:    8 bytes, pushed only around the bzero() call
  59 // -----------------------------------------------------------------------
  60
  61         .align  4
  62 _strncpy:                               // char  *strncpy(char *dst, const char *src, size_t n);
  63         movq    %rdi,%r8                // preserve destination pointer so we can return it
  64         movl    %edi,%ecx               // copy low 4 bytes of dest ptr
  65         negl    %ecx
  66         andl    $15,%ecx                // %ecx = #bytes needed to 16-byte align dest ptr
  67         jnz     LCheckShortCopy         // nonzero: align destination with the byte loop first
  68 // Dest ptr is already 16-byte aligned: fall through into LNextChunk.
  69
  70
  71 // In order to avoid spurious page faults, we loop until nearing the source page
  72 // end.  Then we revert to a byte-by-byte loop for 16 bytes until the page is crossed,
  73 // then resume the vector loop.
  74 //      %rsi = source ptr (unaligned)
  75 //      %rdi = dest ptr (aligned)
  76 //      %rdx = buffer length remaining
  77
  78 LNextChunk:                             // NB: can drop down to here
  79         movl    %esi,%eax               // copy the low 4 bytes of the source ptr
  80         movl    $4096,%ecx
  81         andl    $4095,%eax              // get offset into source page
  82         subl    %eax,%ecx               // get #bytes remaining in source page
  83         cmpq    %rdx,%rcx               // will buffer run out before the page end?
  84         cmova   %rdx,%rcx               // get min(length remaining, bytes to page end)
  85         shrl    $4,%ecx                 // get #whole 16-byte chunks in that span
  86         jnz     LLoopOverChunks         // at least one: enter vector loop
  87
  88 // We can't use the chunk loop yet.  Check for short and empty buffers, then use byte loop.
  89
  90 LCrossPage:                             // if buffer is large enough, cross source page
  91         movl    $16,%ecx                // move 16 bytes to cross page but keep dest aligned
  92 LCheckShortCopy:                        // we propose to copy %ecx bytes in byte loop
  93         cmpq    $(kShort),%rdx          // much left?
  94         ja      LLoopOverBytes          // yes, loop over bytes then more chunks
  95         movl    %edx,%ecx               // no (%rdx <= kShort fits in %edx): byte loop does it all
  96         testl   %edx,%edx               // have we filled buffer?
  97         jnz     LLoopOverBytes          // no
  98         jmp     LDone
  99
 100
 101 // Loop over bytes.
 102 //      %rsi = source ptr
 103 //      %rdi = dest ptr
 104 //      %rdx = buffer length remaining
 105 //      %rcx = count of bytes to loop over (<= buffer length)
 106
 107         .align  4,0x90                  // align inner loops to optimize I-fetch
 108 LLoopOverBytes:
 109         movzb   (%rsi),%eax             // get source byte
 110         addq    $1,%rsi
 111         subq    $1,%rdx                 // decrement length
 112         movb    %al,(%rdi)              // pack into dest
 113         addq    $1,%rdi
 114         testl   %eax,%eax               // 0?
 115         jz      LZeroBuffer             // yes, we're done copying string
 116         subq    $1,%rcx                 // more to go?
 117         jnz     LLoopOverBytes
 118
 119         testq   %rdx,%rdx               // at end of buffer?
 120         jnz     LNextChunk              // no, xfer chunks
 121         jmp     LDone                   // yes
 122
 123
 124 // Loop over 16-byte chunks.  The chunk count was computed so that no load
 125 // crosses the source page end and at least 16 bytes of buffer remain at each load.
 126 //      %rsi = source ptr (unaligned)
 127 //      %rdi = dest ptr (aligned)
 128 //      %rdx = buffer length remaining
 129 //      %ecx = chunk count
 130
 131         .align  4,0x90                  // align inner loops to optimize I-fetch
 132 LLoopOverChunks:
 133         movdqu  (%rsi),%xmm1            // get source
 134         pxor    %xmm0,%xmm0             // get some 0s
 135         addq    $16,%rsi
 136         pcmpeqb %xmm1,%xmm0             // compare source to 0s
 137         pmovmskb %xmm0,%eax             // get result mask for 0 check
 138         testl   %eax,%eax               // any 0s?
 139         jnz     LFound0                 // yes, exit loop
 140         movdqa  %xmm1,(%rdi)            // no 0s so do aligned store into destination
 141         addq    $16,%rdi
 142         subq    $16,%rdx                // decrement length remaining
 143         subl    $1,%ecx                 // more to go?
 144         jnz     LLoopOverChunks
 145
 146         jmp     LCrossPage              // cross page but keep dest aligned
 147
 148
 149 // Found a zero in the vector.  Figure out where it is, and store the bytes
 150 // up to it.  It is possible that we should check to be sure (%rdx >= 16), and
 151 // just do an aligned store of %xmm1 if so.  But if we did, we'd be doing byte
 152 // stores into the same double quadword in bzero(), which might hit a hazard.
 153 // Experimentation needed.
 154 // The bytes ahead of the 0 (0..15 of them) are stored as an optional 8-byte
 155 // piece, an optional 4-byte piece, then up to three single bytes.  The 0 itself
 156 // is left for the bzero() call in LZeroBuffer.
 157 //      %rdi = dest ptr (aligned)
 158 //      %eax = result mask
 159 //      %rdx = buffer length remaining
 160 //      %xmm1 = source vector
 161
 162 LFound0:
 163         bsf     %eax,%ecx               // %ecx = index of first 0 = #bytes ahead of it
 164         subq    %rcx,%rdx               // decrement remaining buffer length
 165         test    $8,%cl                  // 8-byte store required?
 166         jz      4f                      // no
 167         movq    %xmm1,(%rdi)            // pack in 8 low bytes
 168         psrldq  $8,%xmm1                // then shift vector down 8 bytes
 169         addq    $8,%rdi
 170 4:
 171         test    $4,%cl                  // 4-byte store required?
 172         jz      3f                      // no
 173         movd    %xmm1,(%rdi)            // pack in 4 low bytes
 174         psrldq  $4,%xmm1                // then shift vector down 4 bytes
 175         addq    $4,%rdi
 176 3:
 177         andl    $3,%ecx                 // more to go?
 178         jz      LZeroBuffer             // no
 179         movd    %xmm1,%eax              // move remainders out of vector into %eax
 180 1:                                      // loop on up to three bytes
 181         movb    %al,(%rdi)              // pack in next byte
 182         shrl    $8,%eax                 // shift next byte into position
 183         addq    $1,%rdi
 184         subl    $1,%ecx
 185         jnz     1b
 186
 187 // We've copied the string.  Now zero the rest of the buffer, using commpage bzero().
 188 // %r8 is not a callee-saved register, so the saved dest ptr is pushed across the
 189 // call instead of trusting bzero() to leave it alone.  With the usual entry
 190 // alignment (%rsp = 8 mod 16), that one push also makes %rsp a multiple of 16
 191 // at the call.
 192 //      %rdi = dest ptr
 193 //      %rdx = buffer length remaining
 194
 195 LZeroBuffer:
 196         pushq   %r8                     // keep original dest ptr safe; aligns stack for the call
 197         movq    %rdx,%rsi               // remaining buffer size (2nd argument)
 198         movq    $(_COMM_PAGE_BZERO),%rax
 199         call    *%rax                   // bzero(%rdi, %rsi)
 200         popq    %r8                     // recover original dest ptr
 201
 202 LDone:
 203         movq    %r8,%rax                // original dest ptr is return value
 204         ret