]> git.saurik.com Git - apple/libc.git/blob - i386/string/strncpy.s
Libc-997.1.1.tar.gz
[apple/libc.git] / i386 / string / strncpy.s
1 /*
2 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24 #include <machine/cpu_capabilities.h>
25
26
27 // *****************
28 // * S T R N C P Y *
29 // *****************
30 //
31 // char *strncpy(const char *dst, const char *src, size_t n);
32 //
33 // We optimize the move by doing it vector parallel. This introduces
34 // a complication: if we blindly did vector load/stores until finding
35 // a 0, we might get a spurious page fault by touching bytes past it.
36 // To avoid this, we never do a load that crosses a page boundary,
37 // and never store a byte we don't have to.
38 //
39 // We align the destination, because unaligned vector stores are slow.
40 //
41 // Recall that strncpy() zero fills the remainder of the dest buffer,
42 * and does not terminate the string if its length is greater than or
43 // equal to n.
44
45 #define kShort 31 // too short to bother with vector loop
46
47 .text
48 .globl _strncpy
49
50 .align 4
// Register roles throughout the routine:
//   %edi = dest ptr        %esi = source ptr
//   %ecx = buffer bytes remaining
//   %edx = scratch (align count / chunk count / NUL index)
//   %eax = scratch (byte temp / zero-mask)
// i386 ABI note: %edi/%esi are callee-saved, hence the two pushes below;
// after them the stack args dst/src/n sit at 12/16/20(%esp).
51 _strncpy: // char *strncpy(const char *dst, const char *src, size_t n);
52 pushl %edi
53 pushl %esi
54 movl 12(%esp),%edi // get dest ptr
55 movl 16(%esp),%esi // get source ptr
56 movl 20(%esp),%ecx // get length
57 movl %edi,%edx // copy dest ptr
58 negl %edx // -dst: low 4 bits = (16 - dst) mod 16
59 andl $15,%edx // how many bytes to align dest ptr?
60 jnz LCheckShortCopy // align destination first
61
62
63 // In order to avoid spurious page faults, we loop until nearing the source page
64 // end. Then we revert to a byte-by-byte loop for 16 bytes until the page is crossed,
65 // then resume the vector loop.
66 // %esi = source ptr (unaligned)
67 // %edi = dest ptr (aligned)
68 // %ecx = buffer length remaining
69
70 LNextChunk: // NB: can drop down to here
71 movl %esi,%eax // copy source ptr
72 movl $4096,%edx // page size; loads never cross a 4 KB page
73 andl $4095,%eax // get offset into source page
74 subl %eax,%edx // get #bytes remaining in source page
75 cmpl %ecx,%edx // will buffer run out before the page end?
76 cmova %ecx,%edx // get min(length remaining, bytes to page end)
77 shrl $4,%edx // get #chunks till end of page
78 jnz LLoopOverChunks // enter vector loop
79
80 // We can't use the chunk loop yet. Check for short and empty buffers, then use byte loop.
81
82 LCrossPage: // if buffer is large enough, cross source page
83 movl $16,%edx // move 16 bytes to cross page but keep dest aligned
84 LCheckShortCopy: // we propose to copy %edx bytes in byte loop
85 cmpl $(kShort),%ecx // much left?
86 ja LLoopOverBytes // yes, loop over bytes then more chunks
87 movl %ecx,%edx // no, use the byte loop for everything
88 testl %ecx,%ecx // have we filled buffer?
89 jnz LLoopOverBytes // no
90 jmp LDone
91
92
93 // Loop over bytes.
94 // %esi = source ptr
95 // %edi = dest ptr
96 // %ecx = buffer length remaining
97 // %edx = count of bytes to loop over (<= buffer length)
98
99 .align 4,0x90 // align inner loops to optimize I-fetch
100 LLoopOverBytes:
101 movzb (%esi),%eax // get source byte (zero-extended, so testl below works)
102 inc %esi
103 dec %ecx // decrement length
104 movb %al,(%edi) // pack into dest
105 inc %edi
106 testl %eax,%eax // 0?
107 jz LZeroBuffer // yes, we're done copying string
108 dec %edx // more to go?
109 jnz LLoopOverBytes
110
111 testl %ecx,%ecx // at end of buffer?
112 jnz LNextChunk // no, xfer chunks
113 jmp LDone // yes
114
115
116 // Loop over 16-byte chunks.
117 // %esi = source ptr (unaligned)
118 // %edi = dest ptr (aligned)
119 // %ecx = buffer length remaining
120 // %edx = chunk count
121
122 .align 4,0x90 // align inner loops to optimize I-fetch
123 LLoopOverChunks:
124 movdqu (%esi),%xmm1 // get source (unaligned load, never crosses page here)
125 pxor %xmm0,%xmm0 // get some 0s
126 addl $16,%esi
127 pcmpeqb %xmm1,%xmm0 // compare source to 0s (0xFF per matching byte)
128 pmovmskb %xmm0,%eax // get result mask for 0 check
129 testl %eax,%eax // any 0s?
130 jnz LFound0 // yes, exit loop
131 movdqa %xmm1,(%edi) // no 0s so do aligned store into destination
132 addl $16,%edi
133 subl $16,%ecx // decrement length remaining
134 dec %edx // more to go?
135 jnz LLoopOverChunks
136
137 jmp LCrossPage // cross page but keep dest aligned
138
139
140 // Found a zero in the vector. Figure out where it is, and store the bytes
141 // up to it. It is possible that we should check to be sure (%ecx >= 16), and
142 // just do an aligned store of %xmm1 if so. But if we did, we'd be doing byte
143 // stores into the same double quadword in bzero(), which might hit a hazard.
144 // Experimentation needed.
145 // %edi = dest ptr (aligned)
146 // %eax = result mask
147 // %ecx = buffer length remaining
148 // %xmm1 = source vector
149
// Store decomposition: %edx = NUL index (0..15) = 8*bit3 + 4*bit2 + (0..3),
// so we emit an optional 8-byte store, an optional 4-byte store, then up to
// three single bytes — never touching the NUL or anything past it.
150 LFound0:
151 bsf %eax,%edx // find first 0 (%edx = NUL byte index, 0..15)
152 subl %edx,%ecx // decrement remaining buffer length
153 test $8,%dl // 8-byte store required?
154 jz 4f // no
155 movq %xmm1,(%edi) // pack in 8 low bytes
156 psrldq $8,%xmm1 // then shift vector down 8 bytes
157 addl $8,%edi
158 4:
159 test $4,%dl // 4-byte store required?
160 jz 3f // no
161 movd %xmm1,(%edi) // pack in 4 low bytes
162 psrldq $4,%xmm1 // then shift vector down 4 bytes
163 addl $4,%edi
164 3:
165 andl $3,%edx // more to go?
166 jz LZeroBuffer // no
167 movd %xmm1,%eax // move remainders out of vector into %eax
168 1: // loop on up to three bytes
169 movb %al,(%edi) // pack in next byte
170 shrl $8,%eax // shift next byte into position
171 inc %edi
172 dec %edx
173 jnz 1b
174
175 // We've copied the string. Now zero the rest of the buffer, using commpage bzero().
176 // %edi = dest ptr
177 // %ecx = buffer length remaining
178
179 LZeroBuffer:
180 // The stack currently is aligned to 4 mod 16 (it was 0 mod 16 at the time of
181 // the call, and the return address, edi, and esi have been pushed). It needs
182 // to aligned 0 mod 16 when we call bzero, so we subtract 20 from esp (not 4
183 // because we need to have 8 bytes for the arguments to bzero).
184 subl $20,%esp
185 movl %ecx,4(%esp) // remaining buffer size
186 movl %edi, (%esp) // pointer to first unstored byte
187 call _bzero
188 addl $20,%esp
189
190 LDone:
191 movl 12(%esp),%eax // original dest ptr is return value (args still at +8: esi/edi pushed)
192 popl %esi
193 popl %edi
194 ret