i386/string/strcpy.s

   1 /*
   2  * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. Please obtain a copy of the License at
  10  * http://www.opensource.apple.com/apsl/ and read it before using this
  11  * file.
  12  *
  13  * The Original Code and all software distributed under the License are
  14  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  15  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  16  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  18  * Please see the License for the specific language governing rights and
  19  * limitations under the License.
  20  *
  21  * @APPLE_LICENSE_HEADER_END@
  22  */
  23
  24
  25 // ***************
  26 // * S T R C P Y *
  27 // ***************
  28 //
  29 // char  *strcpy(char *dst, const char *src);
  30 //
  31 // We optimize the move by doing it vector parallel.  This introduces
  32 // a complication: if we blindly did vector load/stores until finding
  33 // a 0, we might get a spurious page fault by touching bytes past it.
  34 // To avoid this, we never do a load that crosses a page boundary,
  35 // and never store a byte we don't have to.
  36 //
  37 // We align the destination, because unaligned vector stores are slow.
  38
  39         .text
  40         .globl _strcpy
  41
  42         .align  4
     // strcpy: copy the 0-terminated string at src, including the terminating
     // 0 byte, to dst.
     //
     // In (at entry, i.e. before the pushl below):
     //      4(%esp) = dst
     //      8(%esp) = src
     // Out:
     //      %eax = original dst
     // Modifies: %eax, %ecx, %edx, %xmm0, %xmm1 and the flags.  %edi is pushed
     // on entry and popped again at LDone.
     // Uses SSE2 vector instructions (movdqu, movdqa, pcmpeqb, pmovmskb, psrldq).
     //
     // Flow: byte-copy until dst is 16-byte aligned, then alternate between
     // the vector loop (while whole 16-byte chunks remain in the current
     // source page) and a 16-byte byte-at-a-time copy that steps over the
     // page boundary.  Every path exits through LDone.
  43 _strcpy:                                // char *strcpy(char *dst, const char *src);
  44         pushl   %edi                    // preserve caller's %edi; args are now at 8(%esp) and 12(%esp)
  45         movl    8(%esp),%edi            // get dest ptr
  46         movl    12(%esp),%ecx           // get source ptr
  47         movl    %edi,%edx               // copy dest ptr
  48         negl    %edx                    // -dst: low 4 bits = distance up to next 16-byte boundary
  49         andl    $15,%edx                // %edx = (-dst) & 15 = #bytes needed to align dest ptr
  50         jnz     LLoopOverBytes          // not aligned, so byte-copy %edx bytes first
  51
  52
  53 // In order to avoid spurious page faults, we loop until nearing the source page
  54 // end.  Then we revert to a byte-by-byte loop for 16 bytes until the page is crossed,
  55 // then resume the vector loop.
  56 //      %ecx = source ptr (unaligned)
  57 //      %edi = dest ptr (aligned)
  58
  59 LNextChunk:
  60         movl    %ecx,%eax               // copy source ptr
  61         movl    $4096,%edx              // page size is taken to be 4096 bytes
  62         andl    $4095,%eax              // get offset into source page
  63         subl    %eax,%edx               // get #bytes remaining in source page (1..4096)
  64         shrl    $4,%edx                 // get #whole 16-byte chunks till end of page
  65         jnz     LLoopOverChunks         // at least one whole chunk fits: enter vector loop
  66         movl    $16,%edx                // fewer than 16 bytes left: move 16 bytes to cross page but keep dest aligned
  67         jmp     LLoopOverBytes
  68
  69
  70 // Loop over bytes.  Each byte is loaded only after the previous one was
     // found to be nonzero, so this loop never reads past the terminator.
  71 //      %ecx = source ptr
  72 //      %edi = dest ptr
  73 //      %edx = byte count (nonzero on entry)
  74
  75         .align  4,0x90                  // align inner loops to optimize I-fetch
  76 LLoopOverBytes:
  77         movzb   (%ecx),%eax             // get source byte, zero-extended
  78         inc     %ecx
  79         movb    %al,(%edi)              // pack into dest (the terminating 0 is stored too)
  80         inc     %edi
  81         testl   %eax,%eax               // 0?
  82         jz      LDone                   // yes, we're done
  83         dec     %edx                    // more to go?
  84         jnz     LLoopOverBytes
  85
  86         jmp     LNextChunk              // count exhausted: dest is 16-byte aligned, recompute chunks left in source page
  87
  88
  89 // Loop over 16-byte chunks.  The chunk count was derived from the bytes
     // remaining in the source page, so these unaligned loads stay within it.
  90 //      %ecx = source ptr (unaligned)
  91 //      %edi = dest ptr (aligned)
  92 //      %edx = chunk count (nonzero on entry)
  93
  94         .align  4,0x90                  // align inner loops to optimize I-fetch
  95 LLoopOverChunks:
  96         movdqu  (%ecx),%xmm1            // get source (unaligned 16-byte load)
  97         pxor    %xmm0,%xmm0             // get some 0s (pcmpeqb below overwrites %xmm0 each pass)
  98         addl    $16,%ecx
  99         pcmpeqb %xmm1,%xmm0             // compare source to 0s: byte = 0xFF where source byte is 0
 100         pmovmskb %xmm0,%eax             // get result mask for 0 check: bit i set if source byte i is 0
 101         testl   %eax,%eax               // any 0s?
 102         jnz     LFound0                 // yes, exit loop (nothing from this chunk stored yet)
 103         movdqa  %xmm1,(%edi)            // no 0s so do aligned store into destination
 104         addl    $16,%edi
 105         dec     %edx                    // more to go?
 106         jnz     LLoopOverChunks
 107
 108         movl    $16,%edx                // move 16 bytes
 109         jmp     LLoopOverBytes          // cross page but keep dest aligned
 110
 111
 112 // Found a zero in the vector.  Figure out where it is, and store the bytes
 113 // up to it.
     // The byte count (1..16, terminator included) is stored as either one full
     // 16-byte vector, or an optional 8-byte store, an optional 4-byte store and
     // then 0..3 single bytes, so that no byte past the terminator is written.
 114 //      %edi = dest ptr (aligned)
 115 //      %eax = result mask
 116 //      %xmm1 = source vector
 117
 118 LFound0:
 119         bsf     %eax,%edx               // find first 0: %edx = index (0..15) of lowest set mask bit
 120         inc     %edx                    // we need to store the 0 too: %edx = #bytes to store (1..16)
 121         test    $16,%dl                 // was 0 last byte?
 122         jz      8f                      // no
 123         movdqa  %xmm1,(%edi)            // yes, store entire vector
 124         jmp     LDone
 125 8:
 126         test    $8,%dl                  // 8-byte store required?
 127         jz      4f                      // no
 128         movq    %xmm1,(%edi)            // pack in 8 low bytes
 129         psrldq  $8,%xmm1                // then shift vector down 8 bytes
 130         addl    $8,%edi
 131 4:
 132         test    $4,%dl                  // 4-byte store required?
 133         jz      3f                      // no
 134         movd    %xmm1,(%edi)            // pack in 4 low bytes
 135         psrldq  $4,%xmm1                // then shift vector down 4 bytes
 136         addl    $4,%edi
 137 3:
 138         andl    $3,%edx                 // more to go? (0..3 bytes remain)
 139         jz      LDone                   // no
 140         movd    %xmm1,%eax              // move remainders out of vector into %eax
 141 1:                                      // loop on up to three bytes
 142         movb    %al,(%edi)              // pack in next byte
 143         shrl    $8,%eax                 // shift next byte into position
 144         inc     %edi
 145         dec     %edx
 146         jnz     1b
 147
     // Common exit.  8(%esp) still addresses the dst argument because the
     // saved %edi is on top of the stack.
 148 LDone:
 149         movl    8(%esp),%eax            // original dest ptr is return value
 150         popl    %edi                    // restore caller's %edi
 151         ret