/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>

/*
 * The bcopy/memcpy loops for very long operands, tuned for 64-bit
 * Pentium-M class processors with Supplemental SSE3 and 64-byte cache lines.
 * This is the 64-bit version.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kBigChunk   (256*1024)          // outer loop chunk size for kVeryLong sized operands


// Very long forward moves. These are at least several pages, so we loop over big
// chunks of memory (kBigChunk in size.) We first prefetch the chunk, and then copy
// it using non-temporal stores. Hopefully all the reads occur in the prefetch loop,
// so the copy loop reads from L2 and writes directly to memory (with write combining.)
// This minimizes bus turnaround and maintains good DRAM page locality.
// Note that for this scheme to work, kVeryLong must be a large fraction of L2 cache
// size. Otherwise, it is counter-productive to bypass L2 on the stores.
//
// We are called from the platfunc bcopy loops when they encounter very long
// operands, with the standard ABI:
//      rdi = dest ptr
//      rsi = source ptr
//      rdx = length (>= 8kb, probably much bigger)

// void longcopy(void *dest, const void *src, size_t len)

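// For orientation, a rough C-level sketch of the control flow below (illustrative
// only; min(), touch_chunk(), and nt_copy() are placeholders for the loops that
// follow, not real functions):
//
//      void longcopy(void *dest, const void *src, size_t len) {
//          char *d = dest;  const char *s = src;
//          size_t head = (0 - (uintptr_t)d) & 63;      // bytes to cache-line align dest
//          memcpy(d, s, head);  d += head;  s += head;  len -= head;
//          while (len >= 4096) {
//              size_t chunk = min(len, kBigChunk) & ~(size_t)4095; // whole pages only
//              touch_chunk(s, chunk);                  // prefetch loop (LTouchLoop)
//              nt_copy(d, s, chunk);                   // movntdq copy loops
//              d += chunk;  s += chunk;  len -= chunk;
//          }
//          sfence();                                   // drain the non-temporal stores
//          memcpy(d, s, len);                          // 0..4095 byte tail
//      }
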
        .private_extern _longcopy
_longcopy:
        pushq   %rbp                    // set up a frame for backtraces
        movq    %rsp,%rbp
        movl    %edi,%eax               // copy dest ptr
        negl    %eax
        andl    $63,%eax                // get #bytes to cache line align destination
        jz      LBigChunkLoop           // already aligned

// Cache line align destination, so the non-temporal stores in the copy loops work right.
// The recursive call returns with the source and dest ptrs properly updated.
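// (This relies on the libc-internal _memcpy leaving rdi and rsi advanced past the
// bytes it copied; under the standard ABI those registers are ordinary scratch.)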

        subq    %rax,%rdx               // get length remaining after dest is aligned
        pushq   %rdx                    // save length remaining
        movl    %eax,%edx               // #bytes to copy to align destination
        call    _memcpy
        popq    %rdx                    // recover adjusted length

// Loop over big chunks.
//      rdx = length remaining (>= 4096)
//      rdi = dest (64-byte aligned)
//      rsi = source (may be unaligned)

LBigChunkLoop:
        movl    $(kBigChunk),%r8d       // assume we can do a full chunk
        cmpq    %r8,%rdx                // do we have a full chunk left to do?
        cmovbl  %edx,%r8d               // if not, only move what we have left
        andl    $-4096,%r8d             // we work in page multiples
        xorl    %eax,%eax               // initialize chunk offset
        jmp     LTouchLoop


// Touch in the next chunk. We try to keep the prefetch unit in "kick-start" mode,
// by touching two adjacent cache lines every 8 lines of each page, in four slices.
// Because the source may be unaligned, we use byte loads to touch.
//      rdx = length remaining (including this chunk)
//      rdi = ptr to start of dest chunk
//      rsi = ptr to start of source chunk
//      r8d = chunk length (multiples of pages, less than 2**32)
//      ecx = scratch reg used to read a byte of each cache line
//      eax = chunk offset

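// The access pattern of the loop below, rendered as rough C (illustrative only;
// touch() stands in for the movzb loads):
//
//      for (uint32_t page = 0; page < chunk; page += 4096)             // each 4 KB page
//          for (uint32_t slice = 0; slice < 512; slice += 128)         // four slices per page
//              for (uint32_t group = 0; group < 4096; group += 512) {  // 8 groups of 8 lines
//                  touch(src + page + slice + group);                  // line n of group
//                  touch(src + page + slice + group + 64);             // line n+1 of group
//              }
//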
        .align  4,0x90                  // 16-byte align inner loops
LTouchLoop:
        movzb   (%rsi,%rax),%ecx        // touch line 0, 2, 4, or 6 of page
        movzb   1*64(%rsi,%rax),%ecx    // touch line 1, 3, 5, or 7
        movzb   8*64(%rsi,%rax),%ecx    // touch line 8, 10, 12, or 14
        movzb   9*64(%rsi,%rax),%ecx    // etc

        movzb   16*64(%rsi,%rax),%ecx
        movzb   17*64(%rsi,%rax),%ecx
        movzb   24*64(%rsi,%rax),%ecx
        movzb   25*64(%rsi,%rax),%ecx

        movzb   32*64(%rsi,%rax),%ecx
        movzb   33*64(%rsi,%rax),%ecx
        movzb   40*64(%rsi,%rax),%ecx
        movzb   41*64(%rsi,%rax),%ecx

        movzb   48*64(%rsi,%rax),%ecx
        movzb   49*64(%rsi,%rax),%ecx
        movzb   56*64(%rsi,%rax),%ecx
        movzb   57*64(%rsi,%rax),%ecx

        subl    $-128,%eax              // next slice of page (adding 128 w 8-bit immediate)
        testl   $512,%eax               // done with this page?
        jz      LTouchLoop              // no, next of four slices
        addl    $(4096-512),%eax        // move on to next page
        cmpl    %eax,%r8d               // done with this chunk?
        jnz     LTouchLoop              // no, do next page

// The chunk has been pre-fetched, now copy it using non-temporal stores.
// There are two copy loops, depending on whether the source is 16-byte aligned
// or not.

        movl    %r8d,%ecx               // copy chunk size to a reg that doesn't use REX prefix
        addq    %rcx,%rsi               // increment ptrs by chunk length
        addq    %rcx,%rdi
        subq    %rcx,%rdx               // adjust remaining length
        negq    %rcx                    // prepare loop index (counts up to 0)
        testl   $15,%esi                // is source 16-byte aligned?
        jnz     LVeryLongUnaligned      // no
        jmp     LVeryLongAligned
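// (The pointers have just been bumped past the end of the chunk and rcx counts up
// from -chunk to zero, so one add per iteration serves as both the induction step
// and the loop test. Roughly, in illustrative C, with copy128_nt() standing in for
// the eight 16-byte load / movntdq-store pairs:
//
//      s += chunk;  d += chunk;
//      for (intptr_t off = -(intptr_t)chunk; off != 0; off += 128)
//          copy128_nt(d + off, s + off);
//
// movdqa requires a 16-byte aligned source, hence the movdqu variant when the source
// is not aligned; the movntdq destinations are always aligned here because the dest
// was cache-line aligned on entry and chunks are page multiples.)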

        .align  4,0x90                  // 16-byte align inner loops
LVeryLongAligned:                       // aligned loop over 128-bytes
        movdqa  (%rsi,%rcx),%xmm0
        movdqa  16(%rsi,%rcx),%xmm1
        movdqa  32(%rsi,%rcx),%xmm2
        movdqa  48(%rsi,%rcx),%xmm3
        movdqa  64(%rsi,%rcx),%xmm4
        movdqa  80(%rsi,%rcx),%xmm5
        movdqa  96(%rsi,%rcx),%xmm6
        movdqa  112(%rsi,%rcx),%xmm7

        movntdq %xmm0,(%rdi,%rcx)
        movntdq %xmm1,16(%rdi,%rcx)
        movntdq %xmm2,32(%rdi,%rcx)
        movntdq %xmm3,48(%rdi,%rcx)
        movntdq %xmm4,64(%rdi,%rcx)
        movntdq %xmm5,80(%rdi,%rcx)
        movntdq %xmm6,96(%rdi,%rcx)
        movntdq %xmm7,112(%rdi,%rcx)

        subq    $-128,%rcx              // add 128 with an 8-bit immediate
        jnz     LVeryLongAligned
        jmp     LVeryLongChunkEnd

        .align  4,0x90                  // 16-byte align inner loops
LVeryLongUnaligned:                     // unaligned loop over 128-bytes
        movdqu  (%rsi,%rcx),%xmm0
        movdqu  16(%rsi,%rcx),%xmm1
        movdqu  32(%rsi,%rcx),%xmm2
        movdqu  48(%rsi,%rcx),%xmm3
        movdqu  64(%rsi,%rcx),%xmm4
        movdqu  80(%rsi,%rcx),%xmm5
        movdqu  96(%rsi,%rcx),%xmm6
        movdqu  112(%rsi,%rcx),%xmm7

        movntdq %xmm0,(%rdi,%rcx)
        movntdq %xmm1,16(%rdi,%rcx)
        movntdq %xmm2,32(%rdi,%rcx)
        movntdq %xmm3,48(%rdi,%rcx)
        movntdq %xmm4,64(%rdi,%rcx)
        movntdq %xmm5,80(%rdi,%rcx)
        movntdq %xmm6,96(%rdi,%rcx)
        movntdq %xmm7,112(%rdi,%rcx)

        subq    $-128,%rcx              // add 128 with an 8-bit immediate
        jnz     LVeryLongUnaligned

LVeryLongChunkEnd:
        cmpq    $4096,%rdx              // at least another page to go?
        jae     LBigChunkLoop           // yes

// Done. Call memcpy() again to handle the 0-4095 bytes at the end.
// We still have the args in the right registers:
//      rdi = destination ptr
//      rsi = source ptr
//      rdx = length remaining (0..4095)

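// (The sfence is needed because the movntdq stores above are weakly ordered
// write-combining stores; the fence makes them globally visible before anything
// the caller does after we return.)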
        sfence                          // required by non-temporal stores
        testl   %edx,%edx               // anything left to copy?
        jz      1f
        call    _memcpy
1:
        popq    %rbp                    // restore frame ptr
        ret