/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>
/*
 * The bcopy/memcpy loops for very long operands, tuned for Pentium-M
 * class processors with Supplemental SSE3 and 64-byte cache lines.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kBigChunk   (256*1024)          // outer loop chunk size for kVeryLong sized operands
// Very long forward moves.  These are at least several pages, so we loop over big
// chunks of memory (kBigChunk in size).  We first prefetch the chunk, and then copy
// it using non-temporal stores.  Hopefully all the reads occur in the prefetch loop,
// so the copy loop reads from L2 and writes directly to memory (with write combining).
// This minimizes bus turnaround and maintains good DRAM page locality.
// Note that for this scheme to work, kVeryLong must be a large fraction of L2 cache
// size.  Otherwise, it is counter-productive to bypass L2 on the stores.
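//
// As an illustration only (an assumption for exposition; this comment is not
// assembled, and touch_each_cache_line()/copy_with_movntdq() are invented
// names), the overall strategy corresponds to C pseudocode along these lines:
//
//      while (len >= 4096) {
//          size_t chunk = (len < kBigChunk ? len : kBigChunk) & ~(size_t)4095;
//          touch_each_cache_line(src, chunk);      // the prefetch loop below
//          copy_with_movntdq(dst, src, chunk);     // the non-temporal store loops
//          dst += chunk;  src += chunk;  len -= chunk;
//      }
//      memcpy(dst, src, len);                      // handle the 0..4095 byte tail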
// We are called from the commpage bcopy loops when they encounter very long
// operands, with the standard ABI:
//
//      void longcopy(void *dest, const void *sou, size_t len)
COMMPAGE_FUNCTION_START(longcopy_sse3x, 32, 5)
        pushl   %ebp                    // set up a frame for backtraces
        movl    %esp,%ebp
        pushl   %esi                    // save caller's registers
        pushl   %edi
        pushl   %ebx                    // we'll need to use this too
        movl    8(%ebp),%edi            // get dest ptr
        movl    12(%ebp),%esi           // get source ptr
        movl    16(%ebp),%ecx           // get length
        movl    %edi,%ebx               // copy dest ptr
        negl    %ebx
        andl    $63,%ebx                // get #bytes to cache line align destination
        jz      LBigChunkLoop           // already aligned
// Cache line align destination, so non-temporal stores in copy loops work right.

        pushl   %ebx                    // arg3 - #bytes to align destination (1..63)
        pushl   %esi                    // arg2 - source
        pushl   %edi                    // arg1 - dest
        movl    $(_COMM_PAGE_MEMCPY),%eax
        call    *%eax                   // align the destination
        addl    $12,%esp                // pop off arguments
        movl    8(%ebp),%edi            // recover dest ptr
        movl    12(%ebp),%esi           // recover source ptr
        movl    16(%ebp),%ecx           // recover length
        addl    %ebx,%esi               // adjust ptrs and lengths past copy
        addl    %ebx,%edi
        subl    %ebx,%ecx
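// For illustration (an assumption: a C rendering of the alignment step above,
// not assembled code), the prefix length is computed as:
//
//      size_t head = (0 - (uintptr_t)dst) & 63;    // negl/andl: 0..63 bytes
//      memcpy(dst, src, head);                     // via _COMM_PAGE_MEMCPY
//      dst += head;  src += head;  len -= head;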
// Loop over big chunks.
//      ecx = length remaining (>= 4096)
//      edi = dest (64-byte aligned)
//      esi = source (may be unaligned)

LBigChunkLoop:
        movl    $(kBigChunk),%edx       // assume we can do a full chunk
        cmpl    %edx,%ecx               // do we have a full chunk left to do?
        cmovbl  %ecx,%edx               // if not, only move what we have left
        andl    $-4096,%edx             // we work in page multiples
        xorl    %eax,%eax               // initialize chunk offset
// Touch in the next chunk.  We try to keep the prefetch unit in "kick-start" mode,
// by touching two adjacent cache lines every 8 lines of each page, in four slices.
// Because the source may be unaligned, we use byte loads to touch.
//      ecx = length remaining (including this chunk)
//      edi = ptr to start of dest chunk
//      esi = ptr to start of source chunk
//      edx = chunk length (multiples of pages)
//      ebx = scratch reg used to read a byte of each cache line
//      eax = chunk offset
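//
// For illustration (an assumption: C pseudocode for the touch pattern, where
// touch() stands for the byte loads below):
//
//      for (off = 0; off != chunk; off += 4096 - 512)      // page at a time
//          for (slice = 0; slice < 4; slice++, off += 128) // four 128-byte slices
//              for (group = 0; group < 8; group++) {       // 8 groups of 8 lines
//                  touch(src[off + group*512]);            // one line...
//                  touch(src[off + group*512 + 64]);       // ...and its neighbor
//              }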
        .align  4,0x90                  // 16-byte align inner loops
LTouchLoop:
        movzb   (%esi,%eax),%ebx        // touch line 0, 2, 4, or 6 of page
        movzb   1*64(%esi,%eax),%ebx    // touch line 1, 3, 5, or 7
        movzb   8*64(%esi,%eax),%ebx    // touch line 8, 10, 12, or 14
        movzb   9*64(%esi,%eax),%ebx    // etc

        movzb   16*64(%esi,%eax),%ebx
        movzb   17*64(%esi,%eax),%ebx
        movzb   24*64(%esi,%eax),%ebx
        movzb   25*64(%esi,%eax),%ebx

        movzb   32*64(%esi,%eax),%ebx
        movzb   33*64(%esi,%eax),%ebx
        movzb   40*64(%esi,%eax),%ebx
        movzb   41*64(%esi,%eax),%ebx

        movzb   48*64(%esi,%eax),%ebx
        movzb   49*64(%esi,%eax),%ebx
        movzb   56*64(%esi,%eax),%ebx
        movzb   57*64(%esi,%eax),%ebx

        subl    $-128,%eax              // next slice of page (adding 128 with an 8-bit immediate)
        testl   $512,%eax               // done with this page?
        jz      LTouchLoop              // no, next of four slices
        addl    $(4096-512),%eax        // move on to next page
        cmpl    %eax,%edx               // done with this chunk?
        jnz     LTouchLoop              // no, do next page
// The chunk has been pre-fetched, now copy it using non-temporal stores.
// There are two copy loops, depending on whether the source is 16-byte aligned
// or not.
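//
// For illustration (an assumption: an intrinsics rendering of the loops below,
// not assembled code), each chunk copy amounts to:
//
//      for (long i = -(long)chunk; i != 0; i += 16) {      // src/dst point past chunk
//          __m128i v = aligned ? _mm_load_si128((const __m128i *)(src + i))    // movdqa
//                              : _mm_loadu_si128((const __m128i *)(src + i));  // movdqu
//          _mm_stream_si128((__m128i *)(dst + i), v);                          // movntdq
//      }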
        addl    %edx,%esi               // increment ptrs by chunk length
        addl    %edx,%edi
        subl    %edx,%ecx               // adjust remaining length
        negl    %edx                    // prepare loop index (counts up to 0)
        testl   $15,%esi                // is source 16-byte aligned?
        jnz     LVeryLongUnaligned      // source is not aligned
        .align  4,0x90                  // 16-byte align inner loops
LVeryLongAligned:                       // aligned loop over 128-bytes
        movdqa  (%esi,%edx),%xmm0
        movdqa  16(%esi,%edx),%xmm1
        movdqa  32(%esi,%edx),%xmm2
        movdqa  48(%esi,%edx),%xmm3
        movdqa  64(%esi,%edx),%xmm4
        movdqa  80(%esi,%edx),%xmm5
        movdqa  96(%esi,%edx),%xmm6
        movdqa  112(%esi,%edx),%xmm7

        movntdq %xmm0,(%edi,%edx)
        movntdq %xmm1,16(%edi,%edx)
        movntdq %xmm2,32(%edi,%edx)
        movntdq %xmm3,48(%edi,%edx)
        movntdq %xmm4,64(%edi,%edx)
        movntdq %xmm5,80(%edi,%edx)
        movntdq %xmm6,96(%edi,%edx)
        movntdq %xmm7,112(%edi,%edx)

        subl    $-128,%edx              // add 128 with an 8-bit immediate
        jnz     LVeryLongAligned
        jmp     LVeryLongChunkEnd
        .align  4,0x90                  // 16-byte align inner loops
LVeryLongUnaligned:                     // unaligned loop over 128-bytes
        movdqu  (%esi,%edx),%xmm0
        movdqu  16(%esi,%edx),%xmm1
        movdqu  32(%esi,%edx),%xmm2
        movdqu  48(%esi,%edx),%xmm3
        movdqu  64(%esi,%edx),%xmm4
        movdqu  80(%esi,%edx),%xmm5
        movdqu  96(%esi,%edx),%xmm6
        movdqu  112(%esi,%edx),%xmm7

        movntdq %xmm0,(%edi,%edx)
        movntdq %xmm1,16(%edi,%edx)
        movntdq %xmm2,32(%edi,%edx)
        movntdq %xmm3,48(%edi,%edx)
        movntdq %xmm4,64(%edi,%edx)
        movntdq %xmm5,80(%edi,%edx)
        movntdq %xmm6,96(%edi,%edx)
        movntdq %xmm7,112(%edi,%edx)

        subl    $-128,%edx              // add 128 with an 8-bit immediate
        jnz     LVeryLongUnaligned
LVeryLongChunkEnd:
        cmpl    $4096,%ecx              // at least another page to go?
        jae     LBigChunkLoop           // yes
// Done.  Call memcpy() again to handle the 0..4095 bytes at the end.

        sfence                          // required by non-temporal stores
        testl   %ecx,%ecx               // anything left to copy?
        jz      1f
        pushl   %ecx                    // arg3 - #bytes to copy (0..4095)
        pushl   %esi                    // arg2 - source
        pushl   %edi                    // arg1 - dest
        movl    $(_COMM_PAGE_MEMCPY),%eax
        call    *%eax                   // copy the short tail
        addl    $12,%esp                // pop off arguments
1:
        popl    %ebx                    // restore caller's registers
        popl    %edi
        popl    %esi
        popl    %ebp
        ret
/* always match for now, as commpage_stuff_routine() will panic if no match */
COMMPAGE_DESCRIPTOR(longcopy_sse3x, _COMM_PAGE_LONGCOPY, 0, 0)