/* apple/xnu.git: osfmk/i386/commpage/longcopy_sse4.s */

/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>


/*
 * The bcopy/memcpy loops for very long operands, tuned for Pentium-M
 * class processors with SSE4 and 64-byte cache lines.
 *
 * The following #defines are tightly coupled to the microarchitecture:
 */

#define kBigChunk   (256*1024)          // outer loop chunk size for kVeryLong-sized operands


// Very long forward moves.  These are at least several pages, so we loop over big
// chunks of memory (kBigChunk in size).  We first prefetch the chunk, and then copy
// it using non-temporal stores.  Hopefully all the reads occur in the prefetch loop,
// so the copy loop reads from L2 and writes directly to memory (with write combining).
// This minimizes bus turnaround and maintains good DRAM page locality.
// Note that for this scheme to work, kVeryLong must be a large fraction of L2 cache
// size.  Otherwise, it is counter-productive to bypass L2 on the stores.
//
// We are called from the commpage bcopy loops when they encounter very long
// operands, with the standard ABI.
//
//      void longcopy(void *dest, const void *sou, size_t len)

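// Illustrative C sketch of the scheme above (a sketch only, not the commpage
// code; touch_chunk() and copy_chunk_nt() are hypothetical helpers sketched
// near the corresponding loops further down):
//
//      #include <stdint.h>
//      #include <string.h>
//
//      #define K_BIG_CHUNK (256*1024)
//
//      static void touch_chunk(const void *src, size_t chunk);              /* hypothetical */
//      static void copy_chunk_nt(void *dst, const void *src, size_t chunk); /* hypothetical */
//
//      void longcopy_sketch(void *dest, const void *sou, size_t len)
//      {
//          uint8_t *d = dest;
//          const uint8_t *s = sou;
//          size_t align = (0 - (uintptr_t)d) & 63;      /* bytes to next cache line */
//          if (align) {                                 /* cache-line align the destination */
//              memcpy(d, s, align);
//              d += align; s += align; len -= align;
//          }
//          while (len >= 4096) {                        /* loop over big chunks */
//              size_t chunk = (len < K_BIG_CHUNK) ? len : K_BIG_CHUNK;
//              chunk &= ~(size_t)4095;                  /* work in page multiples */
//              touch_chunk(s, chunk);                   /* prefetch chunk into L2 */
//              copy_chunk_nt(d, s, chunk);              /* copy with non-temporal stores */
//              d += chunk; s += chunk; len -= chunk;
//          }
//          if (len)
//              memcpy(d, s, len);                       /* 0..4095 byte tail */
//      }
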
        .text
        .align  5, 0x90
Llongcopy_sse4:                         // void longcopy(void *dest, const void *sou, size_t len)
        pushl   %ebp                    // set up a frame for backtraces
        movl    %esp,%ebp
        pushl   %esi
        pushl   %edi
        pushl   %ebx                    // we'll need to use this too
        movl    8(%ebp),%edi            // get dest ptr
        movl    12(%ebp),%esi           // get source ptr
        movl    16(%ebp),%ecx           // get length
        movl    %edi,%ebx               // copy dest ptr
        negl    %ebx
        andl    $63,%ebx                // get #bytes to cache line align destination
        jz      LBigChunkLoop           // already aligned

// Cache line align destination, so the non-temporal stores in the copy loops work right.

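// (Illustrative: the negl/andl pair above computes the distance to the next
// 64-byte boundary; in C, to_align = (0 - (uintptr_t)dest) & 63, so for
// example dest == 0x1005 yields 59.)
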
        pushl   %ebx                    // arg3 - #bytes to align destination (1..63)
        pushl   %esi                    // arg2 - source
        pushl   %edi                    // arg1 - dest
        movl    $(_COMM_PAGE_MEMCPY),%eax
        call    *%eax                   // align the destination
        addl    $12,%esp
        movl    8(%ebp),%edi            // recover dest ptr
        movl    12(%ebp),%esi           // recover source ptr
        movl    16(%ebp),%ecx           // recover length
        addl    %ebx,%esi               // adjust ptrs and length past the alignment copy
        addl    %ebx,%edi
        subl    %ebx,%ecx

// Loop over big chunks.
//      ecx = length remaining (>= 4096)
//      edi = dest (64-byte aligned)
//      esi = source (may be unaligned)

LBigChunkLoop:
        movl    $(kBigChunk),%edx       // assume we can do a full chunk
        cmpl    %edx,%ecx               // do we have a full chunk left to do?
        cmovbl  %ecx,%edx               // if not, only move what we have left
        andl    $-4096,%edx             // we work in page multiples
        xorl    %eax,%eax               // initialize chunk offset
        jmp     LTouchLoop

// Touch in the next chunk.  We try to keep the prefetch unit in "kick-start" mode,
// by touching two adjacent cache lines every 8 lines of each page, in four slices.
// Because the source may be unaligned, we use byte loads to touch.
//      ecx = length remaining (including this chunk)
//      edi = ptr to start of dest chunk
//      esi = ptr to start of source chunk
//      edx = chunk length (a multiple of the page size)
//      ebx = scratch reg used to read a byte of each cache line
//      eax = chunk offset

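// Illustrative C sketch of this touch pattern (a hypothetical helper that
// mirrors the offsets used below; 'chunk' is a multiple of the 4096-byte page):
//
//      #include <stdint.h>
//      #include <stddef.h>
//
//      static void touch_chunk(const void *src, size_t chunk)
//      {
//          const uint8_t *s = src;
//          uint8_t sink = 0;
//          for (size_t page = 0; page < chunk; page += 4096)             /* each page */
//              for (size_t slice = 0; slice < 512; slice += 128)         /* four slices */
//                  for (size_t group = 0; group < 4096; group += 512) {  /* 8-line groups */
//                      sink = *(const volatile uint8_t *)(s + page + slice + group);
//                      sink = *(const volatile uint8_t *)(s + page + slice + group + 64);
//                  }
//          (void)sink;
//      }
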
        .align  4,0x90                  // 16-byte align inner loops
LTouchLoop:
        movzb   (%esi,%eax),%ebx        // touch line 0, 2, 4, or 6 of page
        movzb   1*64(%esi,%eax),%ebx    // touch line 1, 3, 5, or 7
        movzb   8*64(%esi,%eax),%ebx    // touch line 8, 10, 12, or 14
        movzb   9*64(%esi,%eax),%ebx    // etc

        movzb   16*64(%esi,%eax),%ebx
        movzb   17*64(%esi,%eax),%ebx
        movzb   24*64(%esi,%eax),%ebx
        movzb   25*64(%esi,%eax),%ebx

        movzb   32*64(%esi,%eax),%ebx
        movzb   33*64(%esi,%eax),%ebx
        movzb   40*64(%esi,%eax),%ebx
        movzb   41*64(%esi,%eax),%ebx

        movzb   48*64(%esi,%eax),%ebx
        movzb   49*64(%esi,%eax),%ebx
        movzb   56*64(%esi,%eax),%ebx
        movzb   57*64(%esi,%eax),%ebx

        subl    $-128,%eax              // next slice of page (adding 128 with an 8-bit immediate)
        testl   $512,%eax               // done with this page?
        jz      LTouchLoop              // no, next of four slices
        addl    $(4096-512),%eax        // move on to next page
        cmpl    %eax,%edx               // done with this chunk?
        jnz     LTouchLoop              // no, do next page
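// (Illustrative trace: %eax advances 0 -> 128 -> 256 -> 384 within a page; on the
// fourth pass it reaches 512, the jz above falls through, and adding 4096-512 = 3584
// moves %eax to the start of the next page of the chunk.)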

// The chunk has been pre-fetched, now copy it using non-temporal stores.
// There are two copy loops, depending on whether the source is 16-byte aligned
// or not.

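// Illustrative C sketch of the copy phase using SSE2 intrinsics (a hypothetical
// helper; the code below instead selects one of two unrolled 128-byte loops
// once per chunk):
//
//      #include <emmintrin.h>      /* _mm_load_si128, _mm_loadu_si128, _mm_stream_si128 */
//      #include <stdint.h>
//      #include <stddef.h>
//
//      static void copy_chunk_nt(void *dst, const void *src, size_t chunk)
//      {
//          __m128i *d = dst;                    /* dst is 64-byte aligned */
//          const __m128i *s = src;              /* src may be unaligned */
//          int aligned = (((uintptr_t)src & 15) == 0);
//          for (size_t i = 0; i < chunk / 16; i++) {
//              __m128i x = aligned ? _mm_load_si128(s + i) : _mm_loadu_si128(s + i);
//              _mm_stream_si128(d + i, x);      /* movntdq: store bypassing the cache */
//          }
//          /* the assembly below issues a single sfence after the last chunk */
//      }
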
        addl    %edx,%esi               // increment ptrs by chunk length
        addl    %edx,%edi
        subl    %edx,%ecx               // adjust remaining length
        negl    %edx                    // prepare loop index (counts up to 0)
        testl   $15,%esi                // is source 16-byte aligned?
        jnz     LVeryLongUnaligned      // source is not aligned
        jmp     LVeryLongAligned

        .align  4,0x90                  // 16-byte align inner loops
LVeryLongAligned:                       // aligned loop over 128-byte blocks
        movdqa  (%esi,%edx),%xmm0
        movdqa  16(%esi,%edx),%xmm1
        movdqa  32(%esi,%edx),%xmm2
        movdqa  48(%esi,%edx),%xmm3
        movdqa  64(%esi,%edx),%xmm4
        movdqa  80(%esi,%edx),%xmm5
        movdqa  96(%esi,%edx),%xmm6
        movdqa  112(%esi,%edx),%xmm7

        movntdq %xmm0,(%edi,%edx)
        movntdq %xmm1,16(%edi,%edx)
        movntdq %xmm2,32(%edi,%edx)
        movntdq %xmm3,48(%edi,%edx)
        movntdq %xmm4,64(%edi,%edx)
        movntdq %xmm5,80(%edi,%edx)
        movntdq %xmm6,96(%edi,%edx)
        movntdq %xmm7,112(%edi,%edx)

        subl    $-128,%edx              // add 128 with an 8-bit immediate
        jnz     LVeryLongAligned
        jmp     LVeryLongChunkEnd

        .align  4,0x90                  // 16-byte align inner loops
LVeryLongUnaligned:                     // unaligned loop over 128-byte blocks
        movdqu  (%esi,%edx),%xmm0
        movdqu  16(%esi,%edx),%xmm1
        movdqu  32(%esi,%edx),%xmm2
        movdqu  48(%esi,%edx),%xmm3
        movdqu  64(%esi,%edx),%xmm4
        movdqu  80(%esi,%edx),%xmm5
        movdqu  96(%esi,%edx),%xmm6
        movdqu  112(%esi,%edx),%xmm7

        movntdq %xmm0,(%edi,%edx)
        movntdq %xmm1,16(%edi,%edx)
        movntdq %xmm2,32(%edi,%edx)
        movntdq %xmm3,48(%edi,%edx)
        movntdq %xmm4,64(%edi,%edx)
        movntdq %xmm5,80(%edi,%edx)
        movntdq %xmm6,96(%edi,%edx)
        movntdq %xmm7,112(%edi,%edx)

        subl    $-128,%edx              // add 128 with an 8-bit immediate
        jnz     LVeryLongUnaligned

LVeryLongChunkEnd:
        cmpl    $4096,%ecx              // at least another page to go?
        jae     LBigChunkLoop           // yes

// Done.  Call memcpy() again to handle the 0-4095 bytes at the end.

        sfence                          // required by non-temporal stores
        testl   %ecx,%ecx               // anything left to copy?
        jz      1f
        pushl   %ecx                    // arg3 - #bytes remaining (1..4095)
        pushl   %esi                    // arg2 - source
        pushl   %edi                    // arg1 - dest
        movl    $(_COMM_PAGE_MEMCPY),%eax
        call    *%eax                   // copy the remaining bytes
        addl    $12,%esp                // pop off arguments
1:
        popl    %ebx
        popl    %edi
        popl    %esi
        popl    %ebp
        ret

        /* always match for now, as commpage_stuff_routine() will panic if no match */
        COMMPAGE_DESCRIPTOR(longcopy_sse4,_COMM_PAGE_LONGCOPY,0,0)