osfmk/i386/commpage/longcopy_sse4_64.s

   1 /*
   2  * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28
  29 #include <machine/cpu_capabilities.h>
  30 #include <machine/commpage.h>
  31
  32
   33 /*
   34  * The bcopy/memcpy loops for very long operands, tuned for 64-bit
   35  * Pentium-M class processors with SSE4 and 64-byte cache lines.
   36  * This is the 64-bit version.
   37  *
   38  * The following #defines are tightly coupled to the u-architecture:
   39  */
   40
      // kBigChunk is 256*1024 = 262144 bytes.  LBigChunkLoop loads it into r8d as the
      // size of a full chunk; each chunk is first touched with byte loads and then
      // copied with non-temporal stores.  When less than a full chunk remains, the
      // same code uses the remaining length rounded down to a multiple of 4096.
   41 #define kBigChunk   (256*1024)          // outer loop chunk size for kVeryLong sized operands
  42
  43
   44 // Very long forward moves.  These are at least several pages, so we loop over big
   45 // chunks of memory (kBigChunk in size.)  We first prefetch the chunk, and then copy
   46 // it using non-temporal stores.  Hopefully all the reads occur in the prefetch loop,
   47 // so the copy loop reads from L2 and writes directly to memory (with write combining.)
   48 // This minimizes bus turnaround and maintains good DRAM page locality.
   49 // Note that for this scheme to work, kVeryLong must be a large fraction of L2 cache
   50 // size.  Otherwise, it is counter-productive to bypass L2 on the stores.
      // (kVeryLong is not defined in this file; presumably it is the callers' threshold
      // for dispatching here -- confirm against the commpage bcopy sources.)
   51 //
   52 // We are called from the commpage bcopy loops when they encounter very long
   53 // operands, with the standard ABI:
   54 //      rdi = dest ptr
   55 //      rsi = source ptr
   56 //      rdx = length (>= 8kb, probably much bigger)
      //
      // Outline:
      //   1. If dest is not 64-byte aligned, call the commpage memcpy to copy the
      //      1..63 bytes needed to align it.
      //   2. Repeatedly take min(kBigChunk, length remaining) rounded down to a multiple
      //      of 4096, touch that much of the source, then copy it 128 bytes at a time
      //      with movntdq stores, until fewer than 4096 bytes remain.
      //   3. Issue an sfence, then call the commpage memcpy for the 0..4095 byte tail.
      //
      // Registers written here: rax, rcx, rdx, rsi, rdi, r8, xmm0-xmm7 and the flags,
      // plus whatever the commpage memcpy itself modifies.  rbp is saved and restored.
      // This routine does not set up a return value of its own.
   57
   58         .text
   59         .code64
   60         .align 5, 0x90                  // 2^5 = 32-byte align the entry point, padding with 0x90
   61 Llongcopy_sse4_64:                      // void longcopy(void *dest, const void *source, size_t len)
   62         pushq   %rbp                    // set up a frame for backtraces
   63         movq    %rsp,%rbp
   64         movl    %edi,%eax               // copy dest ptr (low 32 bits are enough for alignment math)
   65         negl    %eax                    // low 6 bits of -dest = distance up to the next 64-byte boundary
   66         andl    $63,%eax                // get #bytes to cache line align destination (0..63)
   67         jz      LBigChunkLoop           // already aligned
   68
   69 // Cache line align destination, so non-temporal stores in copy loops work right.
   70 // The recursive call returns with the source and dest ptrs properly updated;
      // the code below relies on that, since it does not adjust rdi or rsi itself.
   71
   72         subq    %rax,%rdx               // get length remaining after dest is aligned
   73         pushq   %rdx                    // save length remaining
   74         movl    %eax,%edx               // #bytes to copy to align destination
   75         movq    $_COMM_PAGE_32_TO_64(_COMM_PAGE_MEMCPY),%rax    // rax = memcpy entry, judging by the macro name
   76         call    *%rax                   // copy the 1..63 alignment bytes
   77         popq    %rdx                    // recover adjusted length
   78
   79 // Loop over big chunks.
   80 //      rdx = length remaining (>= 4096)
   81 //      rdi = dest (64-byte aligned)
   82 //      rsi = source (may be unaligned)
   83
   84 LBigChunkLoop:
   85         movl    $(kBigChunk),%r8d       // assume we can do a full chunk
   86         cmpq    %r8,%rdx                // do we have a full chunk left to do?
   87         cmovbl  %edx,%r8d               // if not, only move what we have left (rdx < kBigChunk, so edx holds all of it)
   88         andl    $-4096,%r8d             // we work in page multiples; any remainder stays counted in rdx
   89         xorl    %eax,%eax               // initialize chunk offset
   90         jmp     LTouchLoop              // skip over the alignment padding
   91
   92 // Touch in the next chunk.  We try to keep the prefetch unit in "kick-start" mode,
   93 // by touching two adjacent cache lines every 8 lines of each page, in four slices.
   94 // Because the source may be unaligned, we use byte loads to touch.
      // Each pass below does 16 byte loads, then the offset advances by 128 bytes (two
      // lines), so four passes reach all 64 of the 64-byte strides in a 4096-byte page.
   95 //      rdx = length remaining (including this chunk)
   96 //      rdi = ptr to start of dest chunk
   97 //      rsi = ptr to start of source chunk
   98 //      r8d = chunk length (multiples of pages, less than 2**32)
   99 //      ecx = scratch reg used to read a byte of each cache line
  100 //      eax = chunk offset
  101
  102         .align  4,0x90                  // 16-byte align inner loops
  103 LTouchLoop:
  104         movzb   (%rsi,%rax),%ecx        // touch line 0, 2, 4, or 6 of page
  105         movzb   1*64(%rsi,%rax),%ecx    // touch line 1, 3, 5, or 7
  106         movzb   8*64(%rsi,%rax),%ecx    // touch line 8, 10, 12, or 14
  107         movzb   9*64(%rsi,%rax),%ecx    // etc
  108
  109         movzb   16*64(%rsi,%rax),%ecx
  110         movzb   17*64(%rsi,%rax),%ecx
  111         movzb   24*64(%rsi,%rax),%ecx
  112         movzb   25*64(%rsi,%rax),%ecx
  113
  114         movzb   32*64(%rsi,%rax),%ecx
  115         movzb   33*64(%rsi,%rax),%ecx
  116         movzb   40*64(%rsi,%rax),%ecx
  117         movzb   41*64(%rsi,%rax),%ecx
  118
  119         movzb   48*64(%rsi,%rax),%ecx
  120         movzb   49*64(%rsi,%rax),%ecx
  121         movzb   56*64(%rsi,%rax),%ecx
  122         movzb   57*64(%rsi,%rax),%ecx
  123
  124         subl    $-128,%eax              // next slice of page (adding 128 w 8-bit immediate)
  125         testl   $512,%eax               // done with this page? (bit 9 becomes set after the fourth slice)
  126         jz      LTouchLoop              // no, next of four slices
  127         addl    $(4096-512),%eax        // move on to next page
  128         cmpl    %eax,%r8d               // done with this chunk?
  129         jnz     LTouchLoop              // no, do next page
  130
  131 // The chunk has been pre-fetched, now copy it using non-temporal stores.
  132 // There are two copy loops, depending on whether the source is 16-byte aligned
  133 // or not.
      // Both pointers are first advanced to the end of the chunk and rcx is set to
      // minus the chunk length, so (%rsi,%rcx) and (%rdi,%rcx) start at the chunk's
      // first byte and the loops finish when rcx reaches zero.
  134
  135         movl    %r8d,%ecx               // copy chunk size to a reg that doesn't use REX prefix
  136         addq    %rcx,%rsi               // increment ptrs by chunk length
  137         addq    %rcx,%rdi
  138         subq    %rcx,%rdx               // adjust remaining length
  139         negq    %rcx                    // prepare loop index (counts up to 0)
  140         testl   $15,%esi                // is source 16-byte aligned? (chunk length is a multiple of 4096, so this matches the chunk start)
  141         jnz     LVeryLongUnaligned      // no
  142         jmp     LVeryLongAligned        // yes
  143
  144         .align  4,0x90                  // 16-byte align inner loops
  145 LVeryLongAligned:                       // aligned loop over 128-bytes
  146         movdqa  (%rsi,%rcx),%xmm0       // load 128 bytes with aligned 16-byte loads
  147         movdqa  16(%rsi,%rcx),%xmm1
  148         movdqa  32(%rsi,%rcx),%xmm2
  149         movdqa  48(%rsi,%rcx),%xmm3
  150         movdqa  64(%rsi,%rcx),%xmm4
  151         movdqa  80(%rsi,%rcx),%xmm5
  152         movdqa  96(%rsi,%rcx),%xmm6
  153         movdqa  112(%rsi,%rcx),%xmm7
  154
  155         movntdq %xmm0,(%rdi,%rcx)       // write them out with non-temporal stores
  156         movntdq %xmm1,16(%rdi,%rcx)
  157         movntdq %xmm2,32(%rdi,%rcx)
  158         movntdq %xmm3,48(%rdi,%rcx)
  159         movntdq %xmm4,64(%rdi,%rcx)
  160         movntdq %xmm5,80(%rdi,%rcx)
  161         movntdq %xmm6,96(%rdi,%rcx)
  162         movntdq %xmm7,112(%rdi,%rcx)
  163
  164         subq    $-128,%rcx              // add 128 with an 8-bit immediate
  165         jnz     LVeryLongAligned        // loop until the index reaches 0 (end of chunk)
  166         jmp     LVeryLongChunkEnd       // skip the unaligned-source loop
  167
  168         .align  4,0x90                  // 16-byte align inner loops
  169 LVeryLongUnaligned:                     // unaligned loop over 128-bytes
  170         movdqu  (%rsi,%rcx),%xmm0       // load 128 bytes with unaligned 16-byte loads
  171         movdqu  16(%rsi,%rcx),%xmm1
  172         movdqu  32(%rsi,%rcx),%xmm2
  173         movdqu  48(%rsi,%rcx),%xmm3
  174         movdqu  64(%rsi,%rcx),%xmm4
  175         movdqu  80(%rsi,%rcx),%xmm5
  176         movdqu  96(%rsi,%rcx),%xmm6
  177         movdqu  112(%rsi,%rcx),%xmm7
  178
  179         movntdq %xmm0,(%rdi,%rcx)       // destination is still aligned, so the stores match the aligned loop
  180         movntdq %xmm1,16(%rdi,%rcx)
  181         movntdq %xmm2,32(%rdi,%rcx)
  182         movntdq %xmm3,48(%rdi,%rcx)
  183         movntdq %xmm4,64(%rdi,%rcx)
  184         movntdq %xmm5,80(%rdi,%rcx)
  185         movntdq %xmm6,96(%rdi,%rcx)
  186         movntdq %xmm7,112(%rdi,%rcx)
  187
  188         subq    $-128,%rcx              // add 128 with an 8-bit immediate
  189         jnz     LVeryLongUnaligned      // loop until the index reaches 0, then fall through
  190
  191 LVeryLongChunkEnd:
  192         cmpq    $4096,%rdx              // at least another page to go?
  193         jae     LBigChunkLoop           // yes
  194
  195 // Done.  Call memcpy() again to handle the 0-4095 bytes at the end.
  196 // We still have the args in the right registers:
  197 //      rdi = destination ptr
  198 //      rsi = source ptr
  199 //      rdx = length remaining (0..4095)
  200
  201         sfence                          // required by non-temporal stores
  202         testl   %edx,%edx               // anything left to copy? (rdx < 4096, so edx is the whole count)
  203         jz      1f                      // no, skip the call
  204         movq    $_COMM_PAGE_32_TO_64(_COMM_PAGE_MEMCPY),%rax    // same memcpy entry as used for the alignment copy
  205         call    *%rax                   // copy the tail
  206 1:
  207         popq    %rbp                    // restore frame ptr
  208         ret
 209
  210         /* always match for now, as commpage_stuff_routine() will panic if no match */
              /*
               * NOTE(review): presumably this ties the routine above to the
               * _COMM_PAGE_LONGCOPY address, with the two trailing zeros being the
               * selection masks that make it "always match" -- confirm against the
               * COMMPAGE_DESCRIPTOR definition in machine/commpage.h.
               */
  211         COMMPAGE_DESCRIPTOR(longcopy_sse4_64, _COMM_PAGE_LONGCOPY, 0 ,0)