From: Apple Date: Wed, 9 Oct 2013 15:32:24 +0000 (+0000) Subject: Libc-825.40.1.tar.gz X-Git-Tag: mac-os-x-1085^0 X-Git-Url: https://git.saurik.com/apple/libc.git/commitdiff_plain/a28bf75d63c6a64e4c3b417c6052e45f42c6cedd Libc-825.40.1.tar.gz --- diff --git a/Libc.xcodeproj/project.pbxproj b/Libc.xcodeproj/project.pbxproj index 987a2e0..102a28f 100644 --- a/Libc.xcodeproj/project.pbxproj +++ b/Libc.xcodeproj/project.pbxproj @@ -681,7 +681,6 @@ C942110F13900C8A004BA536 /* memcpy.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53838138D9E990028D27C /* memcpy.c */; }; C942111013900C8A004BA536 /* memmove.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53839138D9E990028D27C /* memmove.c */; }; C942111113900C8A004BA536 /* atomic.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53846138D9E990028D27C /* atomic.c */; }; - C942111213900C8A004BA536 /* mach_absolute_time.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53848138D9E990028D27C /* mach_absolute_time.c */; }; C942111313900C8A004BA536 /* spinlocks.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B5384D138D9E990028D27C /* spinlocks.c */; }; C942112613900C8A004BA536 /* ascii.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53902138D9E990028D27C /* ascii.c */; }; C942112713900C8A004BA536 /* big5.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53905138D9E990028D27C /* big5.c */; }; @@ -1407,7 +1406,6 @@ C95B7FBA138F3C55004311DA /* memcpy.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53838138D9E990028D27C /* memcpy.c */; }; C95B7FBB138F3C55004311DA /* memmove.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53839138D9E990028D27C /* memmove.c */; }; C95B7FBC138F3C55004311DA /* atomic.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53846138D9E990028D27C /* atomic.c */; }; - C95B7FBD138F3C55004311DA /* mach_absolute_time.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53848138D9E990028D27C /* mach_absolute_time.c */; }; C95B7FBE138F3C55004311DA /* spinlocks.c in Sources */ = {isa = 
PBXBuildFile; fileRef = C9B5384D138D9E990028D27C /* spinlocks.c */; }; C95B7FD1138F3C55004311DA /* ascii.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53902138D9E990028D27C /* ascii.c */; }; C95B7FD2138F3C55004311DA /* big5.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53905138D9E990028D27C /* big5.c */; }; @@ -2027,7 +2025,6 @@ C95B8265138F52B0004311DA /* memcpy.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53838138D9E990028D27C /* memcpy.c */; }; C95B8266138F52B0004311DA /* memmove.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53839138D9E990028D27C /* memmove.c */; }; C95B8267138F52B0004311DA /* atomic.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53846138D9E990028D27C /* atomic.c */; }; - C95B8268138F52B0004311DA /* mach_absolute_time.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53848138D9E990028D27C /* mach_absolute_time.c */; }; C95B8269138F52B0004311DA /* spinlocks.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B5384D138D9E990028D27C /* spinlocks.c */; }; C95B827C138F52B0004311DA /* ascii.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53902138D9E990028D27C /* ascii.c */; }; C95B827D138F52B0004311DA /* big5.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53905138D9E990028D27C /* big5.c */; }; @@ -2647,7 +2644,6 @@ C95B850B138F53DB004311DA /* memcpy.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53838138D9E990028D27C /* memcpy.c */; }; C95B850C138F53DB004311DA /* memmove.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53839138D9E990028D27C /* memmove.c */; }; C95B850D138F53DB004311DA /* atomic.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53846138D9E990028D27C /* atomic.c */; }; - C95B850E138F53DB004311DA /* mach_absolute_time.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53848138D9E990028D27C /* mach_absolute_time.c */; }; C95B850F138F53DB004311DA /* spinlocks.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B5384D138D9E990028D27C /* spinlocks.c */; }; C95B8522138F53DB004311DA /* ascii.c 
in Sources */ = {isa = PBXBuildFile; fileRef = C9B53902138D9E990028D27C /* ascii.c */; }; C95B8523138F53DB004311DA /* big5.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53905138D9E990028D27C /* big5.c */; }; @@ -3268,7 +3264,6 @@ C9765F8F138EC61900741512 /* memcpy.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53838138D9E990028D27C /* memcpy.c */; }; C9765F90138EC61900741512 /* memmove.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53839138D9E990028D27C /* memmove.c */; }; C9765F91138EC61900741512 /* atomic.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53846138D9E990028D27C /* atomic.c */; }; - C9765F92138EC61900741512 /* mach_absolute_time.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53848138D9E990028D27C /* mach_absolute_time.c */; }; C9765F93138EC61900741512 /* spinlocks.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B5384D138D9E990028D27C /* spinlocks.c */; }; C9765FA6138EC61900741512 /* ascii.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53902138D9E990028D27C /* ascii.c */; }; C9765FA7138EC61900741512 /* big5.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53905138D9E990028D27C /* big5.c */; }; @@ -3875,7 +3870,6 @@ C9EB2F79138F68A80075BB52 /* _sigtramp.s in Sources */ = {isa = PBXBuildFile; fileRef = C9B53845138D9E990028D27C /* _sigtramp.s */; }; C9EB2F7A138F68A80075BB52 /* atomic.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53846138D9E990028D27C /* atomic.c */; }; C9EB2F7B138F68A80075BB52 /* i386_gettimeofday_asm.s in Sources */ = {isa = PBXBuildFile; fileRef = C9B53847138D9E990028D27C /* i386_gettimeofday_asm.s */; }; - C9EB2F7C138F68A80075BB52 /* mach_absolute_time.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53848138D9E990028D27C /* mach_absolute_time.c */; }; C9EB2F7D138F68A80075BB52 /* mach_absolute_time_asm.s in Sources */ = {isa = PBXBuildFile; fileRef = C9B53849138D9E990028D27C /* mach_absolute_time_asm.s */; }; C9EB2F7E138F68A80075BB52 /* OSAtomic.s in Sources */ = {isa = PBXBuildFile; fileRef 
= C9B5384B138D9E990028D27C /* OSAtomic.s */; }; C9EB2F7F138F68A80075BB52 /* setjmp.s in Sources */ = {isa = PBXBuildFile; fileRef = C9B5384C138D9E990028D27C /* setjmp.s */; }; @@ -4141,7 +4135,6 @@ C9EB30AA138F6D880075BB52 /* memcpy.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53838138D9E990028D27C /* memcpy.c */; }; C9EB30AB138F6D880075BB52 /* memmove.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53839138D9E990028D27C /* memmove.c */; }; C9EB30AC138F6D880075BB52 /* atomic.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53846138D9E990028D27C /* atomic.c */; }; - C9EB30AD138F6D880075BB52 /* mach_absolute_time.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53848138D9E990028D27C /* mach_absolute_time.c */; }; C9EB30AE138F6D880075BB52 /* spinlocks.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B5384D138D9E990028D27C /* spinlocks.c */; }; C9EB30C1138F6D880075BB52 /* ascii.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53902138D9E990028D27C /* ascii.c */; }; C9EB30C2138F6D880075BB52 /* big5.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53905138D9E990028D27C /* big5.c */; }; @@ -4761,7 +4754,6 @@ C9EB3351138F75580075BB52 /* memcpy.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53838138D9E990028D27C /* memcpy.c */; }; C9EB3352138F75580075BB52 /* memmove.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53839138D9E990028D27C /* memmove.c */; }; C9EB3353138F75580075BB52 /* atomic.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53846138D9E990028D27C /* atomic.c */; }; - C9EB3354138F75580075BB52 /* mach_absolute_time.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53848138D9E990028D27C /* mach_absolute_time.c */; }; C9EB3355138F75580075BB52 /* spinlocks.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B5384D138D9E990028D27C /* spinlocks.c */; }; C9EB3368138F75580075BB52 /* ascii.c in Sources */ = {isa = PBXBuildFile; fileRef = C9B53902138D9E990028D27C /* ascii.c */; }; C9EB3369138F75580075BB52 /* big5.c in Sources */ = {isa 
= PBXBuildFile; fileRef = C9B53905138D9E990028D27C /* big5.c */; }; @@ -5858,7 +5850,6 @@ C9B53845138D9E990028D27C /* _sigtramp.s */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.asm; path = _sigtramp.s; sourceTree = ""; }; C9B53846138D9E990028D27C /* atomic.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; path = atomic.c; sourceTree = ""; }; C9B53847138D9E990028D27C /* i386_gettimeofday_asm.s */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.asm; path = i386_gettimeofday_asm.s; sourceTree = ""; }; - C9B53848138D9E990028D27C /* mach_absolute_time.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; path = mach_absolute_time.c; sourceTree = ""; }; C9B53849138D9E990028D27C /* mach_absolute_time_asm.s */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.asm; path = mach_absolute_time_asm.s; sourceTree = ""; }; C9B5384B138D9E990028D27C /* OSAtomic.s */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.asm; path = OSAtomic.s; sourceTree = ""; }; C9B5384C138D9E990028D27C /* setjmp.s */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.asm; path = setjmp.s; sourceTree = ""; }; @@ -7753,7 +7744,6 @@ C9B53845138D9E990028D27C /* _sigtramp.s */, C9B53846138D9E990028D27C /* atomic.c */, C9B53847138D9E990028D27C /* i386_gettimeofday_asm.s */, - C9B53848138D9E990028D27C /* mach_absolute_time.c */, C9B53849138D9E990028D27C /* mach_absolute_time_asm.s */, C9B5384B138D9E990028D27C /* OSAtomic.s */, C9B5384C138D9E990028D27C /* setjmp.s */, @@ -10733,7 +10723,6 @@ C942110F13900C8A004BA536 /* memcpy.c in Sources */, C942111013900C8A004BA536 /* memmove.c in Sources */, C942111113900C8A004BA536 /* atomic.c in Sources */, - C942111213900C8A004BA536 /* mach_absolute_time.c in Sources */, C942111313900C8A004BA536 /* spinlocks.c in Sources */, C942112613900C8A004BA536 /* ascii.c in Sources */, C942112713900C8A004BA536 /* big5.c in Sources */, @@ -11378,7 +11367,6 @@ C95B7FBA138F3C55004311DA /* memcpy.c in 
Sources */, C95B7FBB138F3C55004311DA /* memmove.c in Sources */, C95B7FBC138F3C55004311DA /* atomic.c in Sources */, - C95B7FBD138F3C55004311DA /* mach_absolute_time.c in Sources */, C95B7FBE138F3C55004311DA /* spinlocks.c in Sources */, C95B7FD1138F3C55004311DA /* ascii.c in Sources */, C95B7FD2138F3C55004311DA /* big5.c in Sources */, @@ -12009,7 +11997,6 @@ C95B8265138F52B0004311DA /* memcpy.c in Sources */, C95B8266138F52B0004311DA /* memmove.c in Sources */, C95B8267138F52B0004311DA /* atomic.c in Sources */, - C95B8268138F52B0004311DA /* mach_absolute_time.c in Sources */, C95B8269138F52B0004311DA /* spinlocks.c in Sources */, C95B827C138F52B0004311DA /* ascii.c in Sources */, C95B827D138F52B0004311DA /* big5.c in Sources */, @@ -12640,7 +12627,6 @@ C95B850B138F53DB004311DA /* memcpy.c in Sources */, C95B850C138F53DB004311DA /* memmove.c in Sources */, C95B850D138F53DB004311DA /* atomic.c in Sources */, - C95B850E138F53DB004311DA /* mach_absolute_time.c in Sources */, C95B850F138F53DB004311DA /* spinlocks.c in Sources */, C95B8522138F53DB004311DA /* ascii.c in Sources */, C95B8523138F53DB004311DA /* big5.c in Sources */, @@ -13129,7 +13115,6 @@ C9EB2F79138F68A80075BB52 /* _sigtramp.s in Sources */, C9EB2F7A138F68A80075BB52 /* atomic.c in Sources */, C9EB2F7B138F68A80075BB52 /* i386_gettimeofday_asm.s in Sources */, - C9EB2F7C138F68A80075BB52 /* mach_absolute_time.c in Sources */, C9EB2F7D138F68A80075BB52 /* mach_absolute_time_asm.s in Sources */, C9EB2F7E138F68A80075BB52 /* OSAtomic.s in Sources */, C9EB2F7F138F68A80075BB52 /* setjmp.s in Sources */, @@ -13584,7 +13569,6 @@ C9765F8F138EC61900741512 /* memcpy.c in Sources */, C9765F90138EC61900741512 /* memmove.c in Sources */, C9765F91138EC61900741512 /* atomic.c in Sources */, - C9765F92138EC61900741512 /* mach_absolute_time.c in Sources */, C9765F93138EC61900741512 /* spinlocks.c in Sources */, C9765FA6138EC61900741512 /* ascii.c in Sources */, C9765FA7138EC61900741512 /* big5.c in Sources */, @@ -14215,7 
+14199,6 @@ C9EB30AA138F6D880075BB52 /* memcpy.c in Sources */, C9EB30AB138F6D880075BB52 /* memmove.c in Sources */, C9EB30AC138F6D880075BB52 /* atomic.c in Sources */, - C9EB30AD138F6D880075BB52 /* mach_absolute_time.c in Sources */, C9EB30AE138F6D880075BB52 /* spinlocks.c in Sources */, C9EB30C1138F6D880075BB52 /* ascii.c in Sources */, C9EB30C2138F6D880075BB52 /* big5.c in Sources */, @@ -14847,7 +14830,6 @@ C9EB3351138F75580075BB52 /* memcpy.c in Sources */, C9EB3352138F75580075BB52 /* memmove.c in Sources */, C9EB3353138F75580075BB52 /* atomic.c in Sources */, - C9EB3354138F75580075BB52 /* mach_absolute_time.c in Sources */, C9EB3355138F75580075BB52 /* spinlocks.c in Sources */, C9EB3368138F75580075BB52 /* ascii.c in Sources */, C9EB3369138F75580075BB52 /* big5.c in Sources */, diff --git a/i386/string/bcopy_sse2.s b/i386/string/bcopy_sse2.s index 4e001b7..e69de29 100644 --- a/i386/string/bcopy_sse2.s +++ b/i386/string/bcopy_sse2.s @@ -1,471 +0,0 @@ -/* - * Copyright (c) 2005-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include - -/* - * The bcopy/memcpy loops, tuned for Pentium-M class processors with SSE2 - * and 64-byte cache lines, such as Core and Core 2. - * - * The following #defines are tightly coupled to the u-architecture: - */ - -#define kShort 80 // too short to bother with SSE (must be >=80) -#define kVeryLong (500*1024) // large enough for non-temporal stores (must be >= 8192) -#define kBigChunk (256*1024) // outer loop chunk size for kVeryLong sized operands -#define kFastUCode (16*1024) // cutoff for microcode fastpath for "rep/movsl" - - -// void bcopy(const void *src, void *dst, size_t len); - -PLATFUNC_FUNCTION_START(bcopy, sse2, 32, 5) - pushl %ebp // set up a frame for backtraces - movl %esp,%ebp - pushl %esi - pushl %edi - movl 8(%ebp),%esi // get source ptr - movl 12(%ebp),%edi // get dest ptr - jmp Ljoin - -// -// void *memcpy(void *dst, const void *src, size_t len); -// void *memmove(void *dst, const void *src, size_t len); -// - -PLATFUNC_FUNCTION_START(memcpy, sse2, 32, 0) // void *memcpy(void *dst, const void *src, size_t len) -PLATFUNC_FUNCTION_START(memmove, sse2, 32, 0) // void *memmove(void *dst, const void *src, size_t len) -Lmemcpy_sse2: - pushl %ebp // set up a frame for backtraces - movl %esp,%ebp - pushl %esi - pushl %edi - movl 8(%ebp),%edi // get dest ptr - movl 12(%ebp),%esi // get source ptr - -Ljoin: // here from bcopy() with esi and edi loaded - movl 16(%ebp),%ecx // get length - movl %edi,%edx - subl 
%esi,%edx // (dest - source) - cmpl %ecx,%edx // must move in reverse if (dest - source) < length - jb LReverseIsland -Lrejoin: // here from very-long-operand copies - cmpl $(kShort),%ecx // long enough to bother with SSE? - ja LNotShort // yes - -// Handle short forward copies. As the most common case, this is the fall-through path. -// ecx = length (<= kShort) -// esi = source ptr -// edi = dest ptr - -Lshort: - movl %ecx,%edx // copy length - shrl $2,%ecx // get #doublewords - jz LLeftovers -2: // loop copying doublewords - movl (%esi),%eax - addl $4,%esi - movl %eax,(%edi) - addl $4,%edi - dec %ecx - jnz 2b -LLeftovers: // handle leftover bytes (0..3) in last word - andl $3,%edx // any leftover bytes? - jz 5f -4: // loop copying bytes - movb (%esi),%al - inc %esi -movb %al,(%edi) - inc %edi - dec %edx - jnz 4b -5: - movl 8(%ebp),%eax // get return value (dst ptr) for memcpy/memmove - popl %edi - popl %esi - popl %ebp - ret - - -LReverseIsland: // keep the "jb" above a short branch... - jmp LReverse // ...because reverse moves are uncommon - - -// Handle forward moves that are long enough to justify use of SSE3. -// First, 16-byte align the destination. -// ecx = length (> kShort) -// esi = source ptr -// edi = dest ptr - -LNotShort: - cmpl $(kVeryLong),%ecx // long enough to justify heavyweight loops? - movl %edi,%edx // copy destination - jae LVeryLong // use very-long-operand path - negl %edx - andl $15,%edx // get #bytes to align destination - jz LDestAligned // already aligned - subl %edx,%ecx // decrement length -1: // loop copying 1..15 bytes - movb (%esi),%al - inc %esi - movb %al,(%edi) - inc %edi - dec %edx - jnz 1b - -// Destination is now aligned. Prepare for forward loops over 64-byte chunks. -// Since kShort>=80 and we've moved at most 15 bytes already, there is at least one chunk. 
- -LDestAligned: - movl %ecx,%edx // copy length - movl %ecx,%eax // twice - andl $63,%ecx // get remaining bytes for Lshort - andl $-64,%edx // get number of bytes we will copy in inner loop - addl %edx,%esi // point to 1st byte not copied - addl %edx,%edi - negl %edx // now generate offset to 1st byte to be copied - testl $15,%esi // is source aligned too? - jnz LUnalignedLoop // no - cmpl $(kFastUCode),%eax // long enough for the fastpath in microcode? - jb LAlignedLoop // no, use SSE - cld // we'll move forward - movl %eax,%ecx // copy length again - shrl $2,%ecx // compute #words to move - addl %edx,%esi // restore ptrs to 1st byte of source and dest - addl %edx,%edi - rep // the u-code will optimize this - movsl - movl %eax,%edx // original length - jmp LLeftovers // handle 0..3 leftover bytes - - -// Forward aligned loop for medium length operands (kShort < n < kVeryLong). - - .align 4,0x90 // 16-byte align inner loops -LAlignedLoop: // loop over 64-byte chunks - movdqa (%esi,%edx),%xmm0 - movdqa 16(%esi,%edx),%xmm1 - movdqa 32(%esi,%edx),%xmm2 - movdqa 48(%esi,%edx),%xmm3 - - movdqa %xmm0,(%edi,%edx) - movdqa %xmm1,16(%edi,%edx) - movdqa %xmm2,32(%edi,%edx) - movdqa %xmm3,48(%edi,%edx) - - addl $64,%edx - jnz LAlignedLoop - - jmp Lshort // copy remaining 0..15 bytes and done - - -// Forward unaligned loop for medium length operands (kShort < n < kVeryLong). -// Note that LDDQU==MOVDQU on these machines, ie we don't care when we cross -// source cache lines. - - .align 4,0x90 // 16-byte align inner loops -LUnalignedLoop: // loop over 64-byte chunks - movdqu (%esi,%edx),%xmm0 // the loads are unaligned - movdqu 16(%esi,%edx),%xmm1 - movdqu 32(%esi,%edx),%xmm2 - movdqu 48(%esi,%edx),%xmm3 - - movdqa %xmm0,(%edi,%edx) // we can use aligned stores - movdqa %xmm1,16(%edi,%edx) - movdqa %xmm2,32(%edi,%edx) - movdqa %xmm3,48(%edi,%edx) - - addl $64,%edx - jnz LUnalignedLoop - - jmp Lshort // copy remaining 0..63 bytes and done - - -// Very long forward moves. 
These are at least several pages, so we loop over big -// chunks of memory (kBigChunk in size.) We first prefetch the chunk, and then copy -// it using non-temporal stores. Hopefully all the reads occur in the prefetch loop, -// so the copy loop reads from L2 and writes directly to memory (with write combining.) -// This minimizes bus turnaround and maintains good DRAM page locality. -// Note that for this scheme to work, kVeryLong must be a large fraction of L2 cache -// size. Otherwise, it is counter-productive to bypass L2 on the stores. -// ecx = length (>= kVeryLong bytes) -// edi = dest (aligned) -// esi = source - -LVeryLong: - pushl %ebx // we'll need to use this - movl %edi,%ebx // copy dest ptr - negl %ebx - andl $63,%ebx // get #bytes to cache line align destination - jz LBigChunkLoop // already aligned - -// Cache line align destination, so temporal stores in copy loops work right. - - pushl %ecx // save total length remaining - pushl %ebx // arg3 - #bytes to align destination (1..63) - pushl %esi // arg2 - source - pushl %edi // arg1 - dest - call Lmemcpy_sse2 // align the destination - movl 12(%esp),%ecx // recover total length - addl $16,%esp - addl %ebx,%esi // adjust ptrs and lengths past copy - addl %ebx,%edi - subl %ebx,%ecx - -// Loop over big chunks. -// ecx = length remaining (>= 4096) -// edi = dest (64-byte aligned) -// esi = source (may be unaligned) - -LBigChunkLoop: - movl $(kBigChunk),%edx // assume we can do a full chunk - cmpl %edx,%ecx // do we have a full chunk left to do? - cmovbl %ecx,%edx // if not, only move what we have left - andl $-4096,%edx // we work in page multiples - xor %eax,%eax // initialize chunk offset - jmp LTouchLoop - -// Because the source may be unaligned, we use byte loads to touch. 
-// ecx = length remaining (including this chunk) -// edi = ptr to start of dest chunk -// esi = ptr to start of source chunk -// edx = chunk length (multiples of pages) -// ebx = scratch reg used to read a byte of each cache line -// eax = chunk offset - - .align 4,0x90 // 16-byte align inner loops -LTouchLoop: - movzb (%esi,%eax),%ebx // touch line 0, 2, 4, or 6 of page - movzb 1*64(%esi,%eax),%ebx // touch line 1, 3, 5, or 7 - movzb 8*64(%esi,%eax),%ebx // touch line 8, 10, 12, or 14 - movzb 9*64(%esi,%eax),%ebx // etc - - movzb 16*64(%esi,%eax),%ebx - movzb 17*64(%esi,%eax),%ebx - movzb 24*64(%esi,%eax),%ebx - movzb 25*64(%esi,%eax),%ebx - - movzb 32*64(%esi,%eax),%ebx - movzb 33*64(%esi,%eax),%ebx - movzb 40*64(%esi,%eax),%ebx - movzb 41*64(%esi,%eax),%ebx - - movzb 48*64(%esi,%eax),%ebx - movzb 49*64(%esi,%eax),%ebx - movzb 56*64(%esi,%eax),%ebx - movzb 57*64(%esi,%eax),%ebx - - subl $-128,%eax // next slice of page (adding 128 w 8-bit immediate) - testl $512,%eax // done with this page? - jz LTouchLoop // no, next of four slices - addl $(4096-512),%eax // move on to next page - cmpl %eax,%edx // done with this chunk? - jnz LTouchLoop // no, do next page - -// The chunk has been pre-fetched, now copy it using non-temporal stores. -// There are two copy loops, depending on whether the source is 16-byte aligned -// or not. - - addl %edx,%esi // increment ptrs by chunk length - addl %edx,%edi - subl %edx,%ecx // adjust remaining length - negl %edx // prepare loop index (counts up to 0) - testl $15,%esi // is source 16-byte aligned? 
- jnz LVeryLongUnaligned // source is not aligned - jmp LVeryLongAligned - - .align 4,0x90 // 16-byte align inner loops -LVeryLongAligned: // aligned loop over 128-bytes - movdqa (%esi,%edx),%xmm0 - movdqa 16(%esi,%edx),%xmm1 - movdqa 32(%esi,%edx),%xmm2 - movdqa 48(%esi,%edx),%xmm3 - movdqa 64(%esi,%edx),%xmm4 - movdqa 80(%esi,%edx),%xmm5 - movdqa 96(%esi,%edx),%xmm6 - movdqa 112(%esi,%edx),%xmm7 - - movntdq %xmm0,(%edi,%edx) - movntdq %xmm1,16(%edi,%edx) - movntdq %xmm2,32(%edi,%edx) - movntdq %xmm3,48(%edi,%edx) - movntdq %xmm4,64(%edi,%edx) - movntdq %xmm5,80(%edi,%edx) - movntdq %xmm6,96(%edi,%edx) - movntdq %xmm7,112(%edi,%edx) - - subl $-128,%edx // add 128 with an 8-bit immediate - jnz LVeryLongAligned - jmp LVeryLongChunkEnd - - .align 4,0x90 // 16-byte align inner loops -LVeryLongUnaligned: // unaligned loop over 128-bytes - movdqu (%esi,%edx),%xmm0 - movdqu 16(%esi,%edx),%xmm1 - movdqu 32(%esi,%edx),%xmm2 - movdqu 48(%esi,%edx),%xmm3 - movdqu 64(%esi,%edx),%xmm4 - movdqu 80(%esi,%edx),%xmm5 - movdqu 96(%esi,%edx),%xmm6 - movdqu 112(%esi,%edx),%xmm7 - - movntdq %xmm0,(%edi,%edx) - movntdq %xmm1,16(%edi,%edx) - movntdq %xmm2,32(%edi,%edx) - movntdq %xmm3,48(%edi,%edx) - movntdq %xmm4,64(%edi,%edx) - movntdq %xmm5,80(%edi,%edx) - movntdq %xmm6,96(%edi,%edx) - movntdq %xmm7,112(%edi,%edx) - - subl $-128,%edx // add 128 with an 8-bit immediate - jnz LVeryLongUnaligned - -LVeryLongChunkEnd: - cmpl $4096,%ecx // at least another page to go? - jae LBigChunkLoop // yes - - sfence // required by non-temporal stores - popl %ebx - jmp Lrejoin // handle remaining (0..4095) bytes - - -// Reverse moves. -// ecx = length -// esi = source ptr -// edi = dest ptr - -LReverse: - addl %ecx,%esi // point to end of strings - addl %ecx,%edi - cmpl $(kShort),%ecx // long enough to bother with SSE? - ja LReverseNotShort // yes - -// Handle reverse short copies. 
-// ecx = length -// esi = one byte past end of source -// edi = one byte past end of dest - -LReverseShort: - movl %ecx,%edx // copy length - shrl $2,%ecx // #words - jz 3f -1: - subl $4,%esi - movl (%esi),%eax - subl $4,%edi - movl %eax,(%edi) - dec %ecx - jnz 1b -3: - andl $3,%edx // bytes? - jz 5f -4: - dec %esi - movb (%esi),%al - dec %edi - movb %al,(%edi) - dec %edx - jnz 4b -5: - movl 8(%ebp),%eax // get return value (dst ptr) for memcpy/memmove - popl %edi - popl %esi - popl %ebp - ret - -// Handle a reverse move long enough to justify using SSE. -// ecx = length -// esi = one byte past end of source -// edi = one byte past end of dest - -LReverseNotShort: - movl %edi,%edx // copy destination - andl $15,%edx // get #bytes to align destination - je LReverseDestAligned // already aligned - subl %edx,%ecx // adjust length -1: // loop copying 1..15 bytes - dec %esi - movb (%esi),%al - dec %edi - movb %al,(%edi) - dec %edx - jnz 1b - -// Destination is now aligned. Prepare for reverse loops. - -LReverseDestAligned: - movl %ecx,%edx // copy length - andl $63,%ecx // get remaining bytes for Lshort - andl $-64,%edx // get number of bytes we will copy in inner loop - subl %edx,%esi // point to endpoint of copy - subl %edx,%edi - testl $15,%esi // is source aligned too? - jnz LReverseUnalignedLoop // no - jmp LReverseAlignedLoop // use aligned loop - - .align 4,0x90 // 16-byte align inner loops -LReverseAlignedLoop: // loop over 64-byte chunks - movdqa -16(%esi,%edx),%xmm0 - movdqa -32(%esi,%edx),%xmm1 - movdqa -48(%esi,%edx),%xmm2 - movdqa -64(%esi,%edx),%xmm3 - - movdqa %xmm0,-16(%edi,%edx) - movdqa %xmm1,-32(%edi,%edx) - movdqa %xmm2,-48(%edi,%edx) - movdqa %xmm3,-64(%edi,%edx) - - subl $64,%edx - jne LReverseAlignedLoop - - jmp LReverseShort // copy remaining 0..63 bytes and done - - -// Reverse, unaligned loop. LDDQU==MOVDQU on these machines. 
- - .align 4,0x90 // 16-byte align inner loops -LReverseUnalignedLoop: // loop over 64-byte chunks - movdqu -16(%esi,%edx),%xmm0 - movdqu -32(%esi,%edx),%xmm1 - movdqu -48(%esi,%edx),%xmm2 - movdqu -64(%esi,%edx),%xmm3 - - movdqa %xmm0,-16(%edi,%edx) - movdqa %xmm1,-32(%edi,%edx) - movdqa %xmm2,-48(%edi,%edx) - movdqa %xmm3,-64(%edi,%edx) - - subl $64,%edx - jne LReverseUnalignedLoop - - jmp LReverseShort // copy remaining 0..63 bytes and done - -PLATFUNC_DESCRIPTOR(bcopy,sse2,kHasSSE2|kCache64,kHasSupplementalSSE3) -PLATFUNC_DESCRIPTOR(memcpy,sse2,kHasSSE2|kCache64,kHasSupplementalSSE3) -PLATFUNC_DESCRIPTOR(memmove,sse2,kHasSSE2|kCache64,kHasSupplementalSSE3) diff --git a/i386/string/bzero_sse2.s b/i386/string/bzero_sse2.s index 6d03019..e69de29 100644 --- a/i386/string/bzero_sse2.s +++ b/i386/string/bzero_sse2.s @@ -1,162 +0,0 @@ -/* - * Copyright (c) 2005-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include - -/* - * Bzero, tuned for Pentium-M class processors with SSE2 - * and 64-byte cache lines. - * - * This routine is also used for memset(p,0,n), which is a common case - * since gcc sometimes silently maps bzero() into memset(). As a result, - * we always load the original ptr into %eax before returning. - */ - -#define kShort 80 // too short to bother with SSE (must be >=80) -#define kVeryLong (1024*1024) - -// void bzero(void *b, size_t len); - -PLATFUNC_FUNCTION_START(bzero, sse2, 32, 5) - pushl %ebp // set up a frame for backtraces - movl %esp,%ebp - pushl %edi - movl 8(%ebp),%edi // get ptr - movl 12(%ebp),%edx // get length - - xorl %eax,%eax // set fill data to 0 - cmpl $(kShort),%edx // long enough for SSE? - jg LNotShort // yes - -// Here for short operands or the end of long ones. -// %edx = length -// %edi = ptr -// %eax = zero - -Lshort: - cmpl $16,%edx // long enough to word align? - jge 3f // yes - test %edx,%edx // length==0? - jz 6f -1: - movb %al,(%edi) // zero a byte - inc %edi - dec %edx - jnz 1b - jmp 6f -2: - movb %al,(%edi) // zero a byte - inc %edi - dec %edx -3: - test $3,%edi // is ptr doubleword aligned? 
- jnz 2b // no - movl %edx,%ecx // copy length - shrl $2,%edx // #doublewords to store -4: - movl %eax,(%edi) // zero an aligned doubleword - addl $4,%edi - dec %edx - jnz 4b - andl $3,%ecx // mask down to #bytes at end (0..3) - jz 6f // none -5: - movb %al,(%edi) // zero a byte - inc %edi - dec %ecx - jnz 5b -6: - movl 8(%ebp),%eax // get return value in case this was a call of memset() - popl %edi - popl %ebp - ret - - -// We will be using SSE, so align ptr. - -LNotShort: - movl %edi,%ecx - negl %ecx - andl $15,%ecx // mask down to #bytes to 16-byte align - jz LDestAligned // already aligned - subl %ecx,%edx // decrement length -0: // loop storing bytes to align the ptr - movb %al,(%edi) // pack in a byte - inc %edi - dec %ecx - jnz 0b - -// Destination is now 16-byte aligned. Prepare to loop over 64-byte chunks. -// %edx = length -// %edi = ptr -// %eax = zero - -LDestAligned: - movl %edx,%ecx - andl $63,%edx // mask down to residual length (0..63) - andl $-64,%ecx // get #bytes we will zero in this loop - pxor %xmm0,%xmm0 // zero an SSE register - addl %ecx,%edi // increment ptr by length to move - cmpl $(kVeryLong),%ecx // long enough to justify non-temporal stores? - jae LVeryLong // yes - negl %ecx // negate length to move - jmp 1f - -// Loop over 64-byte chunks, storing into cache. - - .align 4,0x90 // keep inner loops 16-byte aligned -1: - movdqa %xmm0,(%edi,%ecx) - movdqa %xmm0,16(%edi,%ecx) - movdqa %xmm0,32(%edi,%ecx) - movdqa %xmm0,48(%edi,%ecx) - addl $64,%ecx - jne 1b - - jmp Lshort - -// Very long operands: use non-temporal stores to bypass cache. 
- -LVeryLong: - negl %ecx // negate length to move - jmp 1f - - .align 4,0x90 // keep inner loops 16-byte aligned -1: - movntdq %xmm0,(%edi,%ecx) - movntdq %xmm0,16(%edi,%ecx) - movntdq %xmm0,32(%edi,%ecx) - movntdq %xmm0,48(%edi,%ecx) - addl $64,%ecx - jne 1b - - sfence // required by non-temporal stores - jmp Lshort - -PLATFUNC_DESCRIPTOR(bzero,sse2,kHasSSE2,kHasSSE4_2) diff --git a/i386/sys/i386_gettimeofday_asm.s b/i386/sys/i386_gettimeofday_asm.s index c798528..6c1df64 100644 --- a/i386/sys/i386_gettimeofday_asm.s +++ b/i386/sys/i386_gettimeofday_asm.s @@ -28,7 +28,6 @@ #include #include -#include #define NSEC_PER_SEC 1000*1000*1000 #define NSEC_PER_USEC 1000 @@ -46,7 +45,7 @@ ___commpage_gettimeofday: testl %esi,%esi /* disabled? */ jz 4f - call _mach_absolute_time_direct + call _mach_absolute_time /* get nanotime in %edx:%eax */ sub _COMM_PAGE_GTOD_NS_BASE,%eax sbb _COMM_PAGE_GTOD_NS_BASE+4,%edx diff --git a/i386/sys/mach_absolute_time.c b/i386/sys/mach_absolute_time.c deleted file mode 100644 index d78ebf9..0000000 --- a/i386/sys/mach_absolute_time.c +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2007, 2008, 2009, 2010 Apple Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this - * file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ - -#include -#include - -PLATFUNC_DESCRIPTOR_PROTOTYPE(mach_absolute_time,fast) -PLATFUNC_DESCRIPTOR_PROTOTYPE(mach_absolute_time,slow) - -static const platfunc_descriptor *mach_absolute_time_platfunc_descriptors[] = { - PLATFUNC_DESCRIPTOR_REFERENCE(mach_absolute_time,fast), - PLATFUNC_DESCRIPTOR_REFERENCE(mach_absolute_time,slow), - 0 -}; - -void *mach_absolute_time_chooser() __asm__("_mach_absolute_time"); -void *mach_absolute_time_chooser() { - __asm__(".desc _mach_absolute_time, 0x100"); - return find_platform_function((const platfunc_descriptor **) mach_absolute_time_platfunc_descriptors); -} diff --git a/i386/sys/mach_absolute_time_asm.s b/i386/sys/mach_absolute_time_asm.s index 4d3bc25..4347d5d 100644 --- a/i386/sys/mach_absolute_time_asm.s +++ b/i386/sys/mach_absolute_time_asm.s @@ -28,24 +28,26 @@ #include #include -#include -#if defined(VARIANT_DYLD) -/* For dyld, we need to decide upon call whether to jump to fast or slow */ - .globl _mach_absolute_time - .align 2, 0x90 -_mach_absolute_time: - movl _COMM_PAGE_CPU_CAPABILITIES, %eax - andl $(kSlow), %eax - jnz PLATFUNC_VARIANT_NAME(mach_absolute_time, slow) - jmp PLATFUNC_VARIANT_NAME(mach_absolute_time, fast) -#endif -/* return mach_absolute_time in %edx:%eax */ +/* return mach_absolute_time in %edx:%eax + * + * The algorithm we use is: + * + * ns = ((((rdtsc - rnt_tsc_base)<<rnt_shift)*rnt_tsc_scale) / 2**32) + rnt_ns_base; + * + * rnt_shift, a constant computed during initialization, is the smallest value for which: + * + * (tscFreq << rnt_shift) > SLOW_TSC_THRESHOLD + * + * Where SLOW_TSC_THRESHOLD is about 10e9. Since most processor's tscFreq is greater + * than 1GHz, rnt_shift is usually 0.
rnt_tsc_scale is also a 32-bit constant: + * + * rnt_tsc_scale = (10e9 * 2**32) / (tscFreq << rnt_shift); + */ -PLATFUNC_FUNCTION_START(mach_absolute_time, fast, 32, 4) - .private_extern _mach_absolute_time_direct -_mach_absolute_time_direct: + .globl _mach_absolute_time +_mach_absolute_time: pushl %ebp movl %esp,%ebp pushl %esi @@ -62,6 +64,15 @@ _mach_absolute_time_direct: subl _COMM_PAGE_NT_TSC_BASE,%eax sbbl _COMM_PAGE_NT_TSC_BASE+4,%edx + + /* + * Prior to supporting "slow" processors, xnu always set _NT_SHIFT to 32. + * Now it defaults to 0, unless the processor is slow. The shifts + * below implicitly mask the count down to 5 bits, handling either default. + */ + movl _COMM_PAGE_NT_SHIFT,%ecx + shldl %cl,%eax,%edx /* shift %edx left, filling in from %eax */ + shll %cl,%eax /* finish shifting %edx:%eax left by _COMM_PAGE_NT_SHIFT bits */ movl _COMM_PAGE_NT_SCALE,%ecx @@ -83,66 +94,3 @@ _mach_absolute_time_direct: popl %esi popl %ebp ret -PLATFUNC_DESCRIPTOR(mach_absolute_time,fast,0,kSlow) - - -/* mach_absolute_time routine for machines slower than ~1Gz (SLOW_TSC_THRESHOLD) */ -PLATFUNC_FUNCTION_START(mach_absolute_time, slow, 32, 4) - push %ebp - mov %esp,%ebp - push %esi - push %edi - push %ebx - -0: - movl _COMM_PAGE_NT_GENERATION,%esi - testl %esi,%esi /* if generation is 0, data being changed */ - jz 0b /* so loop until stable */ - - lfence - rdtsc /* get TSC in %edx:%eax */ - lfence - subl _COMM_PAGE_NT_TSC_BASE,%eax - sbbl _COMM_PAGE_NT_TSC_BASE+4,%edx - - pushl %esi /* save generation */ - /* - * Do the math to convert tsc ticks to nanoseconds. We first - * do long multiply of 1 billion times the tsc. 
Then we do - * long division by the tsc frequency - */ - mov $1000000000, %ecx /* number of nanoseconds in a second */ - mov %edx, %ebx - mul %ecx - mov %edx, %edi - mov %eax, %esi - mov %ebx, %eax - mul %ecx - add %edi, %eax - adc $0, %edx /* result in edx:eax:esi */ - mov %eax, %edi - mov _COMM_PAGE_NT_SHIFT,%ecx /* overloaded as the low 32 tscFreq */ - xor %eax, %eax - xchg %edx, %eax - div %ecx - xor %eax, %eax - mov %edi, %eax - div %ecx - mov %eax, %ebx - mov %esi, %eax - div %ecx - mov %ebx, %edx /* result in edx:eax */ - popl %esi /* recover generation */ - - add _COMM_PAGE_NT_NS_BASE,%eax - adc _COMM_PAGE_NT_NS_BASE+4,%edx - - cmpl _COMM_PAGE_NT_GENERATION,%esi /* have the parameters changed? */ - jne 0b /* yes, loop until stable */ - - pop %ebx - pop %edi - pop %esi - pop %ebp - ret /* result in edx:eax */ -PLATFUNC_DESCRIPTOR(mach_absolute_time,slow,kSlow,0) diff --git a/x86_64/string/bzero_sse2.s b/x86_64/string/bzero_sse2.s index 3d7d596..e69de29 100644 --- a/x86_64/string/bzero_sse2.s +++ b/x86_64/string/bzero_sse2.s @@ -1,161 +0,0 @@ -/* - * Copyright (c) 2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include "platfunc.h" - -/* - * Bzero, tuned for Pentium-M class processors with SSE2 - * and 64-byte cache lines. This is the 64-bit version. - * - * This routine is also used for memset(p,0,n), which is a common case - * since gcc sometimes silently maps bzero() into memset(). As a result, - * we always load the original ptr into %eax before returning. - */ - -#define kShort 80 // too short to bother with SSE (must be >=80) -#define kVeryLong (1024*1024) - -// void bzero(void *b, size_t len); - -PLATFUNC_FUNCTION_START_GENERIC(bzero, sse2, 64, 5) - pushq %rbp // set up a frame for backtraces - movq %rsp,%rbp - xorl %eax,%eax // set fill data to 0 - movq %rdi,%r11 // save original ptr as return value - cmpq $(kShort),%rsi // long enough for SSE? - jg LNotShort // yes - -// Here for short operands or the end of long ones. -// %esi = length (<= kShort) -// %rdi = ptr -// %eax = zero - -Lshort: - cmpl $16,%esi // long enough to word align? - jge 3f // yes - test %esi,%esi // length==0? - jz 6f -1: - movb %al,(%rdi) // zero a byte - incq %rdi - decl %esi - jnz 1b - jmp 6f -2: - movb %al,(%rdi) // zero a byte - incq %rdi - decl %esi -3: - testl $3,%edi // is ptr doubleword aligned? 
- jnz 2b // no - movl %esi,%ecx // copy length - shrl $2,%esi // #doublewords to store -4: - movl %eax,(%rdi) // zero an aligned doubleword - addq $4,%rdi - decl %esi - jnz 4b - andl $3,%ecx // mask down to #bytes at end (0..3) - jz 6f // none -5: - movb %al,(%rdi) // zero a byte - incq %rdi - decl %ecx - jnz 5b -6: - movq %r11,%rax // set return value in case this was a call of memset() - popq %rbp - ret - - -// We will be using SSE, so align ptr. -// %rsi = length (> kShort) -// %rdi = ptr -// %eax = zero - -LNotShort: - movl %edi,%ecx // get #bytes to 16-byte align ptr - negl %ecx - andl $15,%ecx - jz LDestAligned // already aligned - subq %rcx,%rsi // decrement length -0: // loop storing bytes to align the ptr - movb %al,(%rdi) // pack in a byte - incq %rdi - decl %ecx - jnz 0b - -// Destination is now 16-byte aligned. Prepare to loop over 64-byte chunks. -// %rsi = length (> (kShort-15)) -// %rdi = ptr (aligned) -// %eax = zero - -LDestAligned: - movq %rsi,%rcx - andl $63,%esi // mask down to residual length (0..63) - andq $-64,%rcx // get #bytes we will zero in this loop - pxor %xmm0,%xmm0 // zero an SSE register - addq %rcx,%rdi // increment ptr by length to move - cmpq $(kVeryLong),%rcx // long enough to justify non-temporal stores? - jae LVeryLong // yes - negq %rcx // negate length to move - jmp 1f - -// Loop over 64-byte chunks, storing into cache. - - .align 4,0x90 // keep inner loops 16-byte aligned -1: - movdqa %xmm0,(%rdi,%rcx) - movdqa %xmm0,16(%rdi,%rcx) - movdqa %xmm0,32(%rdi,%rcx) - movdqa %xmm0,48(%rdi,%rcx) - addq $64,%rcx - jne 1b - - jmp Lshort - -// Very long operands: use non-temporal stores to bypass cache. 
- -LVeryLong: - negq %rcx // negate length to move - jmp 1f - - .align 4,0x90 // keep inner loops 16-byte aligned -1: - movntdq %xmm0,(%rdi,%rcx) - movntdq %xmm0,16(%rdi,%rcx) - movntdq %xmm0,32(%rdi,%rcx) - movntdq %xmm0,48(%rdi,%rcx) - addq $64,%rcx - jne 1b - - sfence // required by non-temporal stores - jmp Lshort - -PLATFUNC_DESCRIPTOR(bzero,sse2,kHasSSE2,kHasSSE4_2) diff --git a/x86_64/sys/i386_gettimeofday_asm.s b/x86_64/sys/i386_gettimeofday_asm.s index 28a3bd2..d20feb1 100644 --- a/x86_64/sys/i386_gettimeofday_asm.s +++ b/x86_64/sys/i386_gettimeofday_asm.s @@ -28,7 +28,6 @@ #include #include -#include "platfunc.h" #define NSEC_PER_SEC 1000*1000*1000 #define NSEC_PER_USEC 1000 diff --git a/x86_64/sys/nanotime.s b/x86_64/sys/nanotime.s index f5413be..7a65749 100644 --- a/x86_64/sys/nanotime.s +++ b/x86_64/sys/nanotime.s @@ -30,7 +30,21 @@ #include /* - * 64-bit version _mach_absolute_time. We return the 64-bit nanotime in %rax, + * 64-bit version _mach_absolute_time. We return the 64-bit nanotime in %rax. + * + * The algorithm we use is: + * + * ns = ((((rdtsc - rnt_tsc_base)<<rnt_shift)*rnt_tsc_scale) / 2**32) + rnt_ns_base; + * + * rnt_shift, a constant computed during initialization, is the smallest value for which: + * + * (tscFreq << rnt_shift) > SLOW_TSC_THRESHOLD + * + * Where SLOW_TSC_THRESHOLD is about 10e9. Since most processor's tscFreqs are greater + * than 1GHz, rnt_shift is usually 0. rnt_tsc_scale is also a 32-bit constant: + * + * rnt_tsc_scale = (10e9 * 2**32) / (tscFreq << rnt_shift); + * */ .globl _mach_absolute_time _mach_absolute_time: @@ -40,19 +54,29 @@ _mach_absolute_time: 1: movl _NT_GENERATION(%rsi),%r8d // get generation testl %r8d,%r8d // if 0, data is being changed... - jz 1b // ...so loop until stable + jz 1b // ...so loop until stable lfence rdtsc // edx:eax := tsc lfence shlq $32,%rdx // rax := ((edx << 32) | eax), ie 64-bit tsc - orq %rdx,%rax + orq %rdx,%rax + + /* + * Prior to supporting "slow" processors, xnu always set _NT_SHIFT to 32. + * Now it defaults to 0, unless the processor is slow.
In order to maintain + * compatibility with both old and new versions of xnu, we mask the shift + * down to 0x1F, which maps the old default (32) into the new default (0). + */ + movl _NT_SHIFT(%rsi),%ecx + andl $0x1F,%ecx // *** remove this line once 10.9 is GM *** subq _NT_TSC_BASE(%rsi), %rax // rax := (tsc - base_tsc) + shlq %cl,%rax // rax := (tsc - base_tsc) << NT_SHIFT movl _NT_SCALE(%rsi),%ecx - mulq %rcx // rdx:rax := (tsc - base_tsc) * scale - shrdq $32,%rdx,%rax // _COMM_PAGE_NT_SHIFT is always 32 + mulq %rcx // rdx:rax := ((tsc - base_tsc)<<shift) * scale + shrdq $32,%rdx,%rax // divide by 2**32 addq _NT_NS_BASE(%rsi),%rax // (((tsc - base_tsc) * scale) >> 32) + ns_base cmpl _NT_GENERATION(%rsi),%r8d // did the data change during computation? - jne 1b + jne 1b popq %rbp ret