/*
- * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
*
* @APPLE_LICENSE_HEADER_START@
*
- * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
- *
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License.
 *
* @APPLE_LICENSE_HEADER_END@
*/
-/* =======================================
- * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
- * =======================================
- *
- * Version of 6/17/2002, for G3, G4, and G4+.
- *
- * There are many paths through this code, depending on length, reverse/forward,
- * processor type, and alignment. We use reverse paths only when the operands
- * overlap and the destination is higher than the source. They are not quite as
- * fast as the forward paths.
- *
- * Judicious use of DCBTs, just far enough ahead to minimize waiting, is critical in
- * the inner loops for long operands. DST is less effective than DCBT, because it
- * can get out of sync with the inner loop. DCBTST is usually not a win, so we
- * don't use it except during initialization when we're not using the LSU.
- * We don't DCBT on G3, which only handles one load miss at a time.
- *
- * We don't use DCBZ, because it takes an alignment exception on uncached memory
- * like frame buffers. Bcopy to frame buffers must work. This hurts G3 in the
- * cold-cache case, but G4 can use DCBA (which does not take alignment exceptions.)
- *
- * Using DCBA on G4 is a tradeoff. For the cold-cache case it can be a big win,
- * since it avoids the read of destination cache lines. But for the hot-cache case
- * it is always slower, because of the cycles spent needlessly zeroing data. Some
- * machines store-gather and can cancel the read if all bytes of a line are stored,
- * others cannot. Unless explicitly told which is better, we time loops with and
- * without DCBA and use the fastest. Note that we never DCBA in reverse loops,
- * since by definition they are overlapped so dest lines will be in the cache.
- *
- * For longer operands we use an 8-element branch table, based on the CPU type,
- * to select the appropriate inner loop. The branch table is indexed as follows:
- *
- * bit 10000 set if a Reverse move is required
- * bits 01100 set on the relative operand alignment: 0=unaligned, 1=word,
- * 2=doubleword, and 3=quadword.
- *
- * By "relatively" n-byte aligned, we mean the source and destination are a multiple
- * of n bytes apart (they need not be absolutely aligned.)
- *
- * The branch table for the running CPU type is pointed to by LBranchTablePtr.
- * Initially, LBranchTablePtr points to G3's table, since that is the lowest
- * common denominator that will run on any CPU. Later, pthread initialization
- * sets up the _cpu_capabilities vector and calls _bcopy_initialize, which sets
- * up the correct pointer for the running CPU.
- *
- * We distinguish between "short", "medium", and "long" operands:
- * short (<= 32 bytes) most common case, minimum path length is important
- * medium (> 32, < kLong) too short for Altivec or use of cache ops like DCBA
- * long (>= kLong) long enough for cache ops and to amortize use of Altivec
- *
- * WARNING: kLong must be >=96, due to implicit assumptions about operand length.
- */
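-
-/* As an illustrative sketch (hypothetical C, not assembled) of the classification
- * described above -- the real code computes this with rlwinm/rlwimi in LLong:
- *
- *     unsigned branch_table_offset(char *src, char *dst, unsigned len) {
- *         unsigned diff = (unsigned)(dst - src);      // "relative" alignment
- *         unsigned reverse = (diff < len) ? 1 : 0;    // overlap, dest above source
- *         unsigned align;                             // 0=unaligned, 1=word,
- *         if      ((diff & 15) == 0) align = 3;       // 2=doubleword, 3=quadword
- *         else if ((diff &  7) == 0) align = 2;
- *         else if ((diff &  3) == 0) align = 1;
- *         else                       align = 0;
- *         return (reverse << 4) | (align << 2);       // byte offset: bit 10000, bits 01100
- *     }
- */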
-#define kLong 96
-
-/* Register usage. Note we use R2, so this code will not run in a PEF/CFM
- * environment. Note also the rather delicate way we assign multiple uses
- * to the same register. Beware.
- *
- * r0 = "w7" or "r0" (NB: cannot use r0 for any constant such as "c16")
- * r2 = "w8" or VRSave ("rv")
- * r3 = not used, as memcpy and memmove return 1st parameter as a value
- * r4 = source ptr ("rs")
- * r5 = count of bytes to move ("rc")
- * r6 = "w1", "c16", or "cm17"
- * r7 = "w2", "c32", or "cm33"
- * r8 = "w3", "c48", or "cm49"
- * r9 = "w4", "c64", or "cm1"
- * r10 = "w5", "c96", or "cm97"
- * r11 = "w6", "c128", "cm129", or return address ("ra")
- * r12 = destination ptr ("rd")
- * f0-f8 = used for moving 8-byte aligned data
- * v0 = permute vector ("vp")
- * v1-v4 = qw's loaded from source ("v1", "v2", "v3", and "v4")
- * v5-v7 = permuted qw's ("vx", "vy", and "vz")
- */
-#define rs r4
-#define rd r12
-#define rc r5
-#define ra r11
-#define rv r2
-
-#define w1 r6
-#define w2 r7
-#define w3 r8
-#define w4 r9
-#define w5 r10
-#define w6 r11
-#define w7 r0
-#define w8 r2
-
-#define c16 r6
-#define cm17 r6
-#define c32 r7
-#define cm33 r7
-#define c48 r8
-#define cm49 r8
-#define c64 r9
-#define cm1 r9
-#define c96 r10
-#define cm97 r10
-#define c128 r11
-#define cm129 r11
-
-#define vp v0
-#define vx v5
-#define vy v6
-#define vz v7
-#define VRSave 256
-
-#include <architecture/ppc/asm_help.h>
-
-// The branch tables, 8 entries per CPU type.
-// NB: we depend on 5 low-order 0s in the address of branch tables.
-
- .data
- .align 5 // must be 32-byte aligned
-
- // G3 (the default CPU type)
-
-LG3:
- .long LForwardWord // 000: forward, unaligned
- .long LForwardFloat // 001: forward, 4-byte aligned
- .long LForwardFloat // 010: forward, 8-byte aligned
- .long LForwardFloat // 011: forward, 16-byte aligned
- .long LReverseWord // 100: reverse, unaligned
- .long LReverseFloat // 101: reverse, 4-byte aligned
- .long LReverseFloat // 110: reverse, 8-byte aligned
- .long LReverseFloat // 111: reverse, 16-byte aligned
-
- // G4s that benefit from DCBA.
-
-LG4UseDcba:
- .long LForwardVecUnal32Dcba // 000: forward, unaligned
- .long LForwardVecUnal32Dcba // 001: forward, 4-byte aligned
- .long LForwardVecUnal32Dcba // 010: forward, 8-byte aligned
- .long LForwardVecAlig32Dcba // 011: forward, 16-byte aligned
- .long LReverseVectorUnal32 // 100: reverse, unaligned
- .long LReverseVectorUnal32 // 101: reverse, 4-byte aligned
- .long LReverseVectorUnal32 // 110: reverse, 8-byte aligned
- .long LReverseVectorAligned32 // 111: reverse, 16-byte aligned
-
- // G4s that should not use DCBA.
-
-LG4NoDcba:
- .long LForwardVecUnal32NoDcba // 000: forward, unaligned
- .long LForwardVecUnal32NoDcba // 001: forward, 4-byte aligned
- .long LForwardVecUnal32NoDcba // 010: forward, 8-byte aligned
- .long LForwardVecAlig32NoDcba // 011: forward, 16-byte aligned
- .long LReverseVectorUnal32 // 100: reverse, unaligned
- .long LReverseVectorUnal32 // 101: reverse, 4-byte aligned
- .long LReverseVectorUnal32 // 110: reverse, 8-byte aligned
- .long LReverseVectorAligned32 // 111: reverse, 16-byte aligned
-
-
-// Pointer to the 8-element branch table for running CPU type:
-
-LBranchTablePtr:
- .long LG3 // default to G3 until "bcopy_initialize" called
-
-
-// The CPU capability vector, initialized in pthread_init().
-// "_bcopy_initialize" uses this to set up LBranchTablePtr:
-
- .globl __cpu_capabilities
-__cpu_capabilities:
- .long 0
-
-// Bit definitions for _cpu_capabilities:
-
-#define kHasAltivec 0x01
-#define k64Bit 0x02
-#define kCache32 0x04
-#define kCache64 0x08
-#define kCache128 0x10
-#define kUseDcba 0x20
-#define kNoDcba 0x40
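-
-// As an illustrative sketch (hypothetical C, not part of this file) of how
-// _bcopy_initialize below combines these bits -- the real tests use andi./cmpwi:
-//
-//     uint32_t caps = _cpu_capabilities;
-//     uint32_t g4mask = kHasAltivec|k64Bit|kCache32|kUseDcba|kNoDcba;
-//     if ((caps & g4mask) == (kHasAltivec|kCache32))   // "untyped" G4: neither DCBA bit set
-//         caps |= time_dcba();                         // time_dcba() stands in for LTest32,
-//                                                      // returning kUseDcba or kNoDcba
-//     // then: G4 with kUseDcba -> LG4UseDcba, with kNoDcba -> LG4NoDcba, otherwise LG3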
-
-
-.text
-.globl _bcopy
-.globl _memcpy
-.globl _memmove
-.globl __bcopy_initialize
-
-
-// Main entry points.
+#define __APPLE_API_PRIVATE
+#include <machine/cpu_capabilities.h>
+#undef __APPLE_API_PRIVATE
+
+ // These functions have migrated to the comm page.
+
+ .text
+ .globl _bcopy
+ .globl _memcpy
+ .globl _memmove
.align 5
_bcopy: // void bcopy(const void *src, void *dst, size_t len)
- mr r10,r3 // reverse source and dest ptrs, to be like memcpy
- mr r3,r4
- mr r4,r10
+ ba _COMM_PAGE_BCOPY
+
+ .align 5
_memcpy: // void* memcpy(void *dst, void *src, size_t len)
_memmove: // void* memmove(void *dst, const void *src, size_t len)
- cmplwi cr7,rc,32 // length <= 32 bytes?
- sub. w1,r3,rs // must move in reverse if (rd-rs)<rc, set cr0 on sou==dst
- dcbt 0,rs // touch in the first line of source
- cmplw cr6,w1,rc // set cr6 blt iff we must move reverse
- cmplwi cr1,rc,kLong-1 // set cr1 bgt if long
- mr rd,r3 // must leave r3 alone, it is return value for memcpy etc
- bgt- cr7,LMedium // longer than 32 bytes
- dcbtst 0,rd // touch in destination
- beq- cr7,LMove32 // special case moves of 32 bytes
- blt- cr6,LShortReverse0
-
-// Forward short operands. This is the most frequent case, so it is inline.
-// We also end up here to xfer the last 0-31 bytes of longer operands.
-
-LShort: // WARNING: can fall into this routine
- andi. r0,rc,0x10 // test bit 27 separately (sometimes faster than a mtcrf)
- mtcrf 0x01,rc // move rest of length to cr7
- beq 1f // quadword to move?
- lwz w1,0(rs)
- lwz w2,4(rs)
- lwz w3,8(rs)
- lwz w4,12(rs)
- addi rs,rs,16
- stw w1,0(rd)
- stw w2,4(rd)
- stw w3,8(rd)
- stw w4,12(rd)
- addi rd,rd,16
-1:
-LShort16: // join here to xfer 0-15 bytes
- bf 28,2f // doubleword?
- lwz w1,0(rs)
- lwz w2,4(rs)
- addi rs,rs,8
- stw w1,0(rd)
- stw w2,4(rd)
- addi rd,rd,8
-2:
- bf 29,3f // word?
- lwz w1,0(rs)
- addi rs,rs,4
- stw w1,0(rd)
- addi rd,rd,4
-3:
- bf 30,4f // halfword to move?
- lhz w1,0(rs)
- addi rs,rs,2
- sth w1,0(rd)
- addi rd,rd,2
-4:
- bflr 31 // skip if no odd byte
- lbz w1,0(rs)
- stb w1,0(rd)
- blr
-
-
-// Handle short reverse operands (0-31 bytes).
-// This is also used to transfer the last 0-31 bytes of longer operands.
-
-LShortReverse0:
- add rs,rs,rc // adjust ptrs for reverse move
- add rd,rd,rc
-LShortReverse:
- andi. r0,rc,0x10 // test bit 27 separately (sometimes faster than a mtcrf)
- mtcrf 0x01,rc // move rest of length to cr7
- beq 1f // quadword to move?
- lwz w1,-4(rs)
- lwz w2,-8(rs)
- lwz w3,-12(rs)
- lwzu w4,-16(rs)
- stw w1,-4(rd)
- stw w2,-8(rd)
- stw w3,-12(rd)
- stwu w4,-16(rd)
-1:
-LShortReverse16: // join here to xfer 0-15 bytes and return
- bf 28,2f // doubleword?
- lwz w1,-4(rs)
- lwzu w2,-8(rs)
- stw w1,-4(rd)
- stwu w2,-8(rd)
-2:
- bf 29,3f // word?
- lwzu w1,-4(rs)
- stwu w1,-4(rd)
-3:
- bf 30,4f // halfword to move?
- lhzu w1,-2(rs)
- sthu w1,-2(rd)
-4:
- bflr 31 // done if no odd byte
- lbz w1,-1(rs) // no update
- stb w1,-1(rd)
- blr
-
-
-// Special case for 32-byte moves. Too long for LShort, too common for LMedium.
-
-LMove32:
- lwz w1,0(rs)
- lwz w2,4(rs)
- lwz w3,8(rs)
- lwz w4,12(rs)
- lwz w5,16(rs)
- lwz w6,20(rs)
- lwz w7,24(rs)
- lwz w8,28(rs)
- stw w1,0(rd)
- stw w2,4(rd)
- stw w3,8(rd)
- stw w4,12(rd)
- stw w5,16(rd)
- stw w6,20(rd)
- stw w7,24(rd)
- stw w8,28(rd)
-LExit:
- blr
-
-
-// Medium length operands (32 < rc < kLong.) These loops run on all CPUs, as the
-// operands are not long enough to bother with the branch table, cache ops, or
-// Altivec. Unlike the long-operand paths, we word-align the source rather than the
-// dest: that is faster on G4+ (and probably beyond), we never DCBA on medium-length
-// operands anyway, and the opportunity to cancel reads of dest cache lines is limited.
-// w1 = (rd-rs), used to check for alignment
-// cr0 = set on (rd-rs)
-// cr1 = bgt if long operand
-// cr6 = blt if reverse move
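-//
-// The alignment fixup below avoids a byte-at-a-time loop: it stores the first four
-// (possibly unaligned) source bytes with a single stw, pre-loads the first aligned
-// source word (which that store may partially overwrite when the operands overlap),
-// and then advances rs/rd by w3 (1-3 bytes). Roughly, as a hypothetical C sketch
-// (load4()/store4() are illustrative unaligned accessors):
-//
-//     uint32_t first = load4(rs), aligned = load4(rs + w3);
-//     store4(rd, first);  rs += w3;  rd += w3;  rc -= w3;   // rs now word aligned
-//     /* 'aligned' is carried into the 16-byte loop as w1 */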
-
-LMedium:
- dcbtst 0,rd // touch in 1st line of destination
- rlwinm r0,w1,0,29,31 // r0 <- ((rd-rs) & 7), ie 0 if doubleword aligned
- beq- LExit // early exit if (rs==rd), avoiding use of "beqlr"
- neg w2,rs // we align source, not dest, and assume forward
- cmpwi cr5,r0,0 // set cr5 beq if doubleword aligned
- bgt- cr1,LLong // handle long operands
- andi. w3,w2,3 // W3 <- #bytes to word-align source
- blt- cr6,LMediumReverse // handle reverse move
- lwz w1,0(rs) // pre-fetch first 4 bytes of source
- beq- cr5,LMediumAligned // operands are doubleword aligned
- sub rc,rc,w3 // adjust count for alignment
- mtcrf 0x01,rc // remaining byte count (0-15) to cr7 for LShort16
- srwi w4,rc,4 // w4 <- number of 16-byte chunks to xfer (>=1)
- mtctr w4 // prepare loop count
- beq+ 2f // source already aligned
-
- lwzx w2,w3,rs // get 1st aligned word (which we might partially overwrite)
- add rs,rs,w3 // word-align source ptr
- stw w1,0(rd) // store all (w3) bytes at once to avoid a loop
- add rd,rd,w3
- mr w1,w2 // first aligned word to w1
- b 2f
-
- .align 4 // align inner loops
-1: // loop over 16-byte chunks
- lwz w1,0(rs)
-2:
- lwz w2,4(rs)
- lwz w3,8(rs)
- lwz w4,12(rs)
- addi rs,rs,16
- stw w1,0(rd)
- stw w2,4(rd)
- stw w3,8(rd)
- stw w4,12(rd)
- addi rd,rd,16
- bdnz 1b
-
- b LShort16
-
-
-// Medium, doubleword aligned. We use floating point. Note that G4+ has bigger latencies
-// and reduced throughput for floating pt loads and stores; future processors will probably
-// have even worse lfd/stfd performance. We use it here because it is so important for G3,
-// and not slower for G4+. But we only do so for doubleword aligned operands, whereas the
-// G3-only long operand loops use floating pt even for word-aligned operands.
-// w2 = neg(rs)
-// w1 = first 4 bytes of source
-
-LMediumAligned:
- andi. w3,w2,7 // already aligned?
- sub rc,rc,w3 // adjust count by 0-7 bytes
- lfdx f0,rs,w3 // pre-fetch first aligned source doubleword
- srwi w4,rc,5 // get count of 32-byte chunks (might be 0 if unaligned)
- mtctr w4
- beq- LForwardFloatLoop1 // already aligned
-
- cmpwi w4,0 // are there any 32-byte chunks to xfer?
- lwz w2,4(rs) // get 2nd (unaligned) source word
- add rs,rs,w3 // doubleword align source pointer
- stw w1,0(rd) // store first 8 bytes of source to align...
- stw w2,4(rd) // ...which could overwrite source
- add rd,rd,w3 // doubleword align destination
- bne+ LForwardFloatLoop1 // at least 1 chunk, so enter loop
-
- subi rc,rc,8 // unfortunate degenerate case: no chunks to xfer
- stfd f0,0(rd) // must store f0 since source might have been overwritten
- addi rs,rs,8
- addi rd,rd,8
- b LShort
-
-
-// Medium reverse moves. This loop runs on all processors.
-
-LMediumReverse:
- add rs,rs,rc // point to other end of operands when in reverse
- add rd,rd,rc
- andi. w3,rs,3 // w3 <- #bytes to word align source
- lwz w1,-4(rs) // pre-fetch 1st 4 bytes of source
- sub rc,rc,w3 // adjust count
- srwi w4,rc,4 // get count of 16-byte chunks (>=1)
- mtcrf 0x01,rc // remaining byte count (0-15) to cr7 for LShortReverse16
- mtctr w4 // prepare loop count
- beq+ 2f // source already aligned
-
- sub rs,rs,w3 // word-align source ptr
- lwz w2,-4(rs) // get 1st aligned word which we may overwrite
- stw w1,-4(rd) // store all 4 bytes to align without a loop
- sub rd,rd,w3
- mr w1,w2 // shift 1st aligned source word to w1
- b 2f
-
-1:
- lwz w1,-4(rs)
-2:
- lwz w2,-8(rs)
- lwz w3,-12(rs)
- lwzu w4,-16(rs)
- stw w1,-4(rd)
- stw w2,-8(rd)
- stw w3,-12(rd)
- stwu w4,-16(rd)
- bdnz 1b
-
- b LShortReverse16
-
-
-// Long operands. Use branch table to decide which loop to use.
-// w1 = (rd-rs), used to determine alignment
-
-LLong:
- xor w4,w1,rc // we must move reverse if (rd-rs)<rc
- mflr ra // save return address
- rlwinm w5,w1,1,27,30 // w5 <- ((w1 & 0xF) << 1)
- bcl 20,31,1f // use reserved form to get our location
-1:
- mflr w3 // w3 == addr(1b)
- lis w8,0x0408 // load a 16 element, 2-bit array into w8...
- cntlzw w4,w4 // find first difference between (rd-rs) and rc
- addis w2,w3,ha16(LBranchTablePtr-1b)
- ori w8,w8,0x040C // ...used to map w5 to alignment encoding (ie, to 0-3)
- lwz w2,lo16(LBranchTablePtr-1b)(w2) // w2 <- branch table address
- slw w4,rc,w4 // bit 0 of w4 set iff (rd-rs)<rc
- rlwnm w5,w8,w5,28,29 // put alignment encoding in bits 01100 of w5
- rlwimi w2,w4,5,27,27 // put reverse bit in bit 10000 of branch table address
- lwzx w3,w2,w5 // w3 <- load loop address from branch table
- neg w1,rd // start to compute destination alignment
- mtctr w3
- andi. r0,w1,0x1F // r0 <- bytes req'd to 32-byte align dest (if forward move)
- bctr // NB: r0/cr0 and w1 are passed as parameters
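-
-// A note on the reverse test above: cntlzw of ((rd-rs) ^ rc) counts the leading bits
-// in which the two values agree, and slw then shifts rc left by that amount, moving
-// the first differing bit (taken from rc) into bit 0. That bit is set exactly when
-// (rd-rs) < rc unsigned, i.e. when the operands overlap with the destination above
-// the source. Roughly, as hypothetical C (cntlzw of 0 is 32, and slw by 32..63
-// yields 0, so equal values fall through to a forward move):
-//
-//     unsigned diff = rd - rs;
-//     int reverse = (diff != rc) && ((rc << clz32(diff ^ rc)) >> 31);  // clz32() illustrative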
-
-
-// G3, forward, long, unaligned.
-// w1 = neg(rd)
-
-LForwardWord:
- andi. w3,w1,3 // W3 <- #bytes to word-align destination
- mtlr ra // restore return address
- sub rc,rc,w3 // adjust count for alignment
- srwi r0,rc,5 // number of 32-byte chunks to xfer (>=1)
- mtctr r0 // prepare loop count
- beq+ 1f // dest already aligned
-
- lwz w2,0(rs) // get first 4 bytes of source
- lwzx w1,w3,rs // get source bytes we might overwrite
- add rs,rs,w3 // adjust source ptr
- stw w2,0(rd) // store all 4 bytes to avoid a loop
- add rd,rd,w3 // word-align destination
- b 2f
-1:
- lwz w1,0(rs)
-2:
- lwz w2,4(rs)
- lwz w3,8(rs)
- lwz w4,12(rs)
- lwz w5,16(rs)
- lwz w6,20(rs)
- lwz w7,24(rs)
- lwz w8,28(rs)
- addi rs,rs,32
- stw w1,0(rd)
- stw w2,4(rd)
- stw w3,8(rd)
- stw w4,12(rd)
- stw w5,16(rd)
- stw w6,20(rd)
- stw w7,24(rd)
- stw w8,28(rd)
- addi rd,rd,32
- bdnz 1b
-
- b LShort
-
-
-// G3, forward, long, word aligned. We use floating pt even when only word aligned.
-// w1 = neg(rd)
-
-LForwardFloat:
- andi. w3,w1,7 // W3 <- #bytes to doubleword-align destination
- mtlr ra // restore return address
- sub rc,rc,w3 // adjust count for alignment
- srwi r0,rc,5 // number of 32-byte chunks to xfer (>=1)
- mtctr r0 // prepare loop count
- beq LForwardFloatLoop // dest already aligned
-
- lwz w1,0(rs) // get first 8 bytes of source
- lwz w2,4(rs)
- lfdx f0,w3,rs // get source bytes we might overwrite
- add rs,rs,w3 // word-align source ptr
- stw w1,0(rd) // store all 8 bytes to avoid a loop
- stw w2,4(rd)
- add rd,rd,w3
- b LForwardFloatLoop1
-
- .align 4 // align since this loop is executed by G4s too
-LForwardFloatLoop:
- lfd f0,0(rs)
-LForwardFloatLoop1: // enter here from LMediumAligned and above
- lfd f1,8(rs)
- lfd f2,16(rs)
- lfd f3,24(rs)
- addi rs,rs,32
- stfd f0,0(rd)
- stfd f1,8(rd)
- stfd f2,16(rd)
- stfd f3,24(rd)
- addi rd,rd,32
- bdnz LForwardFloatLoop
-
- b LShort
-
-
-// G4 Forward, long, 16-byte aligned, 32-byte cache ops, use DCBA and DCBT.
-// r0/cr0 = #bytes to 32-byte align
-
-LForwardVecAlig32Dcba:
- bnel+ LAlign32 // align destination iff necessary
- bl LPrepareForwardVectors
- mtlr ra // restore return address before loading c128
- li c128,128
- b 1f // enter aligned loop
-
- .align 5 // long loop heads should be at least 16-byte aligned
-1: // loop over aligned 64-byte chunks
- dcbt c96,rs // pre-fetch three cache lines ahead
- dcbt c128,rs // and four
- lvx v1,0,rs
- lvx v2,c16,rs
- lvx v3,c32,rs
- lvx v4,c48,rs
- addi rs,rs,64
- dcba 0,rd // avoid read of destination cache lines
- stvx v1,0,rd
- stvx v2,c16,rd
- dcba c32,rd
- stvx v3,c32,rd
- stvx v4,c48,rd
- addi rd,rd,64
- bdnz 1b
-
-LForwardVectorAlignedEnd: // r0/cr0=#quadwords, rv=VRSave, cr7=low 4 bits of rc, cr6 set on cr7
- beq- 3f // no leftover quadwords
- mtctr r0
-2: // loop over remaining quadwords (1-3)
- lvx v1,0,rs
- addi rs,rs,16
- stvx v1,0,rd
- addi rd,rd,16
- bdnz 2b
-3:
- mtspr VRSave,rv // restore bitmap of live vr's
- bne cr6,LShort16 // handle last 0-15 bytes if any
- blr
-
-
-// G4 Forward, long, 16-byte aligned, 32-byte cache, use DCBT but not DCBA.
-// r0/cr0 = #bytes to 32-byte align
-
-LForwardVecAlig32NoDcba:
- bnel+ LAlign32 // align destination iff necessary
- bl LPrepareForwardVectors
- mtlr ra // restore return address before loading c128
- li c128,128
- b 1f // enter aligned loop
-
- .align 4 // balance 13-word loop between QWs...
- nop // ...which improves performance 5% +/-
- nop
-1: // loop over aligned 64-byte chunks
- dcbt c96,rs // pre-fetch three cache lines ahead
- dcbt c128,rs // and four
- lvx v1,0,rs
- lvx v2,c16,rs
- lvx v3,c32,rs
- lvx v4,c48,rs
- addi rs,rs,64
- stvx v1,0,rd
- stvx v2,c16,rd
- stvx v3,c32,rd
- stvx v4,c48,rd
- addi rd,rd,64
- bdnz 1b
-
- b LForwardVectorAlignedEnd
-
-
-// G4 Forward, long, unaligned, 32-byte cache ops, use DCBT and DCBA. At least on
-// some CPUs, this routine is no slower than the simpler aligned version that does
-// not use permutes. But it cannot be used with aligned operands, because of the
-// way it prefetches source QWs.
-// r0/cr0 = #bytes to 32-byte align
-
-LForwardVecUnal32Dcba:
- bnel+ LAlign32 // align destination iff necessary
- bl LPrepareForwardVectors
- lvx v1,0,rs // prime loop
- mtlr ra // restore return address before loading c128
- lvsl vp,0,rs // get permute vector to shift left
- li c128,128
- b 1f // enter aligned loop
-
- .align 4 // long loop heads should be at least 16-byte aligned
-1: // loop over aligned 64-byte destination chunks
- lvx v2,c16,rs
- dcbt c96,rs // touch 3rd cache line ahead
- lvx v3,c32,rs
- dcbt c128,rs // touch 4th cache line ahead
- lvx v4,c48,rs
- addi rs,rs,64
- vperm vx,v1,v2,vp
- lvx v1,0,rs
- vperm vy,v2,v3,vp
- dcba 0,rd // avoid read of destination lines
- stvx vx,0,rd
- vperm vz,v3,v4,vp
- stvx vy,c16,rd
- dcba c32,rd
- vperm vx,v4,v1,vp
- stvx vz,c32,rd
- stvx vx,c48,rd
- addi rd,rd,64
- bdnz 1b
-
-LForwardVectorUnalignedEnd: // r0/cr0=#QWs, rv=VRSave, v1=next QW, cr7=(rc & F), cr6 set on cr7
- beq- 3f // no leftover quadwords
- mtctr r0
-2: // loop over remaining quadwords
- lvx v2,c16,rs
- addi rs,rs,16
- vperm vx,v1,v2,vp
- vor v1,v2,v2 // v1 <- v2
- stvx vx,0,rd
- addi rd,rd,16
- bdnz 2b
-3:
- mtspr VRSave,rv // restore bitmap of live vr's
- bne cr6,LShort16 // handle last 0-15 bytes if any
- blr
-
-
-// G4 Forward, long, unaligned, 32-byte cache ops, use DCBT but not DCBA.
-// r0/cr0 = #bytes to 32-byte align
-
-LForwardVecUnal32NoDcba:
- bnel+ LAlign32 // align destination iff necessary
- bl LPrepareForwardVectors
- lvx v1,0,rs // prime loop
- mtlr ra // restore return address before loading c128
- lvsl vp,0,rs // get permute vector to shift left
- li c128,128
- b 1f // enter aligned loop
-
- .align 4
- nop // balance 17-word loop between QWs
- nop
-1: // loop over aligned 64-byte destination chunks
- lvx v2,c16,rs
- dcbt c96,rs // touch 3rd cache line ahead
- lvx v3,c32,rs
- dcbt c128,rs // touch 4th cache line ahead
- lvx v4,c48,rs
- addi rs,rs,64
- vperm vx,v1,v2,vp
- lvx v1,0,rs
- vperm vy,v2,v3,vp
- stvx vx,0,rd
- vperm vz,v3,v4,vp
- stvx vy,c16,rd
- vperm vx,v4,v1,vp
- stvx vz,c32,rd
- stvx vx,c48,rd
- addi rd,rd,64
- bdnz 1b
-
- b LForwardVectorUnalignedEnd
-
-
-// G3 Reverse, long, unaligned.
-
-LReverseWord:
- bl LAlign8Reverse // 8-byte align destination
- mtlr ra // restore return address
- srwi r0,rc,5 // get count of 32-byte chunks to xfer (> 1)
- mtctr r0
-1:
- lwz w1,-4(rs)
- lwz w2,-8(rs)
- lwz w3,-12(rs)
- lwz w4,-16(rs)
- stw w1,-4(rd)
- lwz w5,-20(rs)
- stw w2,-8(rd)
- lwz w6,-24(rs)
- stw w3,-12(rd)
- lwz w7,-28(rs)
- stw w4,-16(rd)
- lwzu w8,-32(rs)
- stw w5,-20(rd)
- stw w6,-24(rd)
- stw w7,-28(rd)
- stwu w8,-32(rd)
- bdnz 1b
-
- b LShortReverse
-
-
-// G3 Reverse, long, word aligned.
-
-LReverseFloat:
- bl LAlign8Reverse // 8-byte align
- mtlr ra // restore return address
- srwi r0,rc,5 // get count of 32-byte chunks to xfer (> 1)
- mtctr r0
-1:
- lfd f0,-8(rs)
- lfd f1,-16(rs)
- lfd f2,-24(rs)
- lfdu f3,-32(rs)
- stfd f0,-8(rd)
- stfd f1,-16(rd)
- stfd f2,-24(rd)
- stfdu f3,-32(rd)
- bdnz 1b
-
- b LShortReverse
-
-
-// G4 Reverse, long, 16-byte aligned, 32-byte DCBT but no DCBA.
-
-LReverseVectorAligned32:
- bl LAlign32Reverse // 32-byte align destination iff necessary
- bl LPrepareReverseVectors
- mtlr ra // restore return address before loading cm129
- li cm129,-129
- b 1f // enter aligned loop
-
- .align 4
- nop // must start in 3rd word of QW...
- nop // ...to keep balanced
-1: // loop over aligned 64-byte chunks
- dcbt cm97,rs // pre-fetch three cache lines ahead
- dcbt cm129,rs // and four
- lvx v1,cm1,rs
- lvx v2,cm17,rs
- lvx v3,cm33,rs
- lvx v4,cm49,rs
- subi rs,rs,64
- stvx v1,cm1,rd
- stvx v2,cm17,rd
- stvx v3,cm33,rd
- stvx v4,cm49,rd
- subi rd,rd,64
- bdnz 1b
-
-LReverseVectorAlignedEnd: // cr0/r0=#quadwords, rv=VRSave, cr7=low 4 bits of rc, cr6 set on cr7
- beq 3f // no leftover quadwords
- mtctr r0
-2: // loop over 1-3 quadwords
- lvx v1,cm1,rs
- subi rs,rs,16
- stvx v1,cm1,rd
- subi rd,rd,16
- bdnz 2b
-3:
- mtspr VRSave,rv // restore bitmap of live vr's
- bne cr6,LShortReverse16 // handle last 0-15 bytes iff any
- blr
-
-
-// G4 Reverse, long, unaligned, 32-byte DCBT.
-
-LReverseVectorUnal32:
- bl LAlign32Reverse // align destination iff necessary
- bl LPrepareReverseVectors
- lvx v1,cm1,rs // prime loop
- mtlr ra // restore return address before loading cm129
- lvsl vp,0,rs // get permute vector to shift left
- li cm129,-129
- b 1f // enter aligned loop
-
- .align 4
- nop // start loop in 3rd word on QW to balance
- nop
-1: // loop over aligned 64-byte destination chunks
- lvx v2,cm17,rs
- dcbt cm97,rs // touch in 3rd source block
- lvx v3,cm33,rs
- dcbt cm129,rs // touch in 4th
- lvx v4,cm49,rs
- subi rs,rs,64
- vperm vx,v2,v1,vp
- lvx v1,cm1,rs
- vperm vy,v3,v2,vp
- stvx vx,cm1,rd
- vperm vz,v4,v3,vp
- stvx vy,cm17,rd
- vperm vx,v1,v4,vp
- stvx vz,cm33,rd
- stvx vx,cm49,rd
- subi rd,rd,64
- bdnz 1b
-
-LReverseVectorUnalignedEnd: // r0/cr0=#QWs, rv=VRSave, v1=source QW, cr7=low 4 bits of rc, cr6 set on cr7
- beq 3f // no leftover quadwords
- mtctr r0
-2: // loop over 1-3 quadwords
- lvx v2,cm17,rs
- subi rs,rs,16
- vperm vx,v2,v1,vp
- vor v1,v2,v2 // v1 <- v2
- stvx vx,cm1,rd
- subi rd,rd,16
- bdnz 2b
-3:
- mtspr VRSave,rv // restore bitmap of live vr's
- bne cr6,LShortReverse16 // handle last 0-15 bytes iff any
- blr
-
-
-// Subroutine to prepare for 64-byte forward vector loops.
-// Returns many things:
-// ctr = number of 64-byte chunks to move
-// r0/cr0 = leftover QWs to move
-// cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
-// cr6 = beq if leftover byte count is 0
-// c16..c96 loaded
-// rv = original value of VRSave
-// NB: c128 is not set up here (callers load it after restoring lr), since that register still holds "ra"
-
-LPrepareForwardVectors:
- mfspr rv,VRSave // get bitmap of live vector registers
- srwi r0,rc,6 // get count of 64-byte chunks to move (>=1)
- oris w1,rv,0xFF00 // we use v0-v7
- mtcrf 0x01,rc // prepare for moving last 0-15 bytes in LShort16
- rlwinm w3,rc,0,28,31 // move last 0-15 byte count to w3 too
- mtspr VRSave,w1 // update mask
- li c16,16 // get constants used in ldvx/stvx
- li c32,32
- mtctr r0 // set up loop count
- cmpwi cr6,w3,0 // set cr6 on leftover byte count
- li c48,48
- li c96,96
- rlwinm. r0,rc,28,30,31 // get number of quadword leftovers (0-3) and set cr0
- blr
-
-
-// Subroutine to prepare for 64-byte reverse vector loops.
-// Returns many things:
-// ctr = number of 64-byte chunks to move
-// r0/cr0 = leftover QWs to move
-// cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
-// cr6 = beq if leftover byte count is 0
-// cm1..cm97 loaded
-// rv = original value of VRSave
-// NB: cm129 is not set up here (callers load it after restoring lr), since that register still holds "ra"
-
-LPrepareReverseVectors:
- mfspr rv,VRSave // get bitmap of live vector registers
- srwi r0,rc,6 // get count of 64-byte chunks to move (>=1)
- oris w1,rv,0xFF00 // we use v0-v7
- mtcrf 0x01,rc // prepare for moving last 0-15 bytes in LShortReverse16
- rlwinm w3,rc,0,28,31 // move last 0-15 byte count to w3 too
- mtspr VRSave,w1 // update mask
- li cm1,-1 // get constants used in ldvx/stvx
- li cm17,-17
- mtctr r0 // set up loop count
- cmpwi cr6,w3,0 // set cr6 on leftover byte count
- li cm33,-33
- li cm49,-49
- rlwinm. r0,rc,28,30,31 // get number of quadword leftovers (0-3) and set cr0
- li cm97,-97
- blr
-
-
-// Subroutine to align destination on a 32-byte boundary.
-// r0 = number of bytes to xfer (0-31)
-
-LAlign32:
- mtcrf 0x01,r0 // length to cr (faster to change 1 CR at a time)
- mtcrf 0x02,r0
- sub rc,rc,r0 // adjust length
- bf 31,1f // skip if no odd bit
- lbz w1,0(rs)
- addi rs,rs,1
- stb w1,0(rd)
- addi rd,rd,1
-1:
- bf 30,2f // halfword to move?
- lhz w1,0(rs)
- addi rs,rs,2
- sth w1,0(rd)
- addi rd,rd,2
-2:
- bf 29,3f // word?
- lwz w1,0(rs)
- addi rs,rs,4
- stw w1,0(rd)
- addi rd,rd,4
-3:
- bf 28,4f // doubleword?
- lwz w1,0(rs)
- lwz w2,4(rs)
- addi rs,rs,8
- stw w1,0(rd)
- stw w2,4(rd)
- addi rd,rd,8
-4:
- bflr 27 // done if no quadword to move
- lwz w1,0(rs)
- lwz w2,4(rs)
- lwz w3,8(rs)
- lwz w4,12(rs)
- addi rs,rs,16
- stw w1,0(rd)
- stw w2,4(rd)
- stw w3,8(rd)
- stw w4,12(rd)
- addi rd,rd,16
- blr
-
-// Subroutine to align destination if necessary on a 32-byte boundary for reverse moves.
-// rs and rd still point to low end of operands
-// we adjust rs and rd to point to last byte moved
-
-LAlign32Reverse:
- add rd,rd,rc // point to last byte moved (ie, 1 past end of operands)
- add rs,rs,rc
- andi. r0,rd,0x1F // r0 <- #bytes that must be moved to align destination
- mtcrf 0x01,r0 // length to cr (faster to change 1 CR at a time)
- mtcrf 0x02,r0
- sub rc,rc,r0 // update length
- beqlr- // destination already 32-byte aligned
-
- bf 31,1f // odd byte?
- lbzu w1,-1(rs)
- stbu w1,-1(rd)
-1:
- bf 30,2f // halfword to move?
- lhzu w1,-2(rs)
- sthu w1,-2(rd)
-2:
- bf 29,3f // word?
- lwzu w1,-4(rs)
- stwu w1,-4(rd)
-3:
- bf 28,4f // doubleword?
- lwz w1,-4(rs)
- lwzu w2,-8(rs)
- stw w1,-4(rd)
- stwu w2,-8(rd)
-4:
- bflr 27 // done if no quadwords
- lwz w1,-4(rs)
- lwz w2,-8(rs)
- lwz w3,-12(rs)
- lwzu w4,-16(rs)
- stw w1,-4(rd)
- stw w2,-8(rd)
- stw w3,-12(rd)
- stwu w4,-16(rd)
- blr
-
-
-// Subroutine to align destination on an 8-byte boundary for reverse moves.
-// rs and rd still point to low end of operands
-// we adjust rs and rd to point to last byte moved
-
-LAlign8Reverse:
- add rd,rd,rc // point to last byte moved (ie, 1 past end of operands)
- add rs,rs,rc
- andi. r0,rd,0x7 // r0 <- #bytes that must be moved to align destination
- beqlr- // destination already 8-byte aligned
- mtctr r0 // set up for loop
- sub rc,rc,r0 // update length
-1:
- lbzu w1,-1(rs)
- stbu w1,-1(rd)
- bdnz 1b
-
- blr
-
-
-// Called by pthread initialization to set up the branch table pointer based on
-// the CPU capability vector. This routine may be called more than once (for
-// example, during testing.)
-
-// Size of the buffer we use to do DCBA timing on G4:
-#define kBufSiz 1024
-
-// Stack frame size, which contains the 128-byte-aligned buffer:
-#define kSFSize (kBufSiz+128+16)
-
-// Iterations of the timing loop:
-#define kLoopCnt 5
-
-// Bit in cr5 used as a flag in timing loop:
-#define kDCBA 22
-
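-// In outline (hypothetical C, not part of the build), the DCBA selection below is:
-//
-//     uint32_t with_dcba = best_of_5_timing_passes(USE_DCBA);   // stands in for LTest32
-//     uint32_t without   = best_of_5_timing_passes(NO_DCBA);
-//     with_dcba += with_dcba >> 3;     // bias ~12% against DCBA (slower when cache is warm)
-//     return (with_dcba < without) ? kUseDcba : kNoDcba;
-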
-__bcopy_initialize: // int _bcopy_initialize(void)
- mflr ra // get return
- stw ra,8(r1) // save
- stwu r1,-kSFSize(r1) // carve our temp buffer from the stack
- addi w6,r1,127+16 // get base address...
- rlwinm w6,w6,0,0,24 // ...of our buffer, 128-byte aligned
- bcl 20,31,1f // get our PIC base
-1:
- mflr w1
- addis w2,w1,ha16(__cpu_capabilities - 1b)
- lwz w3,lo16(__cpu_capabilities - 1b)(w2)
- andi. r0,w3,kUseDcba+kNoDcba+kCache32+k64Bit+kHasAltivec
- cmpwi r0,kCache32+kHasAltivec // untyped G4?
- li w8,0 // assume no need to test
- bne 2f // not an untyped G4, so do not test
-
- // G4, but neither kUseDcba or kNoDcba are set. Time and select fastest.
-
- crset kDCBA // first, use DCBA
- bl LTest32 // time it
- mr w8,w4 // w8 <- best time using DCBA
- srwi r0,w8,3 // bias 12 pct in favor of not using DCBA...
- add w8,w8,r0 // ...because DCBA is always slower with warm cache
- crclr kDCBA
- bl LTest32 // w4 <- best time without DCBA
- cmplw w8,w4 // which is better?
- li w8,kUseDcba // assume using DCBA is faster
- blt 2f
- li w8,kNoDcba // no DCBA is faster
-
- // What branch table to use?
-
-2: // here with w8 = 0, kUseDcba, or kNoDcba
- bcl 20,31,4f // get our PIC base again
-4:
- mflr w1
- addis w2,w1,ha16(__cpu_capabilities - 4b)
- lwz w3,lo16(__cpu_capabilities - 4b)(w2)
- or w3,w3,w8 // add in kUseDcba or kNoDcba if untyped G4
- mr r3,w8 // return dynamic selection, if any (used in testing)
-
- andi. r0,w3,kHasAltivec+k64Bit+kCache128+kCache64+kCache32+kUseDcba+kNoDcba
- cmpwi r0,kHasAltivec+kCache32+kUseDcba // G4 with DCBA?
- addis w4,w1,ha16(LG4UseDcba - 4b)
- addi w4,w4,lo16(LG4UseDcba - 4b)
- beq 5f
-
- andi. r0,w3,kHasAltivec+k64Bit+kCache128+kCache64+kCache32+kUseDcba+kNoDcba
- cmpwi r0,kHasAltivec+kCache32+kNoDcba // G4 without DCBA?
- addis w4,w1,ha16(LG4NoDcba - 4b)
- addi w4,w4,lo16(LG4NoDcba - 4b)
- beq 5f
-
- andi. r0,w3,kHasAltivec+k64Bit+kCache128+kCache64+kCache32
- cmpwi r0,kCache32 // G3?
- addis w4,w1,ha16(LG3 - 4b)
- addi w4,w4,lo16(LG3 - 4b)
- beq 5f
-
- // Map unrecognized CPU types to G3 (lowest common denominator)
-
-5: // w4 <- branch table pointer
- addis w5,w1,ha16(LBranchTablePtr - 4b)
- stw w4,lo16(LBranchTablePtr - 4b)(w5)
- lwz ra,kSFSize+8(r1) // recover return address
- mtlr ra // restore it
- lwz r1,0(r1) // pop off our stack frame
- blr // return dynamic selection (or 0) in r3
-
-
-// Subroutine to time a 32-byte cache.
-// kDCBA = set if we should use DCBA
-// w6 = base of buffer to use for test (kBufSiz bytes)
-// w4 = we return time of fastest loop in w4
-
-LTest32:
- li w1,kLoopCnt // number of times to loop
- li w4,-1 // initialize fastest time
-1:
- mr rd,w6 // initialize buffer ptr
- li r0,kBufSiz/32 // r0 <- cache blocks to test
- mtctr r0
-2:
- dcbf 0,rd // first, force the blocks out of the cache
- addi rd,rd,32
- bdnz 2b
- sync // make sure all the flushes take
- mr rd,w6 // re-initialize buffer ptr
- mtctr r0 // reset cache-block count
- mftbu w5 // remember upper half so we can check for carry
- mftb w2 // start the timer
-3: // loop over cache blocks
- bf kDCBA,4f // should we DCBA?
- dcba 0,rd
-4:
- stfd f1,0(rd) // store the entire cache block
- stfd f1,8(rd)
- stfd f1,16(rd)
- stfd f1,24(rd)
- addi rd,rd,32
- bdnz 3b
- mftb w3
- mftbu r0
- cmpw r0,w5 // did timebase carry?
- bne 1b // yes, retest rather than fuss
- sub w3,w3,w2 // w3 <- time for this loop
- cmplw w3,w4 // faster than current best?
- bge 5f // no
- mr w4,w3 // remember fastest time through loop
-5:
- subi w1,w1,1 // decrement outer loop count
- cmpwi w1,0 // more to go?
- bne 1b // loop if so
- blr
-
\ No newline at end of file
+ ba _COMM_PAGE_MEMCPY
+