X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/1c79356b52d46aa6b508fb032f5ae709b1f2897b..7e4a7d3939db04e70062ae6c7bf24b8c8b2f5a7c:/osfmk/ppc/bcopy.s diff --git a/osfmk/ppc/bcopy.s b/osfmk/ppc/bcopy.s index 95ad3ea6b..bc05940f2 100644 --- a/osfmk/ppc/bcopy.s +++ b/osfmk/ppc/bcopy.s @@ -1,219 +1,417 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2002-2004 Apple Computer, Inc. All rights reserved. * - * @APPLE_LICENSE_HEADER_START@ + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. * - * @APPLE_LICENSE_HEADER_END@ + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ ; -; Copy bytes of data around. handles overlapped data. -; -; Change this to use Altivec later on, and maybe floating point. +; Copy bytes of data around. Handles overlapped data. ; -; NOTE: This file compiles and executes on both MacOX 8.x (Codewarrior) -; and MacOX X. The "#if 0"s are treated as comments by CW so the -; stuff between them is included by CW and excluded on MacOX X. -; Same with the "#include"s. ; #include #include +#include -; Use CR5_lt to indicate non-cached +; These routines use CR5 for certain flags: +; Use CR5_lt to indicate non-cached (in bcopy and memcpy) #define noncache 20 -; Use CR5_gt to indicate that we need to turn data translation back on -#define fixxlate 21 -#if 0 -noncache: equ 20 -fixxlate: equ 21 -#endif -#if 0 -br0: equ 0 -#endif + +; The bcopy_phys variants use a stack frame so they can call bcopy as a subroutine. +#define BCOPY_SF_SIZE 32 // total size +#define BCOPY_SF_MSR 16 // we save caller's MSR here (possibly minus VEC and FP) + + +#define kShort 32 // short operands are special cased + + +; void bcopy_physvir_32(from, to, nbytes) ; -; bcopy_nc(from, to, nbytes) +; Attempt to copy physically addressed memory with translation on if conditions are met. +; Otherwise do a normal bcopy_phys. 
This routine is used because some 32-bit processors +; are very slow doing real-mode (translation off) copies, so we set up temporary BATs +; for the passed phys addrs and do the copy with translation on. ; -; bcopy_nc operates on non-cached memory so we can not use any kind -; of cache instructions. +; Rules are: - neither source nor destination can cross a page. +; - Interrupts must be disabled when this routine is called. +; - Translation must be on when called. ; +; To do the copy, we build a 128KB DBAT for both the source and sink. If both are in the same block, only one +; is loaded. We do not touch the IBATs, so there is no issue if either physical page +; address is the same as the virtual address of the instructions we are executing. +; +; At the end, we invalidate the used DBATs. +; +; Note that the address parameters are long longs. We will transform these to 64-bit +; values. Note that on 32-bit architectures this will ignore the high half of the +; passed-in value. This should be ok since we can not have addresses bigger than 32 bits +; there anyhow. +; +; Note also that this routine is used only on 32-bit machines. If you're contemplating use +; on a 64-bit processor, use the physical memory window instead; please refer to copypv() +; for an example of how this is done.
+ + .align 5 + .globl EXT(bcopy_physvir_32) + +LEXT(bcopy_physvir_32) + mflr r0 ; get return address + rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg + mfsprg r8,2 ; get processor feature flags + stw r0,8(r1) ; save return address + rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits + stwu r1,-BCOPY_SF_SIZE(r1) ; push on a stack frame so we can call bcopy + mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test + subi r0,r7,1 ; get length - 1 + rlwinm r4,r5,0,1,0 ; Duplicate high half of long long paddr into top of reg + add r11,r3,r0 ; Point to last byte of source (r3 = "from") + mr r5,r7 ; Get the length into the right register + rlwimi r4,r6,0,0,31 ; Combine bottom of long long to full 64-bits + +; This test for page overflow may not work if the length is negative. Negative lengths are invalid input +; to bcopy_physvir_32() on 32-bit machines, and will result in a panic. + + add r12,r4,r0 ; Point to last byte of sink (r4 = "to") + xor r7,r11,r3 ; See if source went to next page + xor r8,r12,r4 ; See if sink went to next page + or r0,r7,r8 ; Combine wrap + +// li r9,((PTE_WIMG_CB_CACHED_COHERENT<<3)|2) ; Set default attributes + li r9,((2<<3)|2) ; Set default attributes + rlwinm. r0,r0,0,0,19 ; Did we overflow a page? + li r7,2 ; Set validity flags + li r8,2 ; Set validity flags + bne- bcopy_phys1 ; Overflowed page, do normal physical copy...
+ + rlwimi r11,r9,0,15,31 ; Set source lower DBAT value + rlwimi r12,r9,0,15,31 ; Set sink lower DBAT value + rlwimi r7,r11,0,0,14 ; Set source upper DBAT value + rlwimi r8,r12,0,0,14 ; Set sink upper DBAT value + cmplw cr1,r11,r12 ; See if sink and source are same block + + sync + + mtdbatl 0,r11 ; Set source lower DBAT + mtdbatu 0,r7 ; Set source upper DBAT + + beq- cr1,bcpvsame ; Source and sink are in same block + + mtdbatl 1,r12 ; Set sink lower DBAT + mtdbatu 1,r8 ; Set sink upper DBAT + +bcpvsame: + sync ; wait for the BATs to stabilize + isync + + bl EXT(bcopy) ; BATs set up, args in r3-r5, so do the copy with DR on + + li r0,0 ; Get set to invalidate upper half of BATs + sync ; Make sure all is well + mtdbatu 0,r0 ; Clear source upper DBAT + mtdbatu 1,r0 ; Clear sink upper DBAT + sync + isync + + lwz r0,BCOPY_SF_SIZE+8(r1) ; get return address + addi r1,r1,BCOPY_SF_SIZE ; pop off stack frame + mtlr r0 + blr - -#if 0 - IF 0 -#endif -ENTRY(bcopy_nc, TAG_NO_FRAME_USED) -#if 0 - ENDIF - export xbcopy_nc[DS] - tc xbcopy_nc[TC],xbcopy_nc[DS] - csect xbcopy_nc[DS] - dc.l .xbcopy_nc - dc.l TOC[tc0] - export .xbcopy_nc - csect xbcopy_nc[PR] -.xbcopy_nc: -#endif - - crset noncache ; Set non-cached - b bcpswap - -; ; void bcopy_phys(from, to, nbytes) -; Turns off data translation before the copy. Note, this one will -; not work in user state ; +; Turns off data translation before the copy. This one will not work in user state. +; This routine is used on 32 and 64-bit machines. +; +; Note that the address parameters are long longs. We will transform these to 64-bit +; values. Note that on 32-bit architectures that this will ignore the high half of the +; passed in value. This should be ok since we can not have any bigger than 32 bit addresses +; there anyhow. +; +; Also note that you probably will not be happy if either the sink or source spans across the +; boundary between RAM and I/O space.
Good chance of hanging the machine and this code +; will not check, so be careful. +; +; NOTE: when called, translation must be on, and we must be in 32-bit mode. +; Interrupts may or may not be disabled. + + .align 5 + .globl EXT(bcopy_phys) + +LEXT(bcopy_phys) + mflr r0 ; get return address + rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg + stw r0,8(r1) ; save return address + mfsprg r8,2 ; get processor feature flags + stwu r1,-BCOPY_SF_SIZE(r1) ; push on a stack frame so we can call bcopy + rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits + rlwinm r4,r5,0,1,0 ; Duplicate high half of long long paddr into top of reg + mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test + rlwimi r4,r6,0,0,31 ; Combine bottom of long long to full 64-bits + mr r5,r7 ; Get the length into the right register + +bcopy_phys1: ; enter from bcopy_physvir_32 with pf64Bit in cr6 and parms in r3-r5 + mfmsr r9 ; Get the MSR + lis r6,hi16(MASK(MSR_VEC)) ; Get vector enable + ori r6,r6,lo16(MASK(MSR_FP)|MASK(MSR_DR)) ; Add in FP and DR + andc r9,r9,r6 ; unconditionally turn DR, VEC, and FP off + bt++ pf64Bitb,bcopy_phys64 ; skip if 64-bit (only they take hint) -#if 0 - IF 0 -#endif -ENTRY(bcopy_phys, TAG_NO_FRAME_USED) -#if 0 - ENDIF - export xbcopy_phys[DS] - tc bcopy_physc[TC],bcopy_phys[DS] - csect bcopy_phys[DS] - dc.l .bcopy_phys - dc.l TOC[tc0] - export .bcopy_phys - csect bcopy_phys[PR] -.bcopy_phys: -#endif +; 32-bit CPUs - mfmsr r9 ; Get the MSR - crclr noncache ; Set cached - rlwinm. r8,r9,0,MSR_DR_BIT,MSR_DR_BIT ; Is data translation on?
- - cmplw cr1,r4,r3 ; Compare "to" and "from" - cmplwi cr7,r5,0 ; Check if we have a 0 length - mr r6,r3 ; Set source - beqlr- cr1 ; Bail if "to" and "from" are the same - xor r9,r9,r8 ; Turn off translation if it is on (should be) - beqlr- cr7 ; Bail if length is 0 - - mtmsr r9 ; Set DR translation off + mtmsr r9 ; turn DR, FP, and VEC off isync ; Wait for it - crnot fixxlate,cr0_eq ; Remember to turn on translation if it was - b copyit ; Go copy it... + bl EXT(bcopy) ; do the copy with translation off and caching on + + mfmsr r9 ; Get the MSR + ori r9,r9,lo16(MASK(MSR_DR)) ; turn translation back on (but leave VEC and FP off) + mtmsr r9 ; restore msr + isync ; wait for it to happen + lwz r0,BCOPY_SF_SIZE+8(r1) ; get return address once translation is back on + mtlr r0 + addi r1,r1,BCOPY_SF_SIZE ; pop off stack frame + blr + + +; 64-bit: turn DR off and SF on. + +bcopy_phys64: ; r9 = MSR with DR, VEC, and FP off + ori r8,r9,lo16(MASK(MSR_DR)) ; make a copy with DR back on... this is what we return to caller + srdi r2,r3,31 ; Get a 1 if source is in I/O memory + li r0,1 ; Note - we use this in a couple places below + srdi r10,r4,31 ; Get a 1 if sink is in I/O memory + std r8,BCOPY_SF_MSR(r1) ; save caller's MSR so we remember whether EE was on + rldimi r9,r0,63,MSR_SF_BIT ; set SF on in MSR we will copy with + cmpldi cr0,r2,1 ; Is source in I/O memory? + cmpldi cr7,r10,1 ; Is sink in I/O memory?
+ mtmsrd r9 ; turn 64-bit addressing on, data translation off + isync ; wait for it to happen + cror cr7_eq,cr0_eq,cr7_eq ; See if either source or sink is in I/O area + beq-- cr7,io_space_real_mode_copy ; an operand is in I/O space + + bl EXT(bcopy) ; do copy with DR off and SF on, cache enabled + +bcopy_phys64x: + mfmsr r9 ; Get the MSR we used to copy + rldicl r9,r9,0,MSR_SF_BIT+1 ; clear SF + ori r9,r9,lo16(MASK(MSR_DR)) ; turn translation back on + mtmsrd r9 ; turn 64-bit mode off, translation back on + isync ; wait for it to happen + lwz r0,BCOPY_SF_SIZE+8(r1) ; get return address once translation is back on + ld r8,BCOPY_SF_MSR(r1) ; get caller's MSR once translation is back on + mtlr r0 + mtmsrd r8,1 ; turn EE back on if necessary + addi r1,r1,BCOPY_SF_SIZE ; pop off stack frame + blr + +; We need to copy with DR off, but one of the operands is in I/O space. To avoid wedging U3, +; which cannot handle a cache burst in I/O space, we must turn caching off for the real memory access. +; This can only be done by setting bits in HID4. We cannot lose control and execute random code in +; this state, so we have to disable interrupts as well. This is an unpleasant hack.
+ +io_space_real_mode_copy: ; r0=1, r9=MSR we want to copy with + sldi r11,r0,31-MSR_EE_BIT ; Get a mask for the EE bit + sldi r0,r0,32+8 ; Get the right bit to turn off caching + andc r9,r9,r11 ; Turn off EE bit + mfspr r2,hid4 ; Get HID4 + mtmsrd r9,1 ; Force off EE + or r2,r2,r0 ; Set bit to make real accesses cache-inhibited + sync ; Sync up + mtspr hid4,r2 ; Make real accesses cache-inhibited + isync ; Toss prefetches + + lis r12,0xE000 ; Get the unlikeliest ESID possible + srdi r12,r12,1 ; Make 0x7FFFFFFFF0000000 + slbie r12 ; Make sure the ERAT is cleared + + sync + isync + + bl EXT(bcopy_nc) ; copy with SF on and EE, DR, VEC, and FP off, cache inhibited + + li r0,1 ; Get a 1 + sldi r0,r0,32+8 ; Get the same HID4 bit we set above to inhibit caching + mfspr r2,hid4 ; Get HID4 + andc r2,r2,r0 ; Clear the bit that makes real accesses cache-inhibited + sync ; Sync up + mtspr hid4,r2 ; Make real accesses not cache-inhibited + isync ; Toss prefetches + + lis r12,0xE000 ; Get the unlikeliest ESID possible + srdi r12,r12,1 ; Make 0x7FFFFFFFF0000000 + slbie r12 ; Make sure the ERAT is cleared + b bcopy_phys64x + +; +; shortcopy +; +; Special case short operands (<32 bytes), which are very common. Note that the check for +; reverse vs normal moves isn't quite correct in 64-bit mode; in rare cases we will move in +; reverse when it wasn't necessary to do so. This is OK, since performance of the two cases +; is similar. We do get the direction right when it counts (ie, when the operands overlap.) +; Also note that we use the G3/G4 "backend" code, even on G5. This is OK too, since G5 has +; plenty of load/store dispatch bandwidth in this case, the extra ops are hidden by latency, +; and using word instead of doubleword moves reduces the possibility of unaligned accesses, +; which cost about 20 cycles if they cross a 32-byte boundary on G5. Finally, because we +; might do unaligned accesses this code cannot be called from bcopy_nc().
+; r4 = destination +; r5 = length (<32) +; r6 = source +; r12 = (dest - source) + + .align 5 +shortcopy: + cmplw r12,r5 ; must move reverse if (dest-source)0) +; r6 = source +; r12 = (dest - source) +; cr5 = noncache flag + +copyit32: ; WARNING! can drop down to this label + cmplw cr1,r12,r5 ; must move reverse if (dest-source)0) +; r6 = source +; r8 = inverse of largest mask smaller than operand length +; r9 = neg(dest), used to compute alignment +; cr5 = noncache flag + +forward32bit: ; enter from 64-bit CPUs with word aligned uncached operands + rlwinm r7,r9,0,0x1F ; get bytes to 32-byte-align destination + andc. r0,r7,r8 ; limit to the maximum front end move + mtcrf 0x01,r0 ; move length to cr6 and cr7 one cr at a time... beq alline ; Already on a line... + mtcrf 0x02,r0 ; ...since moving more than one is slower on G4 and G5 + sub r5,r5,r0 ; Set the length left to move + bf 31,alhalf ; No single byte to do... lbz r7,0(r6) ; Get the byte addi r6,r6,1 ; Point to the next @@ -263,43 +461,45 @@ alquad: bf 27,alline ; No quad to do... ; Sink is line aligned here alline: rlwinm. r0,r5,27,5,31 ; Get the number of full lines to move - mtcrf 3,r5 ; Make branch mask for backend partial moves - rlwinm r11,r5,0,0,26 ; Get number of bytes we are going to move + mtcrf 0x02,r5 ; move length to cr6 and cr7 one cr at a time... + mtcrf 0x01,r5 ; ...since moving more than one is slower on G4 and G5 beq- backend ; No full lines to move - - sub r5,r5,r11 ; Calculate the residual - li r10,96 ; Stride for touch ahead - -nxtline: subic. 
r0,r0,1 ; Account for the line now - + + mtctr r0 ; set up loop count + li r0,96 ; Stride for touch ahead + b nxtline + + .align 4 +nxtline: + lwz r2,0(r6) ; Get the first word + lwz r5,4(r6) ; Get the second word + lwz r7,8(r6) ; Get the third word + lwz r8,12(r6) ; Get the fourth word + lwz r9,16(r6) ; Get the fifth word + lwz r10,20(r6) ; Get the sixth word + lwz r11,24(r6) ; Get the seventh word + lwz r12,28(r6) ; Get the eighth word bt- noncache,skipz ; Skip if we are not cached... - dcbz br0,r4 ; Blow away the whole line because we are replacing it - dcbt r6,r10 ; Touch ahead a bit - -skipz: lwz r7,0(r6) ; Get the first word - lwz r8,4(r6) ; Get the second word - lwz r9,8(r6) ; Get the third word - stw r7,0(r4) ; Save the first word - lwz r11,12(r6) ; Get the fourth word - stw r8,4(r4) ; Save the second word - lwz r7,16(r6) ; Get the fifth word - stw r9,8(r4) ; Save the third word - lwz r8,20(r6) ; Get the sixth word - stw r11,12(r4) ; Save the fourth word - lwz r9,24(r6) ; Get the seventh word - stw r7,16(r4) ; Save the fifth word - lwz r11,28(r6) ; Get the eighth word + dcbz 0,r4 ; Blow away the whole line because we are replacing it + dcbt r6,r0 ; Touch ahead a bit +skipz: addi r6,r6,32 ; Point to the next - stw r8,20(r4) ; Save the sixth word - stw r9,24(r4) ; Save the seventh word - stw r11,28(r4) ; Save the eighth word + stw r2,0(r4) ; Save the first word + stw r5,4(r4) ; Save the second word + stw r7,8(r4) ; Save the third word + stw r8,12(r4) ; Save the fourth word + stw r9,16(r4) ; Save the fifth word + stw r10,20(r4) ; Save the sixth word + stw r11,24(r4) ; Save the seventh word + stw r12,28(r4) ; Save the eighth word addi r4,r4,32 ; Bump sink - bgt+ nxtline ; Do the next line, if any... + bdnz+ nxtline ; Do the next line, if any... ; Move backend quadword -backend: bf 27,noquad ; No quad to do... +backend: ; Join here from "shortcopy" for forward moves <32 bytes + bf 27,noquad ; No quad to do... 
lwz r7,0(r6) ; Get the first word lwz r8,4(r6) ; Get the second word lwz r9,8(r6) ; Get the third word @@ -339,46 +539,33 @@ noword: bf 30,nohalf ; No halfword to do... ; Move backend byte -nohalf: bf 31,bcpydone ; Leave cuz we are all done... +nohalf: bflr 31 ; Leave cuz we are all done... lbz r7,0(r6) ; Get the byte stb r7,0(r4) ; Save the single + blr -bcpydone: bflr fixxlate ; Leave now if we do not need to fix translation... - mfmsr r9 ; Get the MSR - ori r9,r9,lo16(MASK(MSR_DR)) ; Turn data translation on - mtmsr r9 ; Just do it - isync ; Hang in there - blr ; Leave cuz we are all done... -; -; 0123456789ABCDEF0123456789ABCDEF -; 0123456789ABCDEF0123456789ABCDEF -; F -; DE -; 9ABC -; 12345678 -; 123456789ABCDEF0 -; 0 +; Reverse moves on 32-bit machines, also reverse word aligned uncached moves on 64-bit machines. +; NOTE: we never do an unaligned access if the source and destination are "relatively" +; word aligned. We depend on this in the uncached case on 64-bit processors. +; These are slower because we don't bother with dcbz. Fortunately, reverse moves are uncommon. +; r4 = destination +; r5 = length (>0) +; r6 = source +; r8 = inverse of largest mask smaller than operand length +; cr5 = noncache flag (but we don't dcbz anyway) -; -; Here is where we handle a forward overlapping move. These will be slow -; because we can not kill the cache of the destination until after we have -; loaded/saved the source area. Also, because reading memory backwards is -; slower when the cache line needs to be loaded because the critical -; doubleword is loaded first, i.e., the last, then it goes back to the first, -; and on in order. That means that when we are at the second to last DW we -; have to wait until the whole line is in cache before we can proceed. 
-; - -fwdovrlap: add r4,r5,r4 ; Point past the last sink byte +reverse32bit: ; here from 64-bit code with word aligned uncached operands + add r4,r5,r4 ; Point past the last sink byte add r6,r5,r6 ; Point past the last source byte - and r0,r4,r8 ; Apply movement limit - li r12,-1 ; Make sure we touch in the actual line - mtcrf 3,r0 ; Figure out the best way to move backwards + rlwinm r7,r4,0,0x1F ; Calculate the length to align dest on cache boundary + li r12,-1 ; Make sure we touch in the actual line + andc. r0,r7,r8 ; Apply movement limit dcbt r12,r6 ; Touch in the last line of source - rlwinm. r0,r0,0,27,31 ; Calculate the length to adjust to cache boundary + mtcrf 0x01,r0 ; move length to cr6 and cr7 one cr at a time... dcbtst r12,r4 ; Touch in the last line of the sink - beq- balline ; Aready on cache line boundary + mtcrf 0x02,r0 ; ...since moving more than one is slower on G4 and G5 + beq- balline ; Aready on cache line boundary (or too short to bother) sub r5,r5,r0 ; Precaculate move length left after alignment @@ -431,19 +618,14 @@ balquad: bf 27,balline ; No quad to do... ; Sink is line aligned here balline: rlwinm. r0,r5,27,5,31 ; Get the number of full lines to move - mtcrf 3,r5 ; Make branch mask for backend partial moves + mtcrf 0x02,r5 ; move length to cr6 and cr7 one cr at a time... + mtcrf 0x01,r5 ; ...since moving more than one is slower on G4 and G5 beq- bbackend ; No full lines to move -#if 0 - stwu r1,-8(r1) ; Dummy stack for MacOS - stw r2,4(r1) ; Save RTOC -#endif - - -; Registers in use: R0, R1, R3, R4, R5, R6 -; Registers not in use: R2, R7, R8, R9, R10, R11, R12 - Ok, we can make another free for 8 of them + mtctr r0 ; set up loop count + b bnxtline -bnxtline: subic. r0,r0,1 ; Account for the line now - + .align 4 +bnxtline: lwz r7,-32(r6) ; Get the first word lwz r5,-28(r6) ; Get the second word lwz r2,-24(r6) ; Get the third word @@ -455,10 +637,7 @@ bnxtline: subic. 
r0,r0,1 ; Account for the line now subi r6,r6,32 ; Point to the next stw r7,-32(r4) ; Get the first word - ble- bnotouch ; Last time, skip touch of source... - dcbt br0,r6 ; Touch in next source line - -bnotouch: stw r5,-28(r4) ; Get the second word + stw r5,-28(r4) ; Get the second word stw r2,-24(r4) ; Get the third word stw r12,-20(r4) ; Get the third word stw r11,-16(r4) ; Get the fifth word @@ -467,11 +646,7 @@ bnotouch: stw r5,-28(r4) ; Get the second word stw r8,-4(r4) ; Get the eighth word subi r4,r4,32 ; Bump sink - bgt+ bnxtline ; Do the next line, if any... -#if 0 - lwz r2,4(r1) ; Restore RTOC - lwz r1,0(r1) ; Pop dummy stack -#endif + bdnz+ bnxtline ; Do the next line, if any... ; ; Note: We touched these lines in at the beginning @@ -479,7 +654,8 @@ bnotouch: stw r5,-28(r4) ; Get the second word ; Move backend quadword -bbackend: bf 27,bnoquad ; No quad to do... +bbackend: ; Join here from "shortcopy" for reverse moves of <32 bytes + bf 27,bnoquad ; No quad to do... lwz r7,-16(r6) ; Get the first word lwz r8,-12(r6) ; Get the second word lwz r9,-8(r6) ; Get the third word @@ -519,8 +695,287 @@ bnoword: bf 30,bnohalf ; No halfword to do... ; Move backend byte -bnohalf: bflr 31 ; Leave cuz we are all done... +bnohalf: bflr 31 ; Leave cuz we are all done... lbz r7,-1(r6) ; Get the byte stb r7,-1(r4) ; Save the single - - blr ; Leave cuz we are all done... + blr + + +// Here on 64-bit processors, which have a 128-byte cache line. This can be +// called either in 32 or 64-bit mode, which makes the test for reverse moves +// a little tricky. We've already filtered out the (sou==dest) and (len==0) +// special cases. +// +// When entered: +// r4 = destination (32 or 64-bit ptr) +// r5 = length (always 32 bits) +// r6 = source (32 or 64-bit ptr) +// r12 = (dest - source), reverse move required if (dest-source)=length, in mode-independent way + li r0,0 // get a 0 + lis r10,hi16(0x80000000)// get 0x80000000 + addze. 
r0,r0 // set cr0 on carry bit (beq if reverse move required) + neg r9,r4 // start to get alignment for destination + sraw r8,r10,r11 // get mask based on operand length, to limit alignment + bt-- noncache,c64uncached// skip if uncached + beq-- c64rdouble // handle cached reverse moves + + +// Forward, cached or doubleword aligned uncached. This is the common case. +// NOTE: we never do an unaligned access if the source and destination are "relatively" +// doubleword aligned. We depend on this in the uncached case. +// r4 = destination +// r5 = length (>0) +// r6 = source +// r8 = inverse of largest mask smaller than operand length +// r9 = neg(dest), used to compute alignment +// cr5 = noncache flag + +c64double: + rlwinm r7,r9,0,0x7F // get #bytes to 128-byte align destination + andc r7,r7,r8 // limit by operand length + andi. r8,r7,7 // r8 <- #bytes to doubleword align + srwi r9,r7,3 // r9 <- #doublewords to 128-byte align + sub r5,r5,r7 // adjust length remaining + cmpwi cr1,r9,0 // any doublewords to move to cache align? + srwi r10,r5,7 // r10 <- 128-byte chunks to xfer after aligning dest + cmpwi cr7,r10,0 // set cr7 on chunk count + beq c64double2 // dest already doubleword aligned + mtctr r8 + b c64double1 + + .align 5 // align inner loops +c64double1: // copy bytes until dest is doubleword aligned + lbz r0,0(r6) + addi r6,r6,1 + stb r0,0(r4) + addi r4,r4,1 + bdnz c64double1 + +c64double2: // r9/cr1=doublewords, r10/cr7=128-byte chunks + beq cr1,c64double4 // no doublewords to xfer in order to cache align + mtctr r9 + b c64double3 + + .align 5 // align inner loops +c64double3: // copy doublewords until dest is 128-byte aligned + ld r7,0(r6) + addi r6,r6,8 + std r7,0(r4) + addi r4,r4,8 + bdnz c64double3 + +// Here to xfer 128-byte chunks, if any. Since we only have 8 GPRs for +// data (64 bytes), we load/store each twice per 128-byte chunk. 
+ +c64double4: // r10/cr7=128-byte chunks + rlwinm r0,r5,29,28,31 // r0 <- count of leftover doublewords, after moving chunks + cmpwi cr1,r0,0 // set cr1 on leftover doublewords + beq cr7,c64double7 // no 128-byte chunks + + ; We must check for (source-dest)<128 in a mode-independent way. If within 128 bytes, + ; turn on "noncache" because we cannot use dcbz128 even if operands are cacheable. + + sub r8,r6,r4 // r8 <- (source - dest) + rldicr. r0,r8,0,63-7 // zero low 7 bits and check for 0, mode independent + cror noncache,cr0_eq,noncache // turn on "noncache" flag if (source-dest)<128 + mtctr r10 + b c64InnerLoop + + .align 5 // align inner loop +c64InnerLoop: // loop copying 128-byte cache lines to 128-aligned destination + ld r0,0(r6) // start pipe: load 1st half-line + ld r2,8(r6) + ld r7,16(r6) + ld r8,24(r6) + ld r9,32(r6) + ld r10,40(r6) + ld r11,48(r6) + ld r12,56(r6) + bt noncache,c64InnerLoop1 // skip if uncached or overlap + dcbz128 0,r4 // avoid prefetch of next cache line +c64InnerLoop1: + + std r0,0(r4) + std r2,8(r4) + std r7,16(r4) + std r8,24(r4) + std r9,32(r4) + std r10,40(r4) + std r11,48(r4) + std r12,56(r4) + + ld r0,64(r6) // load 2nd half of chunk + ld r2,72(r6) + ld r7,80(r6) + ld r8,88(r6) + ld r9,96(r6) + ld r10,104(r6) + ld r11,112(r6) + ld r12,120(r6) + addi r6,r6,128 + + std r0,64(r4) + std r2,72(r4) + std r7,80(r4) + std r8,88(r4) + std r9,96(r4) + std r10,104(r4) + std r11,112(r4) + std r12,120(r4) + addi r4,r4,128 // advance to next dest chunk + + bdnz c64InnerLoop // loop if more chunks + + +c64double7: // r5 <- leftover bytes, cr1 set on doubleword count + rlwinm r0,r5,29,28,31 // r0 <- count of leftover doublewords (0-15) + andi. 
r5,r5,7 // r5/cr0 <- count of leftover bytes (0-7) + beq cr1,c64byte // no leftover doublewords + mtctr r0 + b c64double8 + + .align 5 // align inner loop +c64double8: // loop copying leftover doublewords + ld r0,0(r6) + addi r6,r6,8 + std r0,0(r4) + addi r4,r4,8 + bdnz c64double8 + + +// Forward byte loop. + +c64byte: // r5/cr0 <- byte count (can be big if unaligned uncached) + beqlr // done if no leftover bytes + mtctr r5 + b c64byte1 + + .align 5 // align inner loop +c64byte1: + lbz r0,0(r6) + addi r6,r6,1 + stb r0,0(r4) + addi r4,r4,1 + bdnz c64byte1 + + blr + + +// Uncached copies. We must avoid unaligned accesses, since they always take alignment +// exceptions on uncached memory on 64-bit processors. This may mean we copy long operands +// a byte at a time, but that is still much faster than alignment exceptions. +// r4 = destination +// r5 = length (>0) +// r6 = source +// r8 = inverse of largest mask smaller than operand length +// r9 = neg(dest), used to compute alignment +// r12 = (dest-source), used to test relative alignment +// cr0 = beq if reverse move required +// cr5 = noncache flag + +c64uncached: + rlwinm r10,r12,0,29,31 // relatively doubleword aligned? + rlwinm r11,r12,0,30,31 // relatively word aligned? + cmpwi cr7,r10,0 // set cr7 beq if doubleword aligned + cmpwi cr1,r11,0 // set cr1 beq if word aligned + beq-- c64reverseUncached + + beq cr7,c64double // doubleword aligned + beq cr1,forward32bit // word aligned, use G3/G4 code + cmpwi r5,0 // set cr0 on byte count + b c64byte // unaligned operands + +c64reverseUncached: + beq cr7,c64rdouble // doubleword aligned so can use LD/STD + beq cr1,reverse32bit // word aligned, use G3/G4 code + add r6,r6,r5 // point to (end+1) of source and dest + add r4,r4,r5 + cmpwi r5,0 // set cr0 on length + b c64rbyte // copy a byte at a time + + + +// Reverse doubleword copies. This is used for all cached copies, and doubleword +// aligned uncached copies. 
+// r4 = destination +// r5 = length (>0) +// r6 = source +// r8 = inverse of largest mask of low-order 1s smaller than operand length +// cr5 = noncache flag + +c64rdouble: + add r6,r6,r5 // point to (end+1) of source and dest + add r4,r4,r5 + rlwinm r7,r4,0,29,31 // r7 <- #bytes to doubleword align dest + andc. r7,r7,r8 // limit by operand length + sub r5,r5,r7 // adjust length + srwi r8,r5,6 // r8 <- 64-byte chunks to xfer + cmpwi cr1,r8,0 // any chunks? + beq c64rd2 // no alignment bytes to move (dest already doubleword aligned, or limited to 0 by length mask) + mtctr r7 + +c64rd1: // copy bytes until dest is doubleword aligned + lbzu r0,-1(r6) + stbu r0,-1(r4) + bdnz c64rd1 + +c64rd2: // r8/cr1 <- count of 64-byte chunks + rlwinm r0,r5,29,29,31 // r0 <- count of leftover doublewords (0-7) + andi. r5,r5,7 // r5/cr0 <- count of leftover bytes + cmpwi cr7,r0,0 // leftover doublewords? + beq cr1,c64rd4 // no chunks to xfer + mtctr r8 + b c64rd3 + + .align 5 // align inner loop +c64rd3: // loop copying 64-byte chunks + ld r7,-8(r6) + ld r8,-16(r6) + ld r9,-24(r6) + ld r10,-32(r6) + ld r11,-40(r6) + ld r12,-48(r6) + std r7,-8(r4) + std r8,-16(r4) + ld r7,-56(r6) + ldu r8,-64(r6) + std r9,-24(r4) + std r10,-32(r4) + std r11,-40(r4) + std r12,-48(r4) + std r7,-56(r4) + stdu r8,-64(r4) + bdnz c64rd3 + +c64rd4: // r0/cr7 = leftover doublewords r5/cr0 = leftover bytes + beq cr7,c64rbyte // no leftover doublewords + mtctr r0 + +c64rd5: // loop copying leftover doublewords + ldu r0,-8(r6) + stdu r0,-8(r4) + bdnz c64rd5 + + +// Reverse byte loop. + +c64rbyte: // r5/cr0 <- byte count (can be big if unaligned uncached) + beqlr // done if no leftover bytes + mtctr r5 + +c64rbyte1: + lbzu r0,-1(r6) + stbu r0,-1(r4) + bdnz c64rbyte1 + + blr +