X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/cf03f5cdc65293b4cb5eba3ed23fed26dad903c9..de355530ae67247cbd0da700edb3a2a1dae884c2:/osfmk/ppc/bcopy.s diff --git a/osfmk/ppc/bcopy.s b/osfmk/ppc/bcopy.s index 389fe4b2f..1a18bf37a 100644 --- a/osfmk/ppc/bcopy.s +++ b/osfmk/ppc/bcopy.s @@ -1,24 +1,21 @@ /* - * Copyright (c) 2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * - * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved. + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this - * file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. * * @APPLE_LICENSE_HEADER_END@ */ @@ -30,22 +27,13 @@ ; #include #include -#include ; Use CR5_lt to indicate non-cached #define noncache 20 - ; Use CR5_gt to indicate that we need to turn data translation back on #define fixxlate 21 - -; Use CR5_eq to indicate that we need to invalidate bats (if 32-bit) or turn off -; 64-bit mode (if 64-bit) before returning to our caller. We overload the -; bit to reduce the number of conditional branches at bcopy exit. -#define restorex 22 - -; Use CR5_so to indicate that we need to restore real-mode cachability -; Only needed on 64-bit machines -#define flipcache 23 +; Use CR5_eq to indicate that we need to invalidate bats +#define killbats 22 ; ; bcopy_nc(from, to, nbytes) @@ -65,24 +53,19 @@ LEXT(bcopy_nc) ; ; void bcopy_physvir(from, to, nbytes) ; Attempt to copy physically addressed memory with translation on if conditions are met. -; Otherwise do a normal bcopy_phys. This routine is used because some 32-bit processors -; are very slow doing real-mode (translation off) copies, so we set up temporary BATs -; for the passed phys addrs and do the copy with translation on. +; Otherwise do a normal bcopy_phys. ; ; Rules are: neither source nor destination can cross a page. +; No accesses above the 2GB line (I/O or ROM). ; -; Interrupts must be disabled throughout the copy when this is called. +; Interrupts must be disabled throughout the copy when this is called + ; To do this, we build a ; 128 DBAT for both the source and sink. If both are the same, only one is ; loaded. We do not touch the IBATs, so there is no issue if either physical page ; address is the same as the virtual address of the instructions we are executing. ; -; At the end, we invalidate the used DBATs. -; -; Note that the address parameters are long longs. We will transform these to 64-bit -; values. Note that on 32-bit architectures that this will ignore the high half of the -; passed in value. This should be ok since we can not have any bigger than 32 bit addresses -; there anyhow. +; At the end, we invalidate the used DBATs and reenable interrupts. ; ; Note, this one will not work in user state ; @@ -92,32 +75,22 @@ LEXT(bcopy_nc) LEXT(bcopy_physvir) - crclr flipcache ; (HACK) No cache flip needed - mfsprg r8,2 ; get processor feature flags - rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg - addic. r0,r7,-1 ; Get length - 1 - rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits + addic. r0,r5,-1 ; Get length - 1 add r11,r3,r0 ; Point to last byte of sink - rlwinm r4,r5,0,1,0 ; Duplicate high half of long long paddr into top of reg - mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test - rlwimi r4,r6,0,0,31 ; Combine bottom of long long to full 64-bits - mr r5,r7 ; Get the length into the right register - cmplw cr1,r3,r4 ; Does source == sink? - bt++ pf64Bitb,bcopy_phys1 ; if 64-bit processor, use standard routine (no BATs) + cmplw cr1,r3,r4 ; Does source == sink? add r12,r4,r0 ; Point to last byte of source bltlr- ; Bail if length is 0 or way too big xor r7,r11,r3 ; See if we went to next page xor r8,r12,r4 ; See if we went to next page or r0,r7,r8 ; Combine wrap -// li r9,((PTE_WIMG_CB_CACHED_COHERENT<<3)|2) ; Set default attributes - li r9,((2<<3)|2) ; Set default attributes + li r9,((PTE_WIMG_CB_CACHED_COHERENT<<3)|2) ; Set default attributes rlwinm. r0,r0,0,0,19 ; Did we overflow a page? li r7,2 ; Set validity flags li r8,2 ; Set validity flags - bne- bcopy_phys1 ; Overflowed page, do normal physical copy... + bne- EXT(bcopy_phys) ; Overflowed page, do normal physical copy... - crset restorex ; Remember to trash BATs on the way out + crset killbats ; Remember to trash BATs on the way out rlwimi r11,r9,0,15,31 ; Set sink lower DBAT value rlwimi r12,r9,0,15,31 ; Set source lower DBAT value rlwimi r7,r11,0,0,14 ; Set sink upper DBAT value @@ -136,123 +109,41 @@ LEXT(bcopy_physvir) bcpvsame: mr r6,r3 ; Set source crclr noncache ; Set cached - crclr fixxlate ; Set translation already ok - b copyit32 ; Go copy it... + b copyit ; Go copy it... + ; ; void bcopy_phys(from, to, nbytes) ; Turns off data translation before the copy. Note, this one will -; not work in user state. This routine is used on 32 and 64-bit -; machines. -; -; Note that the address parameters are long longs. We will transform these to 64-bit -; values. Note that on 32-bit architectures that this will ignore the high half of the -; passed in value. This should be ok since we can not have any bigger than 32 bit addresses -; there anyhow. -; -; Also note that you probably will not be happy if either the sink or source spans across the -; boundary between RAM and I/O space. Good chance of hanging the machine and this code -; will not check, so be careful. +; not work in user state ; .align 5 .globl EXT(bcopy_phys) LEXT(bcopy_phys) - crclr flipcache ; (HACK) No cache flip needed - rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg - mfsprg r8,2 ; get processor feature flags - rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits - rlwinm r4,r5,0,1,0 ; Duplicate high half of long long paddr into top of reg - mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test - rlwimi r4,r6,0,0,31 ; Combine bottom of long long to full 64-bits - mr r5,r7 ; Get the length into the right register - -bcopy_phys1: ; enter from bcopy_physvir with pf64Bit already in cr6 + mfmsr r9 ; Get the MSR + crclr noncache ; Set cached - bt++ pf64Bitb,bcopy_phys64 ; skip if 64-bit (only they take hint) - -; 32-bit CPUs - - sub. r0,r3,r4 ; to==from? - rlwinm r8,r9,0,MSR_DR_BIT,MSR_DR_BIT ; was translation on? - cmpwi cr1,r8,0 ; set cr1 beq if translation was off - oris r8,r8,hi16(MASK(MSR_VEC)) ; Get vector enable + rlwinm. r8,r9,0,MSR_DR_BIT,MSR_DR_BIT ; Is data translation on? + + cmplw cr1,r4,r3 ; Compare "to" and "from" cmplwi cr7,r5,0 ; Check if we have a 0 length - beqlr- ; bail if to==from - ori r8,r8,lo16(MASK(MSR_FP)) ; Get FP mr r6,r3 ; Set source - andc r9,r9,r8 ; Turn off translation if it is on (should be) and FP, VEC + beqlr- cr1 ; Bail if "to" and "from" are the same + xor r9,r9,r8 ; Turn off translation if it is on (should be) beqlr- cr7 ; Bail if length is 0 - crclr restorex ; Make sure we do not trash BATs on the way out + rlwinm r9,r9,0,MSR_FP_BIT+1,MSR_FP_BIT-1 ; Force floating point off + crclr killbats ; Make sure we do not trash BATs on the way out + rlwinm r9,r9,0,MSR_VEC_BIT+1,MSR_VEC_BIT-1 ; Force vectors off mtmsr r9 ; Set DR translation off isync ; Wait for it - crnot fixxlate,cr1_eq ; Remember to turn on translation if it was - b copyit32 ; Go copy it... - -; 64-bit: turn DR off and SF on, remember if we need to restore on way out. - -bcopy_phys64: ; r9 = MSR - - srdi r2,r3,31 ; (HACK) Get a 1 if source is in I/O memory - srdi. r0,r9,63-MSR_SF_BIT ; set cr0 beq on if SF was off when we were called - rlwinm r8,r9,MSR_DR_BIT+1,31,31 ; r8 <- DR bit right justified - cmpld cr1,r3,r4 ; to==from? - li r0,1 ; Note - we use this in a couple places below - lis r6,hi16(MASK(MSR_VEC)) ; Get vector enable - cmpwi cr7,r5,0 ; length==0 ? - ori r6,r6,lo16(MASK(MSR_FP)|MASK(MSR_DR)) ; Add in FP and DR - beqlr-- cr1 ; bail if to==from - srdi r10,r4,31 ; (HACK) Get a 1 if sink is in I/O memory - rldimi r9,r0,63,MSR_SF_BIT ; set SF on - beqlr-- cr7 ; bail if length==0 - andc r9,r9,r6 ; turn DR, VEC, FP off - cmpwi cr1,r8,0 ; was DR on? - crmove restorex,cr0_eq ; if SF was off, remember to turn back off before we return - mtmsrd r9 ; turn 64-bit addressing on, data translation off - cmpldi cr0,r2,1 ; (HACK) Is source in I/O memory? - isync ; wait for it to happen - mr r6,r3 ; Set source - cmpldi cr7,r10,1 ; (HACK) Is sink in I/O memory? - crnot fixxlate,cr1_eq ; if DR was on, remember to turn back on before we return - - cror flipcache,cr0_eq,cr7_eq ; (HACK) See if either source or sink is in I/O area - - rlwinm r10,r9,MSR_EE_BIT+1,31,31 ; (HACK GLORIOUS HACK) Isolate the EE bit - sldi r11,r0,31-MSR_EE_BIT ; (HACK GLORIOUS HACK)) Get a mask for the EE bit - sldi r0,r0,32+8 ; (HACK) Get the right bit to turn off caching - bf++ flipcache,copyit64 ; (HACK) No need to mess with caching... - -; -; HACK GLORIOUS HACK - when we force of caching, we need to also force off -; interruptions. We are out of CR bits, so we need to stash the entry EE -; somewheres. It is in the XER.... We NEED to change this!!!! -; - - mtxer r10 ; (HACK GLORIOUS HACK) Remember EE - andc r9,r9,r11 ; (HACK GLORIOUS HACK) Turn off EE bit - mfspr r2,hid4 ; (HACK) Get HID4 - crset noncache ; (HACK) Set non-cached - mtmsrd r9 ; (HACK GLORIOUS HACK) Force off EE - or r2,r2,r0 ; (HACK) Set bit to make real accesses cache-inhibited - sync ; (HACK) Sync up - li r0,1 - mtspr hid4,r2 ; (HACK) Make real accesses cache-inhibited - isync ; (HACK) Toss prefetches - - lis r12,0xE000 ; (HACK) Get the unlikeliest ESID possible - srdi r12,r12,1 ; (HACK) Make 0x7FFFFFFFF0000000 - slbie r12 ; (HACK) Make sure the ERAT is cleared - - sync ; (HACK) - isync ; (HACK) - - b copyit64 - + crnot fixxlate,cr0_eq ; Remember to turn on translation if it was + b copyit ; Go copy it... ; ; void bcopy(from, to, nbytes) @@ -265,19 +156,14 @@ LEXT(bcopy) crclr noncache ; Set cached -bcpswap: - crclr flipcache ; (HACK) No cache flip needed - mfsprg r8,2 ; get processor feature flags - sub. r0,r4,r3 ; test for to==from in mode-independent way - mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test - cmpwi cr1,r5,0 ; Check if we have a 0 length - crclr restorex ; Make sure we do not trash BATs on the way out +bcpswap: cmplw cr1,r4,r3 ; Compare "to" and "from" + mr. r5,r5 ; Check if we have a 0 length mr r6,r3 ; Set source + crclr killbats ; Make sure we do not trash BATs on the way out + beqlr- cr1 ; Bail if "to" and "from" are the same + beqlr- ; Bail if length is 0 crclr fixxlate ; Set translation already ok - beqlr- ; Bail if "to" and "from" are the same - beqlr- cr1 ; Bail if length is 0 - bt++ pf64Bitb,copyit64 ; handle 64-bit processor - b copyit32 ; Go copy it... + b copyit ; Go copy it... ; ; When we move the memory, forward overlays must be handled. We @@ -285,32 +171,19 @@ bcpswap: ; We need to preserve R3 because it needs to be returned for memcpy. ; We can be interrupted and lose control here. ; -; There is no stack, so in order to use vectors, we would -; need to take the vector exception. Any potential gains by using vectors +; There is no stack, so in order to used floating point, we would +; need to take the FP exception. Any potential gains by using FP ; would be more than eaten up by this. ; -; NOTE: this code is called in three "modes": -; - on 32-bit processors (32-byte cache line) -; - on 64-bit processors running in 32-bit mode (128-byte cache line) -; - on 64-bit processors running in 64-bit mode (128-byte cache line) -; -; ALSO NOTE: bcopy is called from copyin and copyout etc -; with the "thread_recover" ptr set. This means bcopy must not set up a -; stack frame or touch non-volatile registers, and also means that it -; cannot rely on turning off interrupts, because we expect to get DSIs -; and have execution aborted by a "longjmp" to the thread_recover -; routine. +; Later, we should used Altivec for large moves. ; .align 5 .globl EXT(memcpy) - ; NB: memcpy is only called in 32-bit mode, albeit on both 32- and 64-bit - ; processors... + LEXT(memcpy) - crclr flipcache ; (HACK) No cache flip needed - mfsprg r8,2 ; get processor feature flags + cmplw cr1,r3,r4 ; "to" and "from" the same? - mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test mr r6,r4 ; Set the "from" mr. r5,r5 ; Length zero? crclr noncache ; Set cached @@ -318,10 +191,9 @@ LEXT(memcpy) crclr fixxlate ; Set translation already ok beqlr- cr1 ; "to" and "from" are the same beqlr- ; Length is 0 - crclr restorex ; Make sure we do not trash BATs on the way out - bt++ pf64Bitb,copyit64 ; handle 64-bit processors + crclr killbats ; Make sure we do not trash BATs on the way out -copyit32: sub r12,r4,r6 ; Get potential overlap (negative if backward move) +copyit: sub r12,r4,r6 ; Get potential overlap (negative if backward move) lis r8,0x7FFF ; Start up a mask srawi r11,r12,31 ; Propagate the sign bit dcbt br0,r6 ; Touch in the first source line @@ -334,7 +206,7 @@ copyit32: sub r12,r4,r6 ; Get potential overlap (negative if backward move) cmplwi cr7,r9,32 ; See if at least a line between source and sink dcbtst br0,r4 ; Touch in the first sink line cmplwi cr1,r5,32 ; Are we moving more than a line? - cror noncache,noncache,cr7_lt ; Set to not DCBZ output line if not enough space + cror noncache,noncache,28 ; Set to not DCBZ output line if not enough space blt- fwdovrlap ; This is a forward overlapping area, handle it... ; @@ -350,7 +222,6 @@ copyit32: sub r12,r4,r6 ; Get potential overlap (negative if backward move) ; We can not do this if noncache is set because we will take an ; alignment exception. -G4word: ; enter from 64-bit case with word aligned uncached operands neg r0,r4 ; Get the number of bytes to move to align to a line boundary rlwinm. r0,r0,0,27,31 ; Clean it up and test it and r0,r0,r8 ; limit to the maximum front end move @@ -487,45 +358,17 @@ nohalf: bf 31,bcpydone ; Leave cuz we are all done... lbz r7,0(r6) ; Get the byte stb r7,0(r4) ; Save the single -bcpydone: +bcpydone: bt- killbats,bcclrbat ; Jump if we need to clear bats... + bflr fixxlate ; Leave now if we do not need to fix translation... mfmsr r9 ; Get the MSR - bf++ flipcache,bcpydone0 ; (HACK) No need to mess with caching... - - li r0,1 ; (HACK) Get a 1 - mfxer r10 ; (HACK GLORIOUS HACK) Get the entry EE - sldi r0,r0,32+8 ; (HACK) Get the right bit to turn off caching - mfspr r2,hid4 ; (HACK) Get HID4 - rlwinm r10,r10,31-MSR_EE_BIT,MSR_EE_BIT,MSR_EE_BIT ; (HACK GLORIOUS HACK) Set the EE bit - andc r2,r2,r0 ; (HACK) Clear bit to make real accesses cache-inhibited - or r9,r9,r10 ; (HACK GLORIOUS HACK) Set the EE in MSR - sync ; (HACK) Sync up - mtspr hid4,r2 ; (HACK) Make real accesses not cache-inhibited - isync ; (HACK) Toss prefetches - - lis r12,0xE000 ; (HACK) Get the unlikeliest ESID possible - srdi r12,r12,1 ; (HACK) Make 0x7FFFFFFFF0000000 - slbie r12 ; (HACK) Make sure the ERAT is cleared - - mtmsr r9 ; (HACK GLORIOUS HACK) Set EE properly - -bcpydone0: - lis r0,hi16(MASK(MSR_VEC)) ; Get the vector bit - ori r0,r0,lo16(MASK(MSR_FP)) ; Get the float bit - bf++ fixxlate,bcpydone1 ; skip if we do not need to fix translation... ori r9,r9,lo16(MASK(MSR_DR)) ; Turn data translation on - andc r9,r9,r0 ; Make sure that FP and VEC are off + rlwinm r9,r9,0,MSR_FP_BIT+1,MSR_FP_BIT-1 ; Force floating point off + rlwinm r9,r9,0,MSR_VEC_BIT+1,MSR_VEC_BIT-1 ; Force vectors off mtmsr r9 ; Just do it isync ; Hang in there - -bcpydone1: - bflr++ restorex ; done if we do not have to fix up addressing - mfsprg r8,2 ; get the feature flags again - mtcrf 0x02,r8 ; put pf64Bit where we can test it - bt++ pf64Bitb,bcpydone2 ; skip if 64-bit processor - - ; 32-bit processor, so clear out the BATs we set up for bcopy_physvir - - li r0,0 ; Get set to invalidate upper half + blr ; Leave cuz we are all done... + +bcclrbat: li r0,0 ; Get set to invalidate upper half sync ; Make sure all is well mtdbatu 0,r0 ; Clear sink upper DBAT mtdbatu 1,r0 ; Clear source upper DBAT @@ -533,16 +376,6 @@ bcpydone1: isync blr - ; 64-bit processor, so turn off 64-bit mode we turned on to do bcopy_phys - -bcpydone2: - mfmsr r9 ; get MSR again - andc r9,r9,r0 ; Make sure that FP and VEC are off - rldicl r9,r9,0,MSR_SF_BIT+1 ; clear SF - mtmsrd r9 - isync - blr - ; ; 0123456789ABCDEF0123456789ABCDEF @@ -563,8 +396,7 @@ bcpydone2: ; and on in order. That means that when we are at the second to last DW we ; have to wait until the whole line is in cache before we can proceed. ; - -G4reverseWord: ; here from 64-bit code with word aligned uncached operands + fwdovrlap: add r4,r5,r4 ; Point past the last sink byte add r6,r5,r6 ; Point past the last source byte and r0,r4,r8 ; Apply movement limit @@ -711,306 +543,3 @@ bnohalf: bflr 31 ; Leave cuz we are all done... stb r7,-1(r4) ; Save the single b bcpydone ; Go exit cuz we are all done... - - -// Here on 64-bit processors, which have a 128-byte cache line. This can be -// called either in 32 or 64-bit mode, which makes the test for reverse moves -// a little tricky. We've already filtered out the (sou==dest) and (len==0) -// special cases. -// -// When entered: -// r4 = destination (32 or 64-bit ptr) -// r5 = length (always 32 bits) -// r6 = source (32 or 64-bit ptr) -// cr5 = noncache, fixxlate, flipcache, and restorex flags set - - .align 5 -copyit64: - lis r2,0x4000 // r2 = 0x00000000 40000000 - neg r12,r4 // start to compute #bytes to align dest - bt-- noncache,noncache1 // (HACK) Do not even try anything cached... - dcbt 0,r6 // touch in 1st block of source -noncache1: - add. r2,r2,r2 // if 0x00000000 80000000 < 0, we are in 32-bit mode - cntlzw r9,r5 // get highest power-of-2 in length - rlwinm r7,r12,0,25,31 // r7 <- bytes to 128-byte align dest - bt-- noncache,noncache2 // (HACK) Do not even try anything cached... - dcbtst 0,r4 // touch in 1st destination cache block -noncache2: - sraw r2,r2,r9 // get mask with 1s for leading 0s in length, plus 1 more 1-bit - bge copyit64a // skip if we are running in 64-bit mode - rlwinm r4,r4,0,0,31 // running in 32-bit mode, so truncate ptrs and lengths to 32 bits - rlwinm r5,r5,0,0,31 - rlwinm r6,r6,0,0,31 -copyit64a: // now we can use 64-bit compares even if running in 32-bit mode - sub r8,r4,r6 // get (dest-source) - andc r7,r7,r2 // limit bytes to align by operand length - cmpld cr1,r8,r5 // if (dest-source)