/*
 * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
;
;           Copy bytes of data around.  Handles overlapped data.
;
;           Change this to use Altivec later on, and maybe floating point.
;
;
#include
#include
#include

;       Use CR5_lt to indicate non-cached
#define noncache 20

;       Use CR5_gt to indicate that we need to turn data translation back on
#define fixxlate 21

;       Use CR5_eq to indicate that we need to invalidate bats (if 32-bit) or turn off
;       64-bit mode (if 64-bit) before returning to our caller.  We overload the
;       bit to reduce the number of conditional branches at bcopy exit.
#define restorex 22

;       Use CR5_so to indicate that we need to restore real-mode cachability
;       Only needed on 64-bit machines
#define flipcache 23

;
; bcopy_nc(from, to, nbytes)
;
; bcopy_nc operates on non-cached memory so we can not use any kind
; of cache instructions.
;
            .align  5
            .globl  EXT(bcopy_nc)

LEXT(bcopy_nc)
            crset   noncache                    ; Set non-cached
            b       bcpswap

;
; void bcopy_physvir(from, to, nbytes)
;
; Attempt to copy physically addressed memory with translation on if conditions are met.
; Otherwise do a normal bcopy_phys.  This routine is used because some 32-bit processors
; are very slow doing real-mode (translation off) copies, so we set up temporary BATs
; for the passed phys addrs and do the copy with translation on.
;
; Rules are: neither source nor destination can cross a page.
;
; Interrupts must be disabled throughout the copy when this is called.
; To do this, we build a 128KB DBAT for both the source and sink.  If both are
; the same, only one is loaded.  We do not touch the IBATs, so there is no issue
; if either physical page address is the same as the virtual address of the
; instructions we are executing.
;
; At the end, we invalidate the used DBATs.
;
; Note that the address parameters are long longs.  We will transform these to 64-bit
; values.  Note that on 32-bit architectures this will ignore the high half of the
; passed-in value.  This should be OK since we can not have addresses bigger than
; 32 bits there anyhow.
;
; Note, this one will not work in user state
;
            .align  5
            .globl  EXT(bcopy_physvir)

LEXT(bcopy_physvir)
            crclr   flipcache                   ; (HACK) No cache flip needed
            mfsprg  r8,2                        ; get processor feature flags
            rlwinm  r3,r3,0,1,0                 ; Duplicate high half of long long paddr into top of reg
            addic.  r0,r7,-1                    ; Get length - 1
            rlwimi  r3,r4,0,0,31                ; Combine bottom of long long to full 64-bits
            add     r11,r3,r0                   ; Point to last byte of sink
            rlwinm  r4,r5,0,1,0                 ; Duplicate high half of long long paddr into top of reg
            mtcrf   0x02,r8                     ; move pf64Bit to cr6 so we can test
            rlwimi  r4,r6,0,0,31                ; Combine bottom of long long to full 64-bits
            mr      r5,r7                       ; Get the length into the right register
            cmplw   cr1,r3,r4                   ; Does source == sink?
            bt++    pf64Bitb,bcopy_phys1        ; if 64-bit processor, use standard routine (no BATs)
            add     r12,r4,r0                   ; Point to last byte of source
            bltlr-                              ; Bail if length is 0 or way too big
            xor     r7,r11,r3                   ; See if we went to next page
            xor     r8,r12,r4                   ; See if we went to next page
            or      r0,r7,r8                    ; Combine wrap

//          li      r9,((PTE_WIMG_CB_CACHED_COHERENT<<3)|2)    ; Set default attributes
            li      r9,((2<<3)|2)               ; Set default attributes
            rlwinm. r0,r0,0,0,19                ; Did we overflow a page?
            li      r7,2                        ; Set validity flags
            li      r8,2                        ; Set validity flags
            bne-    bcopy_phys1                 ; Overflowed page, do normal physical copy...

            crset   restorex                    ; Remember to trash BATs on the way out
            rlwimi  r11,r9,0,15,31              ; Set sink lower DBAT value
            rlwimi  r12,r9,0,15,31              ; Set source lower DBAT value
            rlwimi  r7,r11,0,0,14               ; Set sink upper DBAT value
            rlwimi  r8,r12,0,0,14               ; Set source upper DBAT value
            cmplw   cr1,r11,r12                 ; See if sink and source are same block

            sync

            mtdbatl 0,r11                       ; Set sink lower DBAT
            mtdbatu 0,r7                        ; Set sink upper DBAT

            beq-    cr1,bcpvsame                ; Source and sink are in same block

            mtdbatl 1,r12                       ; Set source lower DBAT
            mtdbatu 1,r8                        ; Set source upper DBAT

bcpvsame:   sync                                ; wait for BAT to stabilize
            isync

            mr      r6,r3                       ; Set source
            crclr   noncache                    ; Set cached
            crclr   fixxlate                    ; Set translation already ok

            b       copyit32                    ; Go copy it...
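;
; Illustrative only -- a rough C sketch of the page-crossing test above.  The
; helper names are invented; the real test is the xor/rlwinm. sequence:
;
;       uint32_t last_snk = sink + len - 1, last_src = src + len - 1;
;       if (((sink ^ last_snk) | (src ^ last_src)) & ~0xFFF)   // crossed a 4KB page?
;           return bcopy_phys(...);                            // BATs will not cover it
;       // else map sink (and source, if different) with a 128KB DBAT
;       // and run the normal cached copy with translation left on
;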
;
; void bcopy_phys(from, to, nbytes)
;
; Turns off data translation before the copy.  Note, this one will
; not work in user state.  This routine is used on 32 and 64-bit
; machines.
;
; Note that the address parameters are long longs.  We will transform these to 64-bit
; values.  Note that on 32-bit architectures this will ignore the high half of the
; passed-in value.  This should be OK since we can not have addresses bigger than
; 32 bits there anyhow.
;
; Also note that you probably will not be happy if either the sink or source spans across the
; boundary between RAM and I/O space.  Good chance of hanging the machine and this code
; will not check, so be careful.
;
            .align  5
            .globl  EXT(bcopy_phys)

LEXT(bcopy_phys)
            crclr   flipcache                   ; (HACK) No cache flip needed
            rlwinm  r3,r3,0,1,0                 ; Duplicate high half of long long paddr into top of reg
            mfsprg  r8,2                        ; get processor feature flags
            rlwimi  r3,r4,0,0,31                ; Combine bottom of long long to full 64-bits
            rlwinm  r4,r5,0,1,0                 ; Duplicate high half of long long paddr into top of reg
            mtcrf   0x02,r8                     ; move pf64Bit to cr6 so we can test
            rlwimi  r4,r6,0,0,31                ; Combine bottom of long long to full 64-bits
            mr      r5,r7                       ; Get the length into the right register

bcopy_phys1:                                    ; enter from bcopy_physvir with pf64Bit already in cr6
            mfmsr   r9                          ; Get the MSR
            crclr   noncache                    ; Set cached
            bt++    pf64Bitb,bcopy_phys64       ; skip if 64-bit (only they take hint)

; 32-bit CPUs

            sub.    r0,r3,r4                    ; to==from?
            rlwinm  r8,r9,0,MSR_DR_BIT,MSR_DR_BIT   ; was translation on?
            cmpwi   cr1,r8,0                    ; set cr1 beq if translation was off
            oris    r8,r8,hi16(MASK(MSR_VEC))   ; Get vector enable
            cmplwi  cr7,r5,0                    ; Check if we have a 0 length
            beqlr-                              ; bail if to==from
            ori     r8,r8,lo16(MASK(MSR_FP))    ; Get FP
            mr      r6,r3                       ; Set source
            andc    r9,r9,r8                    ; Turn off translation if it is on (should be) and FP, VEC
            beqlr-  cr7                         ; Bail if length is 0

            crclr   restorex                    ; Make sure we do not trash BATs on the way out
            mtmsr   r9                          ; Set DR translation off
            isync                               ; Wait for it

            crnot   fixxlate,cr1_eq             ; Remember to turn on translation if it was on
            b       copyit32                    ; Go copy it...
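;
; Illustrative only -- what the 32-bit path above amounts to, sketched in C
; (mfmsr/mtmsr and MASK stand for the real mechanisms; "copy" is copyit32):
;
;       uint32_t msr = mfmsr();
;       bool dr_was_on = (msr & MASK(MSR_DR)) != 0;                 // remembered in fixxlate
;       mtmsr(msr & ~(MASK(MSR_DR) | MASK(MSR_FP) | MASK(MSR_VEC)));    // translation off
;       copy(src, sink, len);                                       // physical addresses now
;       // bcpydone turns MSR_DR back on iff dr_was_on
;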
; 64-bit: turn DR off and SF on, remember if we need to restore on way out.

bcopy_phys64:                                   ; r9 = MSR
            srdi    r2,r3,31                    ; (HACK) Get a 1 if source is in I/O memory
            srdi.   r0,r9,63-MSR_SF_BIT         ; set cr0 beq on if SF was off when we were called
            rlwinm  r8,r9,MSR_DR_BIT+1,31,31    ; r8 <- DR bit right justified
            cmpld   cr1,r3,r4                   ; to==from?
            li      r0,1                        ; Note - we use this in a couple places below
            lis     r6,hi16(MASK(MSR_VEC))      ; Get vector enable
            cmpwi   cr7,r5,0                    ; length==0 ?
            ori     r6,r6,lo16(MASK(MSR_FP)|MASK(MSR_DR))   ; Add in FP and DR
            beqlr-- cr1                         ; bail if to==from
            srdi    r10,r4,31                   ; (HACK) Get a 1 if sink is in I/O memory
            rldimi  r9,r0,63,MSR_SF_BIT         ; set SF on
            beqlr-- cr7                         ; bail if length==0
            andc    r9,r9,r6                    ; turn DR, VEC, FP off
            cmpwi   cr1,r8,0                    ; was DR on?
            crmove  restorex,cr0_eq             ; if SF was off, remember to turn back off before we return
            mtmsrd  r9                          ; turn 64-bit addressing on, data translation off
            cmpldi  cr0,r2,1                    ; (HACK) Is source in I/O memory?
            isync                               ; wait for it to happen
            mr      r6,r3                       ; Set source
            cmpldi  cr7,r10,1                   ; (HACK) Is sink in I/O memory?
            crnot   fixxlate,cr1_eq             ; if DR was on, remember to turn back on before we return
            cror    flipcache,cr0_eq,cr7_eq     ; (HACK) See if either source or sink is in I/O area

            rlwinm  r10,r9,MSR_EE_BIT+1,31,31   ; (HACK GLORIOUS HACK) Isolate the EE bit
            sldi    r11,r0,31-MSR_EE_BIT        ; (HACK GLORIOUS HACK) Get a mask for the EE bit
            sldi    r0,r0,32+8                  ; (HACK) Get the right bit to turn off caching
            bf++    flipcache,copyit64          ; (HACK) No need to mess with caching...

;
; HACK GLORIOUS HACK - when we force off caching, we need to also force off
; interruptions.  We are out of CR bits, so we need to stash the entry EE
; somewhere.  It is in the XER....  We NEED to change this!!!!
;
            mtxer   r10                         ; (HACK GLORIOUS HACK) Remember EE
            andc    r9,r9,r11                   ; (HACK GLORIOUS HACK) Turn off EE bit
            mfspr   r2,hid4                     ; (HACK) Get HID4
            crset   noncache                    ; (HACK) Set non-cached
            mtmsrd  r9                          ; (HACK GLORIOUS HACK) Force off EE
            or      r2,r2,r0                    ; (HACK) Set bit to make real accesses cache-inhibited
            sync                                ; (HACK) Sync up
            li      r0,1

            mtspr   hid4,r2                     ; (HACK) Make real accesses cache-inhibited
            isync                               ; (HACK) Toss prefetches

            lis     r12,0xE000                  ; (HACK) Get the unlikeliest ESID possible
            srdi    r12,r12,1                   ; (HACK) Make 0x7FFFFFFFF0000000
            slbie   r12                         ; (HACK) Make sure the ERAT is cleared
            sync                                ; (HACK)
            isync                               ; (HACK)

            b       copyit64
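;
; Illustrative only -- the cache-inhibit flip above, sketched in C.  The bit
; position (1 << 40) just mirrors the "sldi r0,r0,32+8" above; treat it as an
; assumption about these processors, not a documented constant:
;
;       mtxer(entry_ee);                        // stash entry EE state (no CR bits left)
;       mtmsrd(msr & ~MASK(MSR_EE));            // no interrupts while caching is off
;       mtspr(HID4, mfspr(HID4) | (1ULL << 40));    // real accesses cache-inhibited
;       slbie(0x7FFFFFFFF0000000ULL);           // unlikely ESID, just to clear the ERAT
;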
;
; void bcopy(from, to, nbytes)
;
            .align  5
            .globl  EXT(bcopy)

LEXT(bcopy)
            crclr   noncache                    ; Set cached

bcpswap:    crclr   flipcache                   ; (HACK) No cache flip needed
            mfsprg  r8,2                        ; get processor feature flags
            sub.    r0,r4,r3                    ; test for to==from in mode-independent way
            mtcrf   0x02,r8                     ; move pf64Bit to cr6 so we can test
            cmpwi   cr1,r5,0                    ; Check if we have a 0 length
            crclr   restorex                    ; Make sure we do not trash BATs on the way out
            mr      r6,r3                       ; Set source
            crclr   fixxlate                    ; Set translation already ok
            beqlr-                              ; Bail if "to" and "from" are the same
            beqlr-  cr1                         ; Bail if length is 0
            bt++    pf64Bitb,copyit64           ; handle 64-bit processor
            b       copyit32                    ; Go copy it...

;
; When we move the memory, forward overlaps must be handled.  We
; also can not use the cache instructions if we are from bcopy_nc.
; We need to preserve R3 because it needs to be returned for memcpy.
; We can be interrupted and lose control here.
;
; There is no stack, so in order to use vectors, we would
; need to take the vector exception.  Any potential gains by using vectors
; would be more than eaten up by this.
;
; NOTE: this code is called in three "modes":
;   - on 32-bit processors (32-byte cache line)
;   - on 64-bit processors running in 32-bit mode (128-byte cache line)
;   - on 64-bit processors running in 64-bit mode (128-byte cache line)
;
; ALSO NOTE: bcopy is called from copyin and copyout etc
; with the "thread_recover" ptr set.  This means bcopy must not set up a
; stack frame or touch non-volatile registers, and also means that it
; cannot rely on turning off interrupts, because we expect to get DSIs
; and have execution aborted by a "longjmp" to the thread_recover
; routine.
;
            .align  5
            .globl  EXT(memcpy)
; NB: memcpy is only called in 32-bit mode, albeit on both 32- and 64-bit
; processors...
LEXT(memcpy)
            crclr   flipcache                   ; (HACK) No cache flip needed
            mfsprg  r8,2                        ; get processor feature flags
            cmplw   cr1,r3,r4                   ; "to" and "from" the same?
            mtcrf   0x02,r8                     ; move pf64Bit to cr6 so we can test
            mr      r6,r4                       ; Set the "from"
            mr.     r5,r5                       ; Length zero?
            crclr   noncache                    ; Set cached
            mr      r4,r3                       ; Set the "to"
            crclr   fixxlate                    ; Set translation already ok
            beqlr-  cr1                         ; "to" and "from" are the same
            beqlr-                              ; Length is 0
            crclr   restorex                    ; Make sure we do not trash BATs on the way out
            bt++    pf64Bitb,copyit64           ; handle 64-bit processors

copyit32:   sub     r12,r4,r6                   ; Get potential overlap (negative if backward move)
            lis     r8,0x7FFF                   ; Start up a mask
            srawi   r11,r12,31                  ; Propagate the sign bit
            dcbt    br0,r6                      ; Touch in the first source line
            cntlzw  r7,r5                       ; Get the highest power of 2 factor of the length
            ori     r8,r8,0xFFFF                ; Make limit 0x7FFFFFFF
            xor     r9,r12,r11                  ; If sink - source was negative, invert bits
            srw     r8,r8,r7                    ; Get move length limitation
            sub     r9,r9,r11                   ; If sink - source was negative, add 1 and get absolute value
            cmplw   r12,r5                      ; See if we actually forward overlap
            cmplwi  cr7,r9,32                   ; See if at least a line between source and sink
            dcbtst  br0,r4                      ; Touch in the first sink line
            cmplwi  cr1,r5,32                   ; Are we moving more than a line?
            cror    noncache,noncache,cr7_lt    ; Set to not DCBZ output line if not enough space
            blt-    fwdovrlap                   ; This is a forward overlapping area, handle it...

;
;           R4 = sink
;           R5 = length
;           R6 = source
;

;
; Here we figure out how much we have to move to get the sink onto a
; cache boundary.  If we can, and there are still more than 32 bytes
; left to move, we can really speed things up by DCBZing the sink line.
; We can not do this if noncache is set because we will take an
; alignment exception.

G4word:                                         ; enter from 64-bit case with word aligned uncached operands
            neg     r0,r4                       ; Get the number of bytes to move to align to a line boundary
            rlwinm. r0,r0,0,27,31               ; Clean it up and test it
            and     r0,r0,r8                    ; limit to the maximum front end move
            mtcrf   3,r0                        ; Make branch mask for partial moves
            sub     r5,r5,r0                    ; Set the length left to move
            beq     alline                      ; Already on a line...

            bf      31,alhalf                   ; No single byte to do...
            lbz     r7,0(r6)                    ; Get the byte
            addi    r6,r6,1                     ; Point to the next
            stb     r7,0(r4)                    ; Save the single
            addi    r4,r4,1                     ; Bump sink

;           Sink is halfword aligned here

alhalf:     bf      30,alword                   ; No halfword to do...
            lhz     r7,0(r6)                    ; Get the halfword
            addi    r6,r6,2                     ; Point to the next
            sth     r7,0(r4)                    ; Save the halfword
            addi    r4,r4,2                     ; Bump sink

;           Sink is word aligned here

alword:     bf      29,aldouble                 ; No word to do...
            lwz     r7,0(r6)                    ; Get the word
            addi    r6,r6,4                     ; Point to the next
            stw     r7,0(r4)                    ; Save the word
            addi    r4,r4,4                     ; Bump sink

;           Sink is double aligned here

aldouble:   bf      28,alquad                   ; No double to do...
            lwz     r7,0(r6)                    ; Get the first word
            lwz     r8,4(r6)                    ; Get the second word
            addi    r6,r6,8                     ; Point to the next
            stw     r7,0(r4)                    ; Save the first word
            stw     r8,4(r4)                    ; Save the second word
            addi    r4,r4,8                     ; Bump sink

;           Sink is quadword aligned here

alquad:     bf      27,alline                   ; No quad to do...
            lwz     r7,0(r6)                    ; Get the first word
            lwz     r8,4(r6)                    ; Get the second word
            lwz     r9,8(r6)                    ; Get the third word
            stw     r7,0(r4)                    ; Save the first word
            lwz     r11,12(r6)                  ; Get the fourth word
            addi    r6,r6,16                    ; Point to the next
            stw     r8,4(r4)                    ; Save the second word
            stw     r9,8(r4)                    ; Save the third word
            stw     r11,12(r4)                  ; Save the fourth word
            addi    r4,r4,16                    ; Bump sink
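;
; Illustrative only -- the front end above peels 1, 2, 4, 8, then 16 bytes so
; the sink ends up line aligned.  Roughly, in C:
;
;       uint32_t n = (-(uintptr_t)sink) & 31;   // bytes to the next 32-byte boundary
;       n &= limit;                             // but never more than we are moving
;       if (n & 1)  move 1 byte;                // the "bf 31" case
;       if (n & 2)  move 2 bytes;               // "bf 30"
;       if (n & 4)  move 4 bytes;               // "bf 29"
;       if (n & 8)  move 8 bytes;               // "bf 28"
;       if (n & 16) move 16 bytes;              // "bf 27"
;       len -= n;
;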
;           Sink is line aligned here

alline:     rlwinm. r0,r5,27,5,31               ; Get the number of full lines to move
            mtcrf   3,r5                        ; Make branch mask for backend partial moves
            rlwinm  r11,r5,0,0,26               ; Get number of bytes we are going to move
            beq-    backend                     ; No full lines to move

            sub     r5,r5,r11                   ; Calculate the residual
            li      r10,96                      ; Stride for touch ahead

nxtline:    subic.  r0,r0,1                     ; Account for the line now

            bt-     noncache,skipz              ; Skip if we are not cached...
            dcbz    br0,r4                      ; Blow away the whole line because we are replacing it
            dcbt    r6,r10                      ; Touch ahead a bit

skipz:      lwz     r7,0(r6)                    ; Get the first word
            lwz     r8,4(r6)                    ; Get the second word
            lwz     r9,8(r6)                    ; Get the third word
            stw     r7,0(r4)                    ; Save the first word
            lwz     r11,12(r6)                  ; Get the fourth word
            stw     r8,4(r4)                    ; Save the second word
            lwz     r7,16(r6)                   ; Get the fifth word
            stw     r9,8(r4)                    ; Save the third word
            lwz     r8,20(r6)                   ; Get the sixth word
            stw     r11,12(r4)                  ; Save the fourth word
            lwz     r9,24(r6)                   ; Get the seventh word
            stw     r7,16(r4)                   ; Save the fifth word
            lwz     r11,28(r6)                  ; Get the eighth word
            addi    r6,r6,32                    ; Point to the next
            stw     r8,20(r4)                   ; Save the sixth word
            stw     r9,24(r4)                   ; Save the seventh word
            stw     r11,28(r4)                  ; Save the eighth word
            addi    r4,r4,32                    ; Bump sink
            bgt+    nxtline                     ; Do the next line, if any...

;           Move backend quadword

backend:    bf      27,noquad                   ; No quad to do...
            lwz     r7,0(r6)                    ; Get the first word
            lwz     r8,4(r6)                    ; Get the second word
            lwz     r9,8(r6)                    ; Get the third word
            lwz     r11,12(r6)                  ; Get the fourth word
            stw     r7,0(r4)                    ; Save the first word
            addi    r6,r6,16                    ; Point to the next
            stw     r8,4(r4)                    ; Save the second word
            stw     r9,8(r4)                    ; Save the third word
            stw     r11,12(r4)                  ; Save the fourth word
            addi    r4,r4,16                    ; Bump sink

;           Move backend double

noquad:     bf      28,nodouble                 ; No double to do...
            lwz     r7,0(r6)                    ; Get the first word
            lwz     r8,4(r6)                    ; Get the second word
            addi    r6,r6,8                     ; Point to the next
            stw     r7,0(r4)                    ; Save the first word
            stw     r8,4(r4)                    ; Save the second word
            addi    r4,r4,8                     ; Bump sink

;           Move backend word

nodouble:   bf      29,noword                   ; No word to do...
            lwz     r7,0(r6)                    ; Get the word
            addi    r6,r6,4                     ; Point to the next
            stw     r7,0(r4)                    ; Save the word
            addi    r4,r4,4                     ; Bump sink

;           Move backend halfword

noword:     bf      30,nohalf                   ; No halfword to do...
            lhz     r7,0(r6)                    ; Get the halfword
            addi    r6,r6,2                     ; Point to the next
            sth     r7,0(r4)                    ; Save the halfword
            addi    r4,r4,2                     ; Bump sink

;           Move backend byte

nohalf:     bf      31,bcpydone                 ; Leave cuz we are all done...
            lbz     r7,0(r6)                    ; Get the byte
            stb     r7,0(r4)                    ; Save the single
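;
; Illustrative only -- the shape of the line loop above, in C (dcbz/dcbt shown
; as intrinsics):
;
;       for (uint32_t lines = len >> 5; lines != 0; lines--) {
;           if (!noncache) {
;               dcbz(sink);             // claim the line without reading it
;               dcbt(source + 96);      // touch ahead three lines
;           }
;           copy 32 bytes as eight interleaved word loads/stores;
;           source += 32; sink += 32;
;       }
;       // then the backend moves the remaining 16/8/4/2/1 bytes
;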
bcpydone:   mfmsr   r9                          ; Get the MSR

            bf++    flipcache,bcpydone0         ; (HACK) No need to mess with caching...

            li      r0,1                        ; (HACK) Get a 1
            mfxer   r10                         ; (HACK GLORIOUS HACK) Get the entry EE
            sldi    r0,r0,32+8                  ; (HACK) Get the right bit to turn off caching
            mfspr   r2,hid4                     ; (HACK) Get HID4
            rlwinm  r10,r10,31-MSR_EE_BIT,MSR_EE_BIT,MSR_EE_BIT ; (HACK GLORIOUS HACK) Set the EE bit
            andc    r2,r2,r0                    ; (HACK) Clear bit to make real accesses cache-inhibited
            or      r9,r9,r10                   ; (HACK GLORIOUS HACK) Set the EE in MSR
            sync                                ; (HACK) Sync up

            mtspr   hid4,r2                     ; (HACK) Make real accesses not cache-inhibited
            isync                               ; (HACK) Toss prefetches

            lis     r12,0xE000                  ; (HACK) Get the unlikeliest ESID possible
            srdi    r12,r12,1                   ; (HACK) Make 0x7FFFFFFFF0000000
            slbie   r12                         ; (HACK) Make sure the ERAT is cleared

            mtmsr   r9                          ; (HACK GLORIOUS HACK) Set EE properly

bcpydone0:
            lis     r0,hi16(MASK(MSR_VEC))      ; Get the vector bit
            ori     r0,r0,lo16(MASK(MSR_FP))    ; Get the float bit
            bf++    fixxlate,bcpydone1          ; skip if we do not need to fix translation...
            ori     r9,r9,lo16(MASK(MSR_DR))    ; Turn data translation on
            andc    r9,r9,r0                    ; Make sure that FP and VEC are off
            mtmsr   r9                          ; Just do it
            isync                               ; Hang in there

bcpydone1:
            bflr++  restorex                    ; done if we do not have to fix up addressing
            mfsprg  r8,2                        ; get the feature flags again
            mtcrf   0x02,r8                     ; put pf64Bit where we can test it
            bt++    pf64Bitb,bcpydone2          ; skip if 64-bit processor

; 32-bit processor, so clear out the BATs we set up for bcopy_physvir

            li      r0,0                        ; Get set to invalidate upper half
            sync                                ; Make sure all is well
            mtdbatu 0,r0                        ; Clear sink upper DBAT
            mtdbatu 1,r0                        ; Clear source upper DBAT
            sync
            isync
            blr

; 64-bit processor, so turn off 64-bit mode we turned on to do bcopy_phys

bcpydone2:
            mfmsr   r9                          ; get MSR again
            andc    r9,r9,r0                    ; Make sure that FP and VEC are off
            rldicl  r9,r9,0,MSR_SF_BIT+1        ; clear SF
            mtmsrd  r9
            isync
            blr
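;
; Illustrative only -- the exit fixups above, summarized in C:
;
;       if (flipcache) { restore HID4; clear ERAT; restore EE from XER; }
;       if (fixxlate)  mtmsr(msr | MASK(MSR_DR));       // data translation back on
;       if (restorex) {
;           if (32-bit) { mtdbatu(0, 0); mtdbatu(1, 0); }   // undo bcopy_physvir
;           else        mtmsrd(msr & ~MASK(MSR_SF));        // undo bcopy_phys64
;       }
;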
;
;           0123456789ABCDEF0123456789ABCDEF
;           0123456789ABCDEF0123456789ABCDEF
;
;           F
;           DE
;           9ABC
;           12345678
;           123456789ABCDEF0
;           0
;

;
; Here is where we handle a forward overlapping move.  These will be slow
; because we can not kill the cache of the destination until after we have
; loaded/saved the source area.  Also, reading memory backwards is slower
; when the cache line needs to be loaded, because the critical doubleword
; is loaded first, i.e., the last one we want; the line then wraps back to
; the first doubleword and fills in order.  That means that when we are at
; the second-to-last DW we have to wait until the whole line is in cache
; before we can proceed.
;

G4reverseWord:                                  ; here from 64-bit code with word aligned uncached operands
fwdovrlap:  add     r4,r5,r4                    ; Point past the last sink byte
            add     r6,r5,r6                    ; Point past the last source byte
            and     r0,r4,r8                    ; Apply movement limit
            li      r12,-1                      ; Make sure we touch in the actual line
            mtcrf   3,r0                        ; Figure out the best way to move backwards
            dcbt    r12,r6                      ; Touch in the last line of source
            rlwinm. r0,r0,0,27,31               ; Calculate the length to adjust to cache boundary
            dcbtst  r12,r4                      ; Touch in the last line of the sink
            beq-    balline                     ; Already on cache line boundary

            sub     r5,r5,r0                    ; Precalculate move length left after alignment

            bf      31,balhalf                  ; No single byte to do...
            lbz     r7,-1(r6)                   ; Get the byte
            subi    r6,r6,1                     ; Point to the next
            stb     r7,-1(r4)                   ; Save the single
            subi    r4,r4,1                     ; Bump sink

;           Sink is halfword aligned here

balhalf:    bf      30,balword                  ; No halfword to do...
            lhz     r7,-2(r6)                   ; Get the halfword
            subi    r6,r6,2                     ; Point to the next
            sth     r7,-2(r4)                   ; Save the halfword
            subi    r4,r4,2                     ; Bump sink

;           Sink is word aligned here

balword:    bf      29,baldouble                ; No word to do...
            lwz     r7,-4(r6)                   ; Get the word
            subi    r6,r6,4                     ; Point to the next
            stw     r7,-4(r4)                   ; Save the word
            subi    r4,r4,4                     ; Bump sink

;           Sink is double aligned here

baldouble:  bf      28,balquad                  ; No double to do...
            lwz     r7,-8(r6)                   ; Get the first word
            lwz     r8,-4(r6)                   ; Get the second word
            subi    r6,r6,8                     ; Point to the next
            stw     r7,-8(r4)                   ; Save the first word
            stw     r8,-4(r4)                   ; Save the second word
            subi    r4,r4,8                     ; Bump sink

;           Sink is quadword aligned here

balquad:    bf      27,balline                  ; No quad to do...
            lwz     r7,-16(r6)                  ; Get the first word
            lwz     r8,-12(r6)                  ; Get the second word
            lwz     r9,-8(r6)                   ; Get the third word
            lwz     r11,-4(r6)                  ; Get the fourth word
            stw     r7,-16(r4)                  ; Save the first word
            subi    r6,r6,16                    ; Point to the next
            stw     r8,-12(r4)                  ; Save the second word
            stw     r9,-8(r4)                   ; Save the third word
            stw     r11,-4(r4)                  ; Save the fourth word
            subi    r4,r4,16                    ; Bump sink

;           Sink is line aligned here

balline:    rlwinm. r0,r5,27,5,31               ; Get the number of full lines to move
            mtcrf   3,r5                        ; Make branch mask for backend partial moves
            beq-    bbackend                    ; No full lines to move

;           Registers in use:     R0, R1, R3, R4, R5, R6
;           Registers not in use: R2, R7, R8, R9, R10, R11, R12 - Ok, we can make another free for 8 of them

bnxtline:   subic.  r0,r0,1                     ; Account for the line now

            lwz     r7,-32(r6)                  ; Get the first word
            lwz     r5,-28(r6)                  ; Get the second word
            lwz     r2,-24(r6)                  ; Get the third word
            lwz     r12,-20(r6)                 ; Get the fourth word
            lwz     r11,-16(r6)                 ; Get the fifth word
            lwz     r10,-12(r6)                 ; Get the sixth word
            lwz     r9,-8(r6)                   ; Get the seventh word
            lwz     r8,-4(r6)                   ; Get the eighth word
            subi    r6,r6,32                    ; Point to the next

            stw     r7,-32(r4)                  ; Save the first word

            ble-    bnotouch                    ; Last time, skip touch of source...
            dcbt    br0,r6                      ; Touch in next source line

bnotouch:   stw     r5,-28(r4)                  ; Save the second word
            stw     r2,-24(r4)                  ; Save the third word
            stw     r12,-20(r4)                 ; Save the fourth word
            stw     r11,-16(r4)                 ; Save the fifth word
            stw     r10,-12(r4)                 ; Save the sixth word
            stw     r9,-8(r4)                   ; Save the seventh word
            stw     r8,-4(r4)                   ; Save the eighth word
            subi    r4,r4,32                    ; Bump sink

            bgt+    bnxtline                    ; Do the next line, if any...

;
;           Note: We touched these lines in at the beginning
;

;           Move backend quadword

bbackend:   bf      27,bnoquad                  ; No quad to do...
            lwz     r7,-16(r6)                  ; Get the first word
            lwz     r8,-12(r6)                  ; Get the second word
            lwz     r9,-8(r6)                   ; Get the third word
            lwz     r11,-4(r6)                  ; Get the fourth word
            stw     r7,-16(r4)                  ; Save the first word
            subi    r6,r6,16                    ; Point to the next
            stw     r8,-12(r4)                  ; Save the second word
            stw     r9,-8(r4)                   ; Save the third word
            stw     r11,-4(r4)                  ; Save the fourth word
            subi    r4,r4,16                    ; Bump sink

;           Move backend double

bnoquad:    bf      28,bnodouble                ; No double to do...
            lwz     r7,-8(r6)                   ; Get the first word
            lwz     r8,-4(r6)                   ; Get the second word
            subi    r6,r6,8                     ; Point to the next
            stw     r7,-8(r4)                   ; Save the first word
            stw     r8,-4(r4)                   ; Save the second word
            subi    r4,r4,8                     ; Bump sink

;           Move backend word

bnodouble:  bf      29,bnoword                  ; No word to do...
            lwz     r7,-4(r6)                   ; Get the word
            subi    r6,r6,4                     ; Point to the next
            stw     r7,-4(r4)                   ; Save the word
            subi    r4,r4,4                     ; Bump sink

;           Move backend halfword

bnoword:    bf      30,bnohalf                  ; No halfword to do...
            lhz     r7,-2(r6)                   ; Get the halfword
            subi    r6,r6,2                     ; Point to the next
            sth     r7,-2(r4)                   ; Save the halfword
            subi    r4,r4,2                     ; Bump sink

;           Move backend byte

bnohalf:    bf      31,bcpydone                 ; Leave cuz we are all done...
            lbz     r7,-1(r6)                   ; Get the byte
            stb     r7,-1(r4)                   ; Save the single

            b       bcpydone                    ; Go exit cuz we are all done...
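;
; Illustrative only -- the reverse move above is the forward algorithm run from
; the high addresses down, so a forward overlap never clobbers unread source
; bytes.  Roughly, in C:
;
;       src += len; sink += len;                // point one past the last byte
;       peel 1/2/4/8/16 bytes until sink is line aligned;   // balhalf .. balquad
;       while (full lines remain)
;           copy 32 bytes at descending addresses;          // bnxtline
;       move the 16/8/4/2/1-byte tail;                      // bbackend ..
;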
// Here on 64-bit processors, which have a 128-byte cache line.  This can be
// called either in 32 or 64-bit mode, which makes the test for reverse moves
// a little tricky.  We've already filtered out the (source==dest) and
// (length==0) special cases.
//
// When entered:
//      r4 = destination (32 or 64-bit ptr)
//      r5 = length (always 32 bits)
//      r6 = source (32 or 64-bit ptr)
//      cr5 = noncache, fixxlate, flipcache, and restorex flags set

            .align  5
copyit64:   lis     r2,0x4000                   // r2 = 0x0000000040000000
            neg     r12,r4                      // start to compute #bytes to align dest
            bt--    noncache,noncache1          // (HACK) Do not even try anything cached...
            dcbt    0,r6                        // touch in 1st block of source
noncache1:
            add.    r2,r2,r2                    // if 0x0000000080000000 < 0, we are in 32-bit mode
            cntlzw  r9,r5                       // get highest power-of-2 in length
            rlwinm  r7,r12,0,25,31              // r7 <- bytes to 128-byte align dest
            bt--    noncache,noncache2          // (HACK) Do not even try anything cached...
            dcbtst  0,r4                        // touch in 1st destination cache block
noncache2:
            sraw    r2,r2,r9                    // get mask with 1s for leading 0s in length, plus 1 more 1-bit
            bge     copyit64a                   // skip if we are running in 64-bit mode
            rlwinm  r4,r4,0,0,31                // running in 32-bit mode, so truncate ptrs and lengths to 32 bits
            rlwinm  r5,r5,0,0,31
            rlwinm  r6,r6,0,0,31
copyit64a:                                      // now we can use 64-bit compares even if running in 32-bit mode
            sub     r8,r4,r6                    // get (dest-source)
            andc    r7,r7,r2                    // limit bytes to align by operand length
            cmpld   cr1,r8,r5                   // if (dest-source)<length, we must move in reverse
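//
// Illustrative only -- the two tricks above, sketched in C:
//
//      // Mode probe: "add. r2,r2,r2" on 0x0000000040000000 yields
//      // 0x0000000080000000.  The record bit reflects the 32-bit result in
//      // 32-bit mode (negative) but the 64-bit result in 64-bit mode
//      // (positive), so "bge" is taken only in 64-bit mode.
//
//      // Alignment limit: a mask with 1s wherever the length has leading 0s
//      // (plus one more bit), so we never align past the end of the operand.
//      uint32_t mask = (uint32_t)((int32_t)0x80000000 >> clz(len));
//      align_bytes &= ~mask;
//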