2 * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
4 * @APPLE_LICENSE_HEADER_START@
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
20 * @APPLE_LICENSE_HEADER_END@
23 ; Copy bytes of data around. handles overlapped data.
25 ; Change this to use Altivec later on, and maybe floating point.
29 #include <ppc/proc_reg.h>
32 ; Use CR5_lt to indicate non-cached
35 ; Use CR5_gt to indicate that we need to turn data translation back on
38 ; Use CR5_eq to indicate that we need to invalidate bats (if 32-bit) or turn off
39 ; 64-bit mode (if 64-bit) before returning to our caller. We overload the
40 ; bit to reduce the number of conditional branches at bcopy exit.
43 ; Use CR5_so to indicate that we need to restore real-mode cachability
44 ; Only needed on 64-bit machines
48 ; bcopy_nc(from, to, nbytes)
50 ; bcopy_nc operates on non-cached memory so we can not use any kind
51 ; of cache instructions.
; bcopy_nc path: flag the copy as non-cached so the common line-copy loop
; (nxtline) skips its dcbz/dcbt cache instructions.
; NOTE(review): the entry label and the branch into the common copy code are not visible here.
59 crset noncache ; Set non-cached
63 ; void bcopy_physvir(from, to, nbytes)
64 ; Attempt to copy physically addressed memory with translation on if conditions are met.
65 ; Otherwise do a normal bcopy_phys. This routine is used because some 32-bit processors
66 ; are very slow doing real-mode (translation off) copies, so we set up temporary BATs
67 ; for the passed phys addrs and do the copy with translation on.
69 ; Rules are: neither source nor destination can cross a page.
71 ; Interrupts must be disabled throughout the copy when this is called.
72 ; To do this, we build a
73 ; 128KB DBAT for both the source and sink. If both are in the same block, only one is
74 ; loaded. We do not touch the IBATs, so there is no issue if either physical page
75 ; address is the same as the virtual address of the instructions we are executing.
77 ; At the end, we invalidate the used DBATs.
79 ; Note that the address parameters are long longs. We will transform these to 64-bit
80 ; values. Note that on 32-bit architectures this will ignore the high half of the
81 ; passed in value. This should be ok since we can not have any bigger than 32 bit addresses
84 ; Note, this one will not work in user state
; bcopy_physvir: combine the two long long physical addresses, then either hand
; off to bcopy_phys1 (64-bit processor, or an operand crosses a 4KB page) or map
; both operands with temporary DBATs and copy with translation left on.
; After the address fix-up r3 = source ("from") and r4 = sink ("to"); r3 is copied
; to r6 as the source just before branching to copyit32.
88 .globl EXT(bcopy_physvir)
92 crclr flipcache ; (HACK) No cache flip needed
93 mfsprg r8,2 ; get processor feature flags
94 rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg
95 addic. r0,r7,-1 ; Get length - 1 (cr0 is tested by the bltlr below)
96 rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits
97 add r11,r3,r0 ; Point to last byte of source (r3)
98 rlwinm r4,r5,0,1,0 ; Duplicate high half of long long paddr into top of reg
99 mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test
100 rlwimi r4,r6,0,0,31 ; Combine bottom of long long to full 64-bits
101 mr r5,r7 ; Get the length into the right register
102 cmplw cr1,r3,r4 ; Does source == sink?
103 bt++ pf64Bitb,bcopy_phys1 ; if 64-bit processor, use standard routine (no BATs)
104 add r12,r4,r0 ; Point to last byte of sink (r4)
105 bltlr- ; Bail if length is 0 or way too big (length - 1 was negative)
106 xor r7,r11,r3 ; Source: bits that differ between first and last byte address
107 xor r8,r12,r4 ; Sink: bits that differ between first and last byte address
108 or r0,r7,r8 ; Combine wrap
110 // li r9,((PTE_WIMG_CB_CACHED_COHERENT<<3)|2) ; Set default attributes
111 li r9,((2<<3)|2) ; Set default attributes
112 rlwinm. r0,r0,0,0,19 ; Did we overflow a page? (any difference in the top 20 address bits)
113 li r7,2 ; Set validity flags
114 li r8,2 ; Set validity flags
115 bne- bcopy_phys1 ; Overflowed page, do normal physical copy...
117 crset restorex ; Remember to trash BATs on the way out
118 rlwimi r11,r9,0,15,31 ; Set source lower DBAT value (address bits 0-14 + attributes)
119 rlwimi r12,r9,0,15,31 ; Set sink lower DBAT value (address bits 0-14 + attributes)
120 rlwimi r7,r11,0,0,14 ; Set source upper DBAT value
121 rlwimi r8,r12,0,0,14 ; Set sink upper DBAT value
122 cmplw cr1,r11,r12 ; See if source and sink are in the same block
126 mtdbatl 0,r11 ; Set source lower DBAT
127 mtdbatu 0,r7 ; Set source upper DBAT
129 beq- cr1,bcpvsame ; Source and sink are in same block, DBAT 0 covers both
131 mtdbatl 1,r12 ; Set sink lower DBAT
132 mtdbatu 1,r8 ; Set sink upper DBAT
135 sync ; wait for BAT to stabilize
137 mr r6,r3 ; Set source
138 crclr noncache ; Set cached
139 crclr fixxlate ; Set translation already ok
141 b copyit32 ; Go copy it...
144 ; void bcopy_phys(from, to, nbytes)
145 ; Turns off data translation before the copy. Note, this one will
146 ; not work in user state. This routine is used on 32 and 64-bit
149 ; Note that the address parameters are long longs. We will transform these to 64-bit
150 ; values. Note that on 32-bit architectures this will ignore the high half of the
151 ; passed in value. This should be ok since we can not have any bigger than 32 bit addresses
154 ; Also note that you probably will not be happy if either the sink or source spans across the
155 ; boundary between RAM and I/O space. Good chance of hanging the machine and this code
156 ; will not check, so be careful.
; bcopy_phys: build full addresses from the long long arguments (r3 = from,
; r4 = to, r5 = length afterwards), then fall into bcopy_phys1.
; bcopy_phys1, 32-bit path: clear DR, FP and VEC in the MSR, remember in fixxlate
; whether DR has to be turned back on, and copy with copyit32. 64-bit processors
; branch to bcopy_phys64 instead.
160 .globl EXT(bcopy_phys)
163 crclr flipcache ; (HACK) No cache flip needed
164 rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg
165 mfsprg r8,2 ; get processor feature flags
166 rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits
167 rlwinm r4,r5,0,1,0 ; Duplicate high half of long long paddr into top of reg
168 mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test
169 rlwimi r4,r6,0,0,31 ; Combine bottom of long long to full 64-bits
170 mr r5,r7 ; Get the length into the right register
172 bcopy_phys1: ; enter from bcopy_physvir with pf64Bit already in cr6
173 mfmsr r9 ; Get the MSR
174 crclr noncache ; Set cached
175 bt++ pf64Bitb,bcopy_phys64 ; skip if 64-bit (only they take hint)
179 sub. r0,r3,r4 ; to==from? (cr0 is tested by the first beqlr below)
180 rlwinm r8,r9,0,MSR_DR_BIT,MSR_DR_BIT ; was translation on?
181 cmpwi cr1,r8,0 ; set cr1 beq if translation was off
182 oris r8,r8,hi16(MASK(MSR_VEC)) ; Get vector enable
183 cmplwi cr7,r5,0 ; Check if we have a 0 length
184 beqlr- ; bail if to==from
185 ori r8,r8,lo16(MASK(MSR_FP)) ; Get FP
186 mr r6,r3 ; Set source
187 andc r9,r9,r8 ; Turn off translation if it is on (should be) and FP, VEC
188 beqlr- cr7 ; Bail if length is 0
190 crclr restorex ; Make sure we do not trash BATs on the way out
191 mtmsr r9 ; Set DR translation off
194 crnot fixxlate,cr1_eq ; Remember to turn on translation if it was
195 b copyit32 ; Go copy it...
197 ; 64-bit: turn DR off and SF on, remember if we need to restore on way out.
; bcopy_phys64: physical copy on a 64-bit processor. Turns SF on and DR, VEC and
; FP off in the MSR, records in restorex/fixxlate what must be undone at exit,
; and (HACK) sets flipcache when (address >> 31) == 1 for either operand. In that
; case EE is forced off (entry EE saved in the XER), HID4 is changed so real
; accesses are cache-inhibited, and noncache is set.
199 bcopy_phys64: ; r9 = MSR
201 srdi r2,r3,31 ; (HACK) Get a 1 if source is in I/O memory
202 srdi. r0,r9,63-MSR_SF_BIT ; set cr0 beq on if SF was off when we were called
203 rlwinm r8,r9,MSR_DR_BIT+1,31,31 ; r8 <- DR bit right justified
204 cmpld cr1,r3,r4 ; to==from?
205 li r0,1 ; Note - we use this in a couple places below
206 lis r6,hi16(MASK(MSR_VEC)) ; Get vector enable
207 cmpwi cr7,r5,0 ; length==0 ?
208 ori r6,r6,lo16(MASK(MSR_FP)|MASK(MSR_DR)) ; Add in FP and DR
209 beqlr-- cr1 ; bail if to==from
210 srdi r10,r4,31 ; (HACK) Get a 1 if sink is in I/O memory
211 rldimi r9,r0,63,MSR_SF_BIT ; set SF on
212 beqlr-- cr7 ; bail if length==0
213 andc r9,r9,r6 ; turn DR, VEC, FP off
214 cmpwi cr1,r8,0 ; was DR on?
215 crmove restorex,cr0_eq ; if SF was off, remember to turn back off before we return
216 mtmsrd r9 ; turn 64-bit addressing on, data translation off
217 cmpldi cr0,r2,1 ; (HACK) Is source in I/O memory?
218 isync ; wait for it to happen
219 mr r6,r3 ; Set source
220 cmpldi cr7,r10,1 ; (HACK) Is sink in I/O memory?
221 crnot fixxlate,cr1_eq ; if DR was on, remember to turn back on before we return
223 cror flipcache,cr0_eq,cr7_eq ; (HACK) See if either source or sink is in I/O area
225 rlwinm r10,r9,MSR_EE_BIT+1,31,31 ; (HACK GLORIOUS HACK) Isolate the EE bit
226 sldi r11,r0,31-MSR_EE_BIT ; (HACK GLORIOUS HACK) Get a mask for the EE bit
227 sldi r0,r0,32+8 ; (HACK) Get the right bit to turn off caching
228 bf++ flipcache,copyit64 ; (HACK) No need to mess with caching...
231 ; HACK GLORIOUS HACK - when we force off caching, we need to also force off
232 ; interruptions. We are out of CR bits, so we need to stash the entry EE
233 ; somewhere. It is in the XER.... We NEED to change this!!!!
236 mtxer r10 ; (HACK GLORIOUS HACK) Remember EE
237 andc r9,r9,r11 ; (HACK GLORIOUS HACK) Turn off EE bit
238 mfspr r2,hid4 ; (HACK) Get HID4
239 crset noncache ; (HACK) Set non-cached
240 mtmsrd r9 ; (HACK GLORIOUS HACK) Force off EE
241 or r2,r2,r0 ; (HACK) Set bit to make real accesses cache-inhibited
242 sync ; (HACK) Sync up
244 mtspr hid4,r2 ; (HACK) Make real accesses cache-inhibited
245 isync ; (HACK) Toss prefetches
247 lis r12,0xE000 ; (HACK) Get the unlikeliest ESID possible
248 srdi r12,r12,1 ; (HACK) Make 0x7FFFFFFFF0000000
249 slbie r12 ; (HACK) Make sure the ERAT is cleared
258 ; void bcopy(from, to, nbytes)
; bcopy(from, to, nbytes): r3 = from, r4 = to, r5 = length. Clears all four CR5
; flags (cached copy, nothing to restore at exit), returns at once when to==from
; or the length is 0, then dispatches on pf64Bit to copyit64 or copyit32 with
; r6 = source and r4 = sink.
; NOTE(review): the entry label is not visible in this view.
266 crclr noncache ; Set cached
269 crclr flipcache ; (HACK) No cache flip needed
270 mfsprg r8,2 ; get processor feature flags
271 sub. r0,r4,r3 ; test for to==from in mode-independent way
272 mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test
273 cmpwi cr1,r5,0 ; Check if we have a 0 length
274 crclr restorex ; Make sure we do not trash BATs on the way out
275 mr r6,r3 ; Set source
276 crclr fixxlate ; Set translation already ok
277 beqlr- ; Bail if "to" and "from" are the same
278 beqlr- cr1 ; Bail if length is 0
279 bt++ pf64Bitb,copyit64 ; handle 64-bit processor
280 b copyit32 ; Go copy it...
283 ; When we move the memory, forward overlays must be handled. We
284 ; also can not use the cache instructions if we are from bcopy_nc.
285 ; We need to preserve R3 because it needs to be returned for memcpy.
286 ; We can be interrupted and lose control here.
288 ; There is no stack, so in order to use vectors, we would
289 ; need to take the vector exception. Any potential gains by using vectors
290 ; would be more than eaten up by this.
292 ; NOTE: this code is called in three "modes":
293 ; - on 32-bit processors (32-byte cache line)
294 ; - on 64-bit processors running in 32-bit mode (128-byte cache line)
295 ; - on 64-bit processors running in 64-bit mode (128-byte cache line)
297 ; ALSO NOTE: bcopy is called from copyin and copyout etc
298 ; with the "thread_recover" ptr set. This means bcopy must not set up a
299 ; stack frame or touch non-volatile registers, and also means that it
300 ; cannot rely on turning off interrupts, because we expect to get DSIs
301 ; and have execution aborted by a "longjmp" to the thread_recover
307 ; NB: memcpy is only called in 32-bit mode, albeit on both 32- and 64-bit
; Entry taking the destination first (r3 = "to", r4 = "from", r5 = length), the
; routine the preceding comment calls memcpy. The operands are moved into the
; registers the common code expects (r6 = source, r4 = sink); r3 is left
; untouched. 64-bit processors go to copyit64; otherwise execution falls
; through into copyit32.
; NOTE(review): the entry label and the zero-length return that should follow
; "mr. r5,r5" are not visible in this view.
310 crclr flipcache ; (HACK) No cache flip needed
311 mfsprg r8,2 ; get processor feature flags
312 cmplw cr1,r3,r4 ; "to" and "from" the same?
313 mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test
314 mr r6,r4 ; Set the "from"
315 mr. r5,r5 ; Length zero? (result in cr0)
316 crclr noncache ; Set cached
317 mr r4,r3 ; Set the "to"
318 crclr fixxlate ; Set translation already ok
319 beqlr- cr1 ; "to" and "from" are the same
321 crclr restorex ; Make sure we do not trash BATs on the way out
322 bt++ pf64Bitb,copyit64 ; handle 64-bit processors
; copyit32: common 32-bit copy entry. In: r4 = sink, r5 = length, r6 = source,
; cr5 = flags. Computes r8 = (0x7FFFFFFF >> leading zeros of length), a mask
; used later to limit the alignment move. Sets noncache when sink and source
; are less than 32 bytes apart, and branches to fwdovrlap when
; (sink - source) < length unsigned.
324 copyit32: sub r12,r4,r6 ; Get potential overlap (negative if backward move)
325 lis r8,0x7FFF ; Start up a mask
326 srawi r11,r12,31 ; Propagate the sign bit
327 dcbt br0,r6 ; Touch in the first source line
328 cntlzw r7,r5 ; Count leading zeros of the length (locates its highest set bit)
329 ori r8,r8,0xFFFF ; Make limit 0x7FFFFFFF
330 xor r9,r12,r11 ; If sink - source was negative, invert bits
331 srw r8,r8,r7 ; Get move length limitation
332 sub r9,r9,r11 ; If sink - source was negative, add 1 and get absolute value
333 cmplw r12,r5 ; See if we actually forward overlap
334 cmplwi cr7,r9,32 ; See if at least a line between source and sink
335 dcbtst br0,r4 ; Touch in the first sink line
336 cmplwi cr1,r5,32 ; Are we moving more than a line?
337 cror noncache,noncache,cr7_lt ; Set to not DCBZ output line if not enough space
338 blt- fwdovrlap ; This is a forward overlapping area, handle it...
347 ; Here we figure out how much we have to move to get the sink onto a
348 ; cache boundary. If we can, and there are still more than 32 bytes
349 ; left to move, we can really speed things up by DCBZing the sink line.
350 ; We can not do this if noncache is set because we will take an
351 ; alignment exception.
; Forward front end: move 1, 2, 4, 8 and/or 16 bytes so that the sink (r4)
; reaches a 32-byte line boundary. The byte count (limited by the r8 mask) is
; loaded into cr6/cr7 with mtcrf, and each CR bit 31..27 selects one move size.
353 G4word: ; enter from 64-bit case with word aligned uncached operands
354 neg r0,r4 ; Get the number of bytes to move to align to a line boundary
355 rlwinm. r0,r0,0,27,31 ; Clean it up and test it
356 and r0,r0,r8 ; limit to the maximum front end move
357 mtcrf 3,r0 ; Make branch mask for partial moves
358 sub r5,r5,r0 ; Set the length left to move
359 beq alline ; Already on a line... (tests the rlwinm. result above)
361 bf 31,alhalf ; No single byte to do...
362 lbz r7,0(r6) ; Get the byte
363 addi r6,r6,1 ; Point to the next
364 stb r7,0(r4) ; Save the single
365 addi r4,r4,1 ; Bump sink
367 ; Sink is halfword aligned here
369 alhalf: bf 30,alword ; No halfword to do...
370 lhz r7,0(r6) ; Get the halfword
371 addi r6,r6,2 ; Point to the next
372 sth r7,0(r4) ; Save the halfword
373 addi r4,r4,2 ; Bump sink
375 ; Sink is word aligned here
377 alword: bf 29,aldouble ; No word to do...
378 lwz r7,0(r6) ; Get the word
379 addi r6,r6,4 ; Point to the next
380 stw r7,0(r4) ; Save the word
381 addi r4,r4,4 ; Bump sink
383 ; Sink is double aligned here
385 aldouble: bf 28,alquad ; No double to do...
386 lwz r7,0(r6) ; Get the first word
387 lwz r8,4(r6) ; Get the second word
388 addi r6,r6,8 ; Point to the next
389 stw r7,0(r4) ; Save the first word
390 stw r8,4(r4) ; Save the second word
391 addi r4,r4,8 ; Bump sink
393 ; Sink is quadword aligned here
395 alquad: bf 27,alline ; No quad to do...
396 lwz r7,0(r6) ; Get the first word
397 lwz r8,4(r6) ; Get the second word
398 lwz r9,8(r6) ; Get the third word
399 stw r7,0(r4) ; Save the first word
400 lwz r11,12(r6) ; Get the fourth word
401 addi r6,r6,16 ; Point to the next
402 stw r8,4(r4) ; Save the second word
403 stw r9,8(r4) ; Save the third word
404 stw r11,12(r4) ; Save the fourth word
405 addi r4,r4,16 ; Bump sink
; Forward main loop: copy r5 / 32 full 32-byte lines. The low bits of r5 are
; first saved in cr6/cr7 for the back end. Unless noncache is set, each sink
; line is zeroed with dcbz before it is written and the source is touched
; 96 bytes ahead.
407 ; Sink is line aligned here
409 alline: rlwinm. r0,r5,27,5,31 ; Get the number of full lines to move
410 mtcrf 3,r5 ; Make branch mask for backend partial moves
411 rlwinm r11,r5,0,0,26 ; Get number of bytes we are going to move
412 beq- backend ; No full lines to move
414 sub r5,r5,r11 ; Calculate the residual
415 li r10,96 ; Stride for touch ahead
417 nxtline: subic. r0,r0,1 ; Account for the line now (cr0 drives the bgt at the bottom)
419 bt- noncache,skipz ; Skip if we are not cached...
420 dcbz br0,r4 ; Blow away the whole line because we are replacing it
421 dcbt r6,r10 ; Touch ahead a bit
423 skipz: lwz r7,0(r6) ; Get the first word
424 lwz r8,4(r6) ; Get the second word
425 lwz r9,8(r6) ; Get the third word
426 stw r7,0(r4) ; Save the first word
427 lwz r11,12(r6) ; Get the fourth word
428 stw r8,4(r4) ; Save the second word
429 lwz r7,16(r6) ; Get the fifth word
430 stw r9,8(r4) ; Save the third word
431 lwz r8,20(r6) ; Get the sixth word
432 stw r11,12(r4) ; Save the fourth word
433 lwz r9,24(r6) ; Get the seventh word
434 stw r7,16(r4) ; Save the fifth word
435 lwz r11,28(r6) ; Get the eighth word
436 addi r6,r6,32 ; Point to the next
437 stw r8,20(r4) ; Save the sixth word
438 stw r9,24(r4) ; Save the seventh word
439 stw r11,28(r4) ; Save the eighth word
440 addi r4,r4,32 ; Bump sink
441 bgt+ nxtline ; Do the next line, if any...
; Forward back end: move the residual 0-31 bytes. CR bits 27..31 (loaded from
; the low bits of the remaining length by the mtcrf at alline) select a 16, 8,
; 4, 2 and 1 byte move in turn.
444 ; Move backend quadword
446 backend: bf 27,noquad ; No quad to do...
447 lwz r7,0(r6) ; Get the first word
448 lwz r8,4(r6) ; Get the second word
449 lwz r9,8(r6) ; Get the third word
450 lwz r11,12(r6) ; Get the fourth word
451 stw r7,0(r4) ; Save the first word
452 addi r6,r6,16 ; Point to the next
453 stw r8,4(r4) ; Save the second word
454 stw r9,8(r4) ; Save the third word
455 stw r11,12(r4) ; Save the fourth word
456 addi r4,r4,16 ; Bump sink
458 ; Move backend double
460 noquad: bf 28,nodouble ; No double to do...
461 lwz r7,0(r6) ; Get the first word
462 lwz r8,4(r6) ; Get the second word
463 addi r6,r6,8 ; Point to the next
464 stw r7,0(r4) ; Save the first word
465 stw r8,4(r4) ; Save the second word
466 addi r4,r4,8 ; Bump sink
; Move backend word
470 nodouble: bf 29,noword ; No word to do...
471 lwz r7,0(r6) ; Get the word
472 addi r6,r6,4 ; Point to the next
473 stw r7,0(r4) ; Save the word
474 addi r4,r4,4 ; Bump sink
476 ; Move backend halfword
478 noword: bf 30,nohalf ; No halfword to do...
479 lhz r7,0(r6) ; Get the halfword
480 addi r6,r6,2 ; Point to the next
481 sth r7,0(r4) ; Save the halfword
482 addi r4,r4,2 ; Bump sink
; Move backend byte
486 nohalf: bf 31,bcpydone ; Leave cuz we are all done...
487 lbz r7,0(r6) ; Get the byte
488 stb r7,0(r4) ; Save the single
; Common exit path. Undoes, in order: the HID4 cache-inhibit / EE hack when
; flipcache is set; data translation being off when fixxlate is set; and, when
; restorex is set, either the temporary DBATs (32-bit processor) or 64-bit mode
; (64-bit processor).
; NOTE(review): the bcpydone, bcpydone0, bcpydone1 and bcpydone2 labels and the
; instructions that end each path are not visible in this view.
491 mfmsr r9 ; Get the MSR
492 bf++ flipcache,bcpydone0 ; (HACK) No need to mess with caching...
494 li r0,1 ; (HACK) Get a 1
495 mfxer r10 ; (HACK GLORIOUS HACK) Get the entry EE
496 sldi r0,r0,32+8 ; (HACK) Get the right bit to turn off caching
497 mfspr r2,hid4 ; (HACK) Get HID4
498 rlwinm r10,r10,31-MSR_EE_BIT,MSR_EE_BIT,MSR_EE_BIT ; (HACK GLORIOUS HACK) Set the EE bit
499 andc r2,r2,r0 ; (HACK) Clear the bit that makes real accesses cache-inhibited
500 or r9,r9,r10 ; (HACK GLORIOUS HACK) Set the EE in MSR
501 sync ; (HACK) Sync up
502 mtspr hid4,r2 ; (HACK) Make real accesses not cache-inhibited
503 isync ; (HACK) Toss prefetches
505 lis r12,0xE000 ; (HACK) Get the unlikeliest ESID possible
506 srdi r12,r12,1 ; (HACK) Make 0x7FFFFFFFF0000000
507 slbie r12 ; (HACK) Make sure the ERAT is cleared
509 mtmsr r9 ; (HACK GLORIOUS HACK) Set EE properly
512 lis r0,hi16(MASK(MSR_VEC)) ; Get the vector bit
513 ori r0,r0,lo16(MASK(MSR_FP)) ; Get the float bit
514 bf++ fixxlate,bcpydone1 ; skip if we do not need to fix translation...
515 ori r9,r9,lo16(MASK(MSR_DR)) ; Turn data translation on
516 andc r9,r9,r0 ; Make sure that FP and VEC are off
517 mtmsr r9 ; Just do it
518 isync ; Hang in there
521 bflr++ restorex ; done if we do not have to fix up addressing
522 mfsprg r8,2 ; get the feature flags again
523 mtcrf 0x02,r8 ; put pf64Bit where we can test it
524 bt++ pf64Bitb,bcpydone2 ; skip if 64-bit processor
526 ; 32-bit processor, so clear out the BATs we set up for bcopy_physvir
528 li r0,0 ; Get set to invalidate upper half
529 sync ; Make sure all is well
530 mtdbatu 0,r0 ; Clear upper DBAT 0
531 mtdbatu 1,r0 ; Clear upper DBAT 1
536 ; 64-bit processor, so turn off 64-bit mode we turned on to do bcopy_phys
539 mfmsr r9 ; get MSR again
540 andc r9,r9,r0 ; Make sure that FP and VEC are off
541 rldicl r9,r9,0,MSR_SF_BIT+1 ; clear SF
548 ; 0123456789ABCDEF0123456789ABCDEF
549 ; 0123456789ABCDEF0123456789ABCDEF
558 ; Here is where we handle a forward overlapping move. These will be slow
559 ; because we can not kill the cache of the destination until after we have
560 ; loaded/saved the source area. Also, reading memory backwards is
561 ; slower when the cache line needs to be loaded, because the critical
562 ; doubleword is loaded first, i.e., the last, then it goes back to the first,
563 ; and on in order. That means that when we are at the second to last DW we
564 ; have to wait until the whole line is in cache before we can proceed.
; Reverse (high-to-low) copy front end. r4 and r6 are advanced to one past the
; end of sink and source, then 1, 2, 4, 8 and/or 16 bytes are moved with
; negative offsets so that the sink pointer reaches a 32-byte line boundary.
; The low bits of the end-of-sink address (limited by the r8 mask) select the moves.
567 G4reverseWord: ; here from 64-bit code with word aligned uncached operands
568 fwdovrlap: add r4,r5,r4 ; Point past the last sink byte
569 add r6,r5,r6 ; Point past the last source byte
570 and r0,r4,r8 ; Apply movement limit
571 li r12,-1 ; Make sure we touch in the actual line
572 mtcrf 3,r0 ; Figure out the best way to move backwards
573 dcbt r12,r6 ; Touch in the last line of source
574 rlwinm. r0,r0,0,27,31 ; Calculate the length to adjust to cache boundary
575 dcbtst r12,r4 ; Touch in the last line of the sink
576 beq- balline ; Already on cache line boundary
578 sub r5,r5,r0 ; Precalculate move length left after alignment
580 bf 31,balhalf ; No single byte to do...
581 lbz r7,-1(r6) ; Get the byte
582 subi r6,r6,1 ; Point to the next
583 stb r7,-1(r4) ; Save the single
584 subi r4,r4,1 ; Bump sink
586 ; Sink is halfword aligned here
588 balhalf: bf 30,balword ; No halfword to do...
589 lhz r7,-2(r6) ; Get the halfword
590 subi r6,r6,2 ; Point to the next
591 sth r7,-2(r4) ; Save the halfword
592 subi r4,r4,2 ; Bump sink
594 ; Sink is word aligned here
596 balword: bf 29,baldouble ; No word to do...
597 lwz r7,-4(r6) ; Get the word
598 subi r6,r6,4 ; Point to the next
599 stw r7,-4(r4) ; Save the word
600 subi r4,r4,4 ; Bump sink
602 ; Sink is double aligned here
604 baldouble: bf 28,balquad ; No double to do...
605 lwz r7,-8(r6) ; Get the first word
606 lwz r8,-4(r6) ; Get the second word
607 subi r6,r6,8 ; Point to the next
608 stw r7,-8(r4) ; Save the first word
609 stw r8,-4(r4) ; Save the second word
610 subi r4,r4,8 ; Bump sink
612 ; Sink is quadword aligned here
614 balquad: bf 27,balline ; No quad to do...
615 lwz r7,-16(r6) ; Get the first word
616 lwz r8,-12(r6) ; Get the second word
617 lwz r9,-8(r6) ; Get the third word
618 lwz r11,-4(r6) ; Get the fourth word
619 stw r7,-16(r4) ; Save the first word
620 subi r6,r6,16 ; Point to the next
621 stw r8,-12(r4) ; Save the second word
622 stw r9,-8(r4) ; Save the third word
623 stw r11,-4(r4) ; Save the fourth word
624 subi r4,r4,16 ; Bump sink
; Reverse main loop: copy r5 / 32 full 32-byte lines from high to low addresses.
; All eight words of a line are loaded before any is stored. r5 is reused as a
; data register inside the loop; its low bits were already captured in cr6/cr7
; by the mtcrf below for the back end.
626 ; Sink is line aligned here
628 balline: rlwinm. r0,r5,27,5,31 ; Get the number of full lines to move
629 mtcrf 3,r5 ; Make branch mask for backend partial moves
630 beq- bbackend ; No full lines to move
633 ; Registers in use: R0, R1, R3, R4, R5, R6
634 ; Registers not in use: R2, R7, R8, R9, R10, R11, R12 - Ok, we can make another free for 8 of them
636 bnxtline: subic. r0,r0,1 ; Account for the line now (cr0 drives the ble/bgt below)
638 lwz r7,-32(r6) ; Get the first word
639 lwz r5,-28(r6) ; Get the second word
640 lwz r2,-24(r6) ; Get the third word
641 lwz r12,-20(r6) ; Get the fourth word
642 lwz r11,-16(r6) ; Get the fifth word
643 lwz r10,-12(r6) ; Get the sixth word
644 lwz r9,-8(r6) ; Get the seventh word
645 lwz r8,-4(r6) ; Get the eighth word
646 subi r6,r6,32 ; Point to the next
648 stw r7,-32(r4) ; Save the first word
649 ble- bnotouch ; Last time, skip touch of source...
650 dcbt br0,r6 ; Touch in next source line
652 bnotouch: stw r5,-28(r4) ; Save the second word
653 stw r2,-24(r4) ; Save the third word
654 stw r12,-20(r4) ; Save the fourth word
655 stw r11,-16(r4) ; Save the fifth word
656 stw r10,-12(r4) ; Save the sixth word
657 stw r9,-8(r4) ; Save the seventh word
658 stw r8,-4(r4) ; Save the eighth word
659 subi r4,r4,32 ; Bump sink
661 bgt+ bnxtline ; Do the next line, if any...
664 ; Note: We touched these lines in at the beginning
; Reverse back end: move the residual 0-31 bytes from high to low addresses.
; CR bits 27..31 (loaded from the remaining length by the mtcrf at balline)
; select a 16, 8, 4, 2 and 1 byte move in turn, then control goes to bcpydone.
667 ; Move backend quadword
669 bbackend: bf 27,bnoquad ; No quad to do...
670 lwz r7,-16(r6) ; Get the first word
671 lwz r8,-12(r6) ; Get the second word
672 lwz r9,-8(r6) ; Get the third word
673 lwz r11,-4(r6) ; Get the fourth word
674 stw r7,-16(r4) ; Save the first word
675 subi r6,r6,16 ; Point to the next
676 stw r8,-12(r4) ; Save the second word
677 stw r9,-8(r4) ; Save the third word
678 stw r11,-4(r4) ; Save the fourth word
679 subi r4,r4,16 ; Bump sink
681 ; Move backend double
683 bnoquad: bf 28,bnodouble ; No double to do...
684 lwz r7,-8(r6) ; Get the first word
685 lwz r8,-4(r6) ; Get the second word
686 subi r6,r6,8 ; Point to the next
687 stw r7,-8(r4) ; Save the first word
688 stw r8,-4(r4) ; Save the second word
689 subi r4,r4,8 ; Bump sink
; Move backend word
693 bnodouble: bf 29,bnoword ; No word to do...
694 lwz r7,-4(r6) ; Get the word
695 subi r6,r6,4 ; Point to the next
696 stw r7,-4(r4) ; Save the word
697 subi r4,r4,4 ; Bump sink
699 ; Move backend halfword
701 bnoword: bf 30,bnohalf ; No halfword to do...
702 lhz r7,-2(r6) ; Get the halfword
703 subi r6,r6,2 ; Point to the next
704 sth r7,-2(r4) ; Save the halfword
705 subi r4,r4,2 ; Bump sink
; Move backend byte
709 bnohalf: bf 31,bcpydone ; Leave cuz we are all done...
710 lbz r7,-1(r6) ; Get the byte
711 stb r7,-1(r4) ; Save the single
713 b bcpydone ; Go exit cuz we are all done...
716 // Here on 64-bit processors, which have a 128-byte cache line. This can be
717 // called either in 32 or 64-bit mode, which makes the test for reverse moves
718 // a little tricky. We've already filtered out the (source==dest) and (len==0)
722 // r4 = destination (32 or 64-bit ptr)
723 // r5 = length (always 32 bits)
724 // r6 = source (32 or 64-bit ptr)
725 // cr5 = noncache, fixxlate, flipcache, and restorex flags set
// copyit64 entry: work out whether we are running in 32-bit or 64-bit mode,
// compute r7 = bytes needed to 128-byte align the destination (limited by the
// operand length), and set cr1 for the reverse-move test (dest-source)<length.
// Uncached copies branch to c64uncached; cached reverse moves go to c64rdouble.
// NOTE(review): the copyit64, noncache1 and noncache2 labels and part of the
// 32-bit truncation sequence are not visible in this view.
729 lis r2,0x4000 // r2 = 0x00000000 40000000
730 neg r12,r4 // start to compute #bytes to align dest
731 bt-- noncache,noncache1 // (HACK) Do not even try anything cached...
732 dcbt 0,r6 // touch in 1st block of source
734 add. r2,r2,r2 // if 0x00000000 80000000 < 0, we are in 32-bit mode
735 cntlzw r9,r5 // get highest power-of-2 in length
736 rlwinm r7,r12,0,25,31 // r7 <- bytes to 128-byte align dest
737 bt-- noncache,noncache2 // (HACK) Do not even try anything cached...
738 dcbtst 0,r4 // touch in 1st destination cache block
740 sraw r2,r2,r9 // get mask with 1s for leading 0s in length, plus 1 more 1-bit
741 bge copyit64a // skip if we are running in 64-bit mode
742 rlwinm r4,r4,0,0,31 // running in 32-bit mode, so truncate ptrs and lengths to 32 bits
745 copyit64a: // now we can use 64-bit compares even if running in 32-bit mode
746 sub r8,r4,r6 // get (dest-source)
747 andc r7,r7,r2 // limit bytes to align by operand length
748 cmpld cr1,r8,r5 // if (dest-source)<length, must move reverse
749 bt-- noncache,c64uncached // skip if uncached
750 blt-- cr1,c64rdouble // handle cached reverse moves
753 // Forward, cached or doubleword aligned uncached. This is the common case.
754 // r4-r6 = dest, length, source (as above)
755 // r7 = #bytes 128-byte align dest (limited by copy length)
756 // cr5 = flags, as above
// Forward 64-bit path. Stages: bytes until dest is doubleword aligned,
// doublewords until dest is 128-byte aligned, pipelined 128-byte chunks (with
// dcbz128 on the destination unless noncache is set or source and dest are
// less than 128 bytes apart), leftover doublewords, then leftover bytes.
// NOTE(review): the c64double label and most loop bodies are not visible in
// this view; only their labels and the set-up code appear.
759 andi. r8,r7,7 // r8 <- #bytes to doubleword align
760 srwi r9,r7,3 // r9 <- #doublewords to 128-byte align
761 sub r5,r5,r7 // adjust length remaining
762 cmpwi cr1,r9,0 // any doublewords to move to cache align?
763 srwi r10,r5,7 // r10 <- 128-byte chunks to xfer after aligning dest
764 cmpwi cr7,r10,0 // set cr7 on chunk count
765 beq c64double2 // dest already doubleword aligned
769 .align 5 // align inner loops
770 c64double1: // copy bytes until dest is doubleword aligned
777 c64double2: // r9/cr1=doublewords, r10=128-byte chunks, cr7=blt if r5==0
778 beq cr1,c64double4 // no doublewords to xfer in order to cache align
782 .align 5 // align inner loops
783 c64double3: // copy doublewords until dest is 128-byte aligned
790 // Here to xfer 128-byte chunks, if any. Because the IBM 970 cannot issue two stores/cycle,
791 // we pipeline the inner loop so we can pair loads and stores. Since we only have 8 GPRs for
792 // data (64 bytes), we load/store each twice per 128-byte chunk.
794 c64double4: // r10/cr7=128-byte chunks
795 rlwinm r0,r5,29,28,31 // r0 <- count of leftover doublewords, after moving chunks
796 cmpwi cr1,r0,0 // set cr1 on leftover doublewords
797 beq cr7,c64double7 // no 128-byte chunks
798 sub r8,r6,r4 // r8 <- (source - dest)
799 li r9,128 // start at next cache line (we've already touched in 1st line)
800 cmpldi cr7,r8,128 // if (source-dest)<128, cannot use dcbz128 because of overlap
801 cror noncache,cr7_lt,noncache // turn on "noncache" flag if (source-dest)<128
802 bt-- noncache,noncache3 // (HACK) Skip cache touch if noncachable
803 dcbt128 r9,r6,1 // start forward stream
807 ld r0,0(r6) // start pipe: load 1st half-line
815 b c64InnerLoopEntryPt
817 .align 5 // align inner loop
818 c64InnerLoop: // loop copying 128-byte cache lines to 128-aligned destination
819 std r0,64(r4) // store 2nd half of chunk n
820 ld r0,0(r6) // load 1st half of chunk n+1
835 addi r4,r4,128 // advance to next dest chunk
836 c64InnerLoopEntryPt: // initial entry into loop, with 1st halfline loaded
837 bt noncache,c64InnerLoop1 // skip if uncached or overlap
838 dcbz128 0,r4 // avoid prefetch of next cache line
840 std r0,0(r4) // store 1st half of chunk n
841 ld r0,64(r6) // load 2nd half of chunk n
856 addi r6,r6,128 // advance to next source chunk if any
857 bdnz c64InnerLoop // loop if more chunks
859 std r0,64(r4) // store 2nd half of last chunk
867 addi r4,r4,128 // advance to next dest chunk
869 c64double7: // r5 <- leftover bytes, cr1 set on doubleword count
870 rlwinm r0,r5,29,28,31 // r0 <- count of leftover doublewords (0-15)
871 andi. r5,r5,7 // r5/cr0 <- count of leftover bytes (0-7)
872 beq cr1,c64byte // no leftover doublewords
876 .align 5 // align inner loop
877 c64double8: // loop copying leftover doublewords
885 // Forward byte loop.
887 c64byte: // r5/cr0 <- byte count (can be big if unaligned uncached)
888 beq bcpydone // done if no leftover bytes
892 .align 5 // align inner loop
903 // Uncached copies. We must avoid unaligned accesses, since they always take alignment
904 // exceptions on uncached memory on 64-bit processors. This may mean we copy long operands
905 // a byte at a time, but that is still much faster than alignment exceptions.
906 // r4-r6 = dest, length, source (as above)
907 // r2 = mask of 1s for leading 0s in length, plus 1 extra 1
908 // r7 = #bytes to copy to 128-byte align dest (limited by operand length)
909 // cr1 = blt if reverse move required
// Uncached dispatch: choose a copy routine from the relative alignment of
// source and dest (low bits of source XOR dest). Relatively doubleword aligned
// uses the LD/STD paths, relatively word aligned uses the 32-bit G4 code with
// r8 as its length-limit mask, anything else is copied a byte at a time.
// NOTE(review): the c64uncached and c64reverseUncached labels and one
// instruction of the reverse byte set-up are not visible in this view.
912 xor r0,r6,r4 // get relative alignment
913 rlwinm r10,r0,0,29,31 // relatively doubleword aligned?
914 rlwinm r11,r0,0,30,31 // relatively word aligned?
915 not r8,r2 // get mask to limit initial length of copy for G4word
916 blt cr1,c64reverseUncached
918 cmpwi cr0,r10,0 // set cr0 beq if doubleword aligned
919 cmpwi cr1,r11,0 // set cr1 beq if word aligned
920 beq cr0,c64double // doubleword aligned
921 beq cr1,G4word // word aligned, use G3/G4 code
922 cmpwi r5,0 // set cr0 on byte count
923 b c64byte // unaligned operands
926 cmpwi cr0,r10,0 // set cr0 beq if doubleword aligned
927 cmpwi cr1,r11,0 // set cr1 beq if word aligned
928 beq cr0,c64rdouble // doubleword aligned so can use LD/STD
929 beq cr1,G4reverseWord // word aligned, use G3/G4 code
930 add r6,r6,r5 // point to (end+1) of source and dest
932 cmpwi r5,0 // set cr0 on length
933 b c64rbyte // copy a byte at a time
937 // Reverse doubleword copies. This is used for all cached copies, and doubleword
938 // aligned uncached copies.
939 // r4 = destination (32 or 64-bit ptr)
940 // r5 = length (always 32 bits)
941 // r6 = source (32 or 64-bit ptr)
942 // cr5 = noncache, fixxlate, and restorex flags set
// Reverse 64-bit path. Stages: bytes until the end-of-dest pointer is
// doubleword aligned, 64-byte chunks, leftover doublewords, then leftover
// bytes via c64rbyte.
// NOTE(review): the c64rdouble and c64rd0 labels, the matching "add" for the
// dest pointer and all loop bodies are not visible in this view.
945 add r6,r6,r5 // point to (end+1) of source and dest
947 rlwinm. r7,r4,0,29,31 // r7 <- #bytes to doubleword align dest
948 cmplw cr1,r7,r5 // operand long enough to doubleword align?
949 blt cr1,c64rd0 // yes
952 sub r5,r5,r7 // adjust length
953 srwi r8,r5,6 // r8 <- 64-byte chunks to xfer
954 cmpwi cr1,r8,0 // any chunks?
955 beq c64rd2 // source already doubleword aligned
958 c64rd1: // copy bytes until source doubleword aligned
963 c64rd2: // r8/cr1 <- count of 64-byte chunks
964 rlwinm r0,r5,29,29,31 // r0 <- count of leftover doublewords
965 andi. r5,r5,7 // r5/cr0 <- count of leftover bytes
966 cmpwi cr7,r0,0 // leftover doublewords?
967 beq cr1,c64rd4 // no chunks to xfer
968 li r9,-128 // start at next cache line
970 bt noncache,c64rd3 // (HACK) Do not start a stream if noncachable...
971 dcbt128 r9,r6,3 // start reverse stream
974 .align 5 // align inner loop
975 c64rd3: // loop copying 64-byte chunks
994 c64rd4: // r0/cr7 = leftover doublewords r5/cr0 = leftover bytes
995 beq cr7,c64rbyte // no leftover doublewords
998 c64rd5: // loop copying leftover doublewords
1004 // Reverse byte loop.
1006 c64rbyte: // r5/cr0 <- byte count (can be big if unaligned uncached)
1007 beq bcpydone // done if no leftover bytes