apple/xnu: osfmk/ppc/bcopy.s
1 /*
2 * Copyright (c) 2002-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23 ;
24 ; Copy bytes of data around. Handles overlapped data.
25 ;
26 ;
27 #include <ppc/asm.h>
28 #include <ppc/proc_reg.h>
29 #include <assym.s>
30
31 ; These routines use CR5 for certain flags:
32 ; Use CR5_lt to indicate non-cached (in bcopy and memcpy)
33 #define noncache 20
34
35
36 ; The bcopy_phys variants use a stack frame so they can call bcopy as a subroutine.
37 #define BCOPY_SF_SIZE 32 // total size
38 #define BCOPY_SF_MSR 16 // we save caller's MSR here (possibly minus VEC and FP)
39
40
41 #define kShort 32 // short operands are special cased
42
43
44 ; void bcopy_physvir_32(from, to, nbytes)
45 ;
46 ; Attempt to copy physically addressed memory with translation on if conditions are met.
47 ; Otherwise do a normal bcopy_phys. This routine is used because some 32-bit processors
48 ; are very slow doing real-mode (translation off) copies, so we set up temporary BATs
49 ; for the passed phys addrs and do the copy with translation on.
50 ;
51 ; Rules are: - neither source nor destination can cross a page.
52 ; - Interrupts must be disabled when this routine is called.
53 ; - Translation must be on when called.
54 ;
55 ; To do the copy, we build a 128KB DBAT for both the source and sink. If both are the same, only one
56 ; is loaded. We do not touch the IBATs, so there is no issue if either physical page
57 ; address is the same as the virtual address of the instructions we are executing.
58 ;
59 ; At the end, we invalidate the used DBATs.
60 ;
61 ; Note that the address parameters are long longs. We will transform these to 64-bit
62 ; values. Note that on 32-bit architectures this will ignore the high half of the
63 ; passed-in value. This should be OK, since we cannot have addresses bigger than 32 bits
64 ; there anyhow.
65 ;
66 ; Note also that this routine is used only on 32-bit machines. If you're contemplating use
67 ; on a 64-bit processor, use the physical memory window instead; please refer to copypv()
68 ; for an example of how this is done.
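;
; As an editor's illustration only (the helper name is hypothetical, not part of this file),
; the "neither operand crosses a page" rule tested below is equivalent to this C sketch:
;
;   #include <stdint.h>
;   #include <stdbool.h>
;
;   /* True if [addr, addr+len) stays within a single 4KB page (len > 0). */
;   static bool fits_in_one_page(uint32_t addr, uint32_t len) {
;       uint32_t last = addr + (len - 1);              /* last byte touched       */
;       return ((addr ^ last) & 0xFFFFF000u) == 0;     /* page number unchanged?  */
;   }
;
;   /* bcopy_physvir_32 takes the DBAT fast path only if both the source and the
;      sink qualify; otherwise it falls through to the normal bcopy_phys path.  */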
69
70 .align 5
71 .globl EXT(bcopy_physvir_32)
72
73 LEXT(bcopy_physvir_32)
74 mflr r0 ; get return address
75 rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg
76 mfsprg r8,2 ; get processor feature flags
77 stw r0,8(r1) ; save return address
78 rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits
79 stwu r1,-BCOPY_SF_SIZE(r1) ; push on a stack frame so we can call bcopy
80 mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test
81 subi r0,r7,1 ; get length - 1
82 rlwinm r4,r5,0,1,0 ; Duplicate high half of long long paddr into top of reg
83 add r11,r3,r0 ; Point to last byte of sink
84 mr r5,r7 ; Get the length into the right register
85 rlwimi r4,r6,0,0,31 ; Combine bottom of long long to full 64-bits
86
87 ; This test for page overflow may not work if the length is negative. Negative lengths are invalid input
88 ; to bcopy_physvir() on 32-bit machines, and will result in a panic.
89
90 add r12,r4,r0 ; Point to last byte of source
91 xor r7,r11,r3 ; See if we went to next page
92 xor r8,r12,r4 ; See if we went to next page
93 or r0,r7,r8 ; Combine wrap
94
95 // li r9,((PTE_WIMG_CB_CACHED_COHERENT<<3)|2) ; Set default attributes
96 li r9,((2<<3)|2) ; Set default attributes
97 rlwinm. r0,r0,0,0,19 ; Did we overflow a page?
98 li r7,2 ; Set validity flags
99 li r8,2 ; Set validity flags
100 bne- bcopy_phys1 ; Overflowed page, do normal physical copy...
101
102 rlwimi r11,r9,0,15,31 ; Set sink lower DBAT value
103 rlwimi r12,r9,0,15,31 ; Set source lower DBAT value
104 rlwimi r7,r11,0,0,14 ; Set sink upper DBAT value
105 rlwimi r8,r12,0,0,14 ; Set source upper DBAT value
106 cmplw cr1,r11,r12 ; See if sink and source are same block
107
108 sync
109
110 mtdbatl 0,r11 ; Set sink lower DBAT
111 mtdbatu 0,r7 ; Set sink upper DBAT
112
113 beq- cr1,bcpvsame ; Source and sink are in same block
114
115 mtdbatl 1,r12 ; Set source lower DBAT
116 mtdbatu 1,r8 ; Set source upper DBAT
117
118 bcpvsame:
119 sync ; wait for the BATs to stabilize
120 isync
121
122 bl EXT(bcopy) ; BATs set up, args in r3-r5, so do the copy with DR on
123
124 li r0,0 ; Get set to invalidate upper half of BATs
125 sync ; Make sure all is well
126 mtdbatu 0,r0 ; Clear sink upper DBAT
127 mtdbatu 1,r0 ; Clear source upper DBAT
128 sync
129 isync
130
131 lwz r0,BCOPY_SF_SIZE+8(r1) ; get return address
132 addi r1,r1,BCOPY_SF_SIZE ; pop off stack frame
133 mtlr r0
134 blr
135
136
137 ; void bcopy_phys(from, to, nbytes)
138 ;
139 ; Turns off data translation before the copy. This one will not work in user state.
140 ; This routine is used on 32 and 64-bit machines.
141 ;
142 ; Note that the address parameters are long longs. We will transform these to 64-bit
143 ; values. Note that on 32-bit architectures this will ignore the high half of the
144 ; passed-in value. This should be OK, since we cannot have addresses bigger than 32 bits
145 ; there anyhow.
146 ;
147 ; Also note that you probably will not be happy if either the sink or source spans across the
148 ; boundary between RAM and I/O space. There is a good chance of hanging the machine, and this
149 ; code does not check for that, so be careful.
150 ;
151 ; NOTE: when called, translation must be on, and we must be in 32-bit mode.
152 ; Interrupts may or may not be disabled.
153
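;
; A rough C-level sketch of the 32-bit path (illustrative only; mfmsr()/mtmsr() and the
; MSR_* masks stand in for the privileged instructions and assembler constants used below):
;
;   #include <stdint.h>
;
;   void bcopy_phys_sketch(uint64_t from, uint64_t to, uint32_t nbytes) {
;       uint32_t msr = mfmsr();                          /* caller's MSR             */
;       mtmsr(msr & ~(MSR_VEC | MSR_FP | MSR_DR));       /* VEC, FP, translation off */
;       bcopy((void *)(uintptr_t)from, (void *)(uintptr_t)to, nbytes);
;       mtmsr(mfmsr() | MSR_DR);                         /* translation back on      */
;   }
;
; The 64-bit path additionally turns on MSR_SF and may have to special-case I/O space,
; as described at bcopy_phys64 below.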
154 .align 5
155 .globl EXT(bcopy_phys)
156
157 LEXT(bcopy_phys)
158 mflr r0 ; get return address
159 rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg
160 stw r0,8(r1) ; save
161 mfsprg r8,2 ; get processor feature flags
162 stwu r1,-BCOPY_SF_SIZE(r1) ; push on a stack frame so we can call bcopy
163 rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits
164 rlwinm r4,r5,0,1,0 ; Duplicate high half of long long paddr into top of reg
165 mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test
166 rlwimi r4,r6,0,0,31 ; Combine bottom of long long to full 64-bits
167 mr r5,r7 ; Get the length into the right register
168
169 bcopy_phys1: ; enter from bcopy_physvir with pf64Bit in cr6 and parms in r3-r5
170 mfmsr r9 ; Get the MSR
171 lis r6,hi16(MASK(MSR_VEC)) ; Get vector enable
172 ori r6,r6,lo16(MASK(MSR_FP)|MASK(MSR_DR)) ; Add in FP and DR
173 andc r9,r9,r6 ; unconditionally turn DR, VEC, and FP off
174 bt++ pf64Bitb,bcopy_phys64 ; skip if 64-bit (only they take hint)
175
176 ; 32-bit CPUs
177
178 mtmsr r9 ; turn DR, FP, and VEC off
179 isync ; Wait for it
180
181 bl EXT(bcopy) ; do the copy with translation off and caching on
182
183 mfmsr r9 ; Get the MSR
184 ori r9,r9,lo16(MASK(MSR_DR)) ; turn translation back on (but leave VEC and FP off)
185 mtmsr r9 ; restore msr
186 isync ; wait for it to happen
187 lwz r0,BCOPY_SF_SIZE+8(r1) ; get return address once translation is back on
188 mtlr r0
189 addi r1,r1,BCOPY_SF_SIZE ; pop off stack frame
190 blr
191
192
193 ; 64-bit: turn DR off and SF on.
194
195 bcopy_phys64: ; r9 = MSR with DR, VEC, and FP off
196 ori r8,r9,lo16(MASK(MSR_DR)) ; make a copy with DR back on... this is what we return to caller
197 srdi r2,r3,31 ; Get a 1 if source is in I/O memory
198 li r0,1 ; Note - we use this in a couple places below
199 srdi r10,r4,31 ; Get a 1 if sink is in I/O memory
200 std r8,BCOPY_SF_MSR(r1) ; save caller's MSR so we remember whether EE was on
201 rldimi r9,r0,63,MSR_SF_BIT ; set SF on in MSR we will copy with
202 cmpldi cr0,r2,1 ; Is source in I/O memory?
203 cmpldi cr7,r10,1 ; Is sink in I/O memory?
204 mtmsrd r9 ; turn 64-bit addressing on, data translation off
205 isync ; wait for it to happen
206 cror cr7_eq,cr0_eq,cr7_eq ; See if either source or sink is in I/O area
207 beq-- cr7,io_space_real_mode_copy ; an operand is in I/O space
208
209 bl EXT(bcopy) ; do copy with DR off and SF on, cache enabled
210
211 bcopy_phys64x:
212 mfmsr r9 ; Get the MSR we used to copy
213 rldicl r9,r9,0,MSR_SF_BIT+1 ; clear SF
214 ori r9,r9,lo16(MASK(MSR_DR)) ; turn translation back on
215 mtmsrd r9 ; turn 64-bit mode off, translation back on
216 isync ; wait for it to happen
217 lwz r0,BCOPY_SF_SIZE+8(r1) ; get return address once translation is back on
218 ld r8,BCOPY_SF_MSR(r1) ; get caller's MSR once translation is back on
219 mtlr r0
220 mtmsrd r8,1 ; turn EE back on if necessary
221 addi r1,r1,BCOPY_SF_SIZE ; pop off stack frame
222 blr
223
224 ; We need to copy with DR off, but one of the operands is in I/O space. To avoid wedging U3,
225 ; which cannot handle a cache burst in I/O space, we must turn caching off for the real memory access.
226 ; This can only be done by setting bits in HID4. We cannot lose control and execute random code in
227 ; this state, so we have to disable interrupts as well. This is an unpleasant hack.
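;
; In outline, as a hedged paraphrase (mtspr/mfspr/slbie here are stand-ins for the
; privileged instructions, and HID4_RM_CI is an illustrative name for the bit set by
; "sldi r0,r0,32+8"; none of these are real C symbols in this file):
;
;   msr  &= ~MSR_EE;                     /* no interrupts while HID4 is changed    */
;   hid4  = mfspr(HID4);
;   mtmsrd(msr);                         /* EE off                                 */
;   mtspr(HID4, hid4 | HID4_RM_CI);      /* real-mode accesses now cache-inhibited */
;   slbie(junk_esid);                    /* toss any stale ERAT entries            */
;   bcopy_nc(from, to, nbytes);          /* uncached copy                          */
;   mtspr(HID4, hid4);                   /* restore HID4                           */
;   slbie(junk_esid);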
228
229 io_space_real_mode_copy: ; r0=1, r9=MSR we want to copy with
230 sldi r11,r0,31-MSR_EE_BIT ; Get a mask for the EE bit
231 sldi r0,r0,32+8 ; Get the right bit to turn off caching
232 andc r9,r9,r11 ; Turn off EE bit
233 mfspr r2,hid4 ; Get HID4
234 mtmsrd r9,1 ; Force off EE
235 or r2,r2,r0 ; Set bit to make real accesses cache-inhibited
236 sync ; Sync up
237 mtspr hid4,r2 ; Make real accesses cache-inhibited
238 isync ; Toss prefetches
239
240 lis r12,0xE000 ; Get the unlikeliest ESID possible
241 srdi r12,r12,1 ; Make 0x7FFFFFFFF0000000
242 slbie r12 ; Make sure the ERAT is cleared
243
244 sync
245 isync
246
247 bl EXT(bcopy_nc) ; copy with SF on and EE, DR, VEC, and FP off, cache inhibited
248
249 li r0,1 ; Get a 1
250 sldi r0,r0,32+8 ; Get the right bit to turn off caching
251 mfspr r2,hid4 ; Get HID4
252 andc r2,r2,r0 ; Clear bit to make real accesses cache-inhibited
253 sync ; Sync up
254 mtspr hid4,r2 ; Make real accesses not cache-inhibited
255 isync ; Toss prefetches
256
257 lis r12,0xE000 ; Get the unlikeliest ESID possible
258 srdi r12,r12,1 ; Make 0x7FFFFFFFF0000000
259 slbie r12 ; Make sure the ERAT is cleared
260 b bcopy_phys64x
261
262
263 ;
264 ; shortcopy
265 ;
266 ; Special case short operands (<32 bytes), which are very common. Note that the check for
267 ; reverse vs normal moves isn't quite correct in 64-bit mode; in rare cases we will move in
268 ; reverse when it wasn't necessary to do so. This is OK, since performance of the two cases
269 ; is similar. We do get the direction right when it counts (i.e., when the operands overlap).
270 ; Also note that we use the G3/G4 "backend" code, even on G5. This is OK too, since G5 has
271 ; plenty of load/store dispatch bandwidth in this case, the extra ops are hidden by latency,
272 ; and using word instead of doubleword moves reduces the possibility of unaligned accesses,
273 ; which cost about 20 cycles if they cross a 32-byte boundary on G5. Finally, because we
274 ; might do unaligned accesses, this code cannot be called from bcopy_nc().
275 ; r4 = destination
276 ; r5 = length (<32)
277 ; r6 = source
278 ; r12 = (dest - source)
279
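;
; The direction test, as a hedged C sketch (illustrative only):
;
;   #include <stdint.h>
;
;   /* Move in reverse iff the start of the destination lies inside the source
;      operand, i.e. (unsigned)(dest - source) < length.  Only the low 32 bits
;      are compared here, which is why the test is approximate in 64-bit mode. */
;   static int must_copy_reverse(uint32_t dest, uint32_t source, uint32_t length) {
;       return (uint32_t)(dest - source) < length;
;   }
;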
280 .align 5
281 shortcopy:
282 cmplw r12,r5 ; must move reverse if (dest-source)<length
283 mtcrf 2,r5 ; move length to cr6 and cr7 one at a time...
284 mtcrf 1,r5 ; ...which is faster on G4 and G5
285 bge++ backend ; handle forward moves (most common case)
286 add r6,r6,r5 ; point one past end of operands in reverse moves
287 add r4,r4,r5
288 b bbackend ; handle reverse moves
289
290 ;
291 ; void bcopy(from, to, nbytes)
292 ;
293 ; NOTE: bcopy is called from copyin and copyout etc with the "thread_recover" ptr set.
294 ; This means bcopy must not set up a stack frame or touch non-volatile registers, and also means that it
295 ; cannot rely on turning off interrupts, because we expect to get DSIs and have execution aborted by a "longjmp"
296 ; to the thread_recover routine. What this means is that it would be hard to use vector or floating point
297 ; registers to accelerate the copy.
298 ;
299 ; NOTE: this code can be called in any of three "modes":
300 ; - on 32-bit processors (32-byte cache line)
301 ; - on 64-bit processors running in 32-bit mode (128-byte cache line)
302 ; - on 64-bit processors running in 64-bit mode (128-byte cache line)
303
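;
; For reference, the C-level interface this entry point implements (a sketch; the
; kernel's own prototype may differ in exact types):
;
;   void bcopy(const void *from, void *to, size_t nbytes);   /* overlap-safe */
;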
304 .align 5
305 .globl EXT(bcopy)
306 .globl EXT(bcopy_nop_if_32bit)
307
308 LEXT(bcopy)
309 cmplwi cr1,r5,kShort ; less than 32 bytes?
310 sub. r12,r4,r3 ; test for to==from in mode-independent way, start fwd/rev check
311 mr r6,r3 ; Set source (must preserve r3 for memcopy return)
312 blt cr1,shortcopy ; special case short operands
313 crclr noncache ; Set cached
314 LEXT(bcopy_nop_if_32bit)
315 bne++ copyit64 ; handle 64-bit processor (patched to NOP if 32-bit processor)
316 bne+ copyit32 ; handle 32-bit processor
317 blr ; to==from so nothing to do
318
319 ;
320 ; bcopy_nc(from, to, nbytes)
321 ;
322 ; bcopy_nc() operates on non-cached memory, so we cannot use any kind of cache instruction.
323 ; Furthermore, we must avoid all unaligned accesses on 64-bit machines, since they take
324 ; alignment exceptions. Thus we cannot use "shortcopy", which could do unaligned lwz/stw.
325 ; Like bcopy(), bcopy_nc() can be called both in 32- and 64-bit mode.
326
327 .align 5
328 .globl EXT(bcopy_nc)
329 .globl EXT(bcopy_nc_nop_if_32bit)
330
331 LEXT(bcopy_nc)
332 cmpwi cr1,r5,0 ; Check if we have a 0 length
333 sub. r12,r4,r3 ; test for to==from in mode-independent way, start fwd/rev check
334 mr r6,r3 ; Set source (must preserve r3 for memcopy return)
335 crset noncache ; Set non-cached
336 cror cr0_eq,cr1_eq,cr0_eq ; set cr0 beq if either length zero or to==from
337 LEXT(bcopy_nc_nop_if_32bit)
338 bne++ copyit64 ; handle 64-bit processor (patched to NOP if 32-bit processor)
339 bne+ copyit32 ; handle 32-bit processor
340 blr ; either zero length or to==from
341
342 ;
343 ; void* memcpy(to, from, nbytes)
344 ; void* memmove(to, from, nbytes)
345 ;
346 ; memcpy() and memmove() are only called in 32-bit mode, albeit on both 32- and 64-bit processors.
347 ; However, they would work correctly if called in 64-bit mode.
348
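;
; Note that the argument order differs from bcopy(), which is why r3/r4/r6 are shuffled
; below. A hedged C picture of the relationship (illustrative only):
;
;   void *memcpy (void *to, const void *from, size_t n);   /* returns to            */
;   void *memmove(void *to, const void *from, size_t n);   /* same entry point here */
;
;   /* Conceptually: memcpy(to, from, n) behaves like bcopy(from, to, n) and then
;      returns to; r3 is preserved through the copy for exactly that reason.      */
;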
349 .align 5
350 .globl EXT(memcpy)
351 .globl EXT(memmove)
352 .globl EXT(memcpy_nop_if_32bit)
353
354 LEXT(memcpy)
355 LEXT(memmove)
356 cmplwi cr1,r5,kShort ; less than 32 bytes?
357 sub. r12,r3,r4 ; test for to==from in mode-independent way, start fwd/rev check
358 mr r6,r4 ; Set source
359 mr r4,r3 ; Set the "to" (must preserve r3 for return value)
360 blt cr1,shortcopy ; special case short operands
361 crclr noncache ; Set cached
362 LEXT(memcpy_nop_if_32bit)
363 bne++ copyit64 ; handle 64-bit processor (patched to NOP if 32-bit processor)
364 beqlr- ; exit if to==from
365
366
367 ; Here to copy on 32-bit processors.
368 ;
369 ; When we move the memory, forward overlaps must be handled. We
370 ; also cannot use the cache instructions if we were entered from bcopy_nc().
371 ; We must preserve r3 because it is the return value for memcpy().
372 ; We can be interrupted and lose control here.
373 ;
374 ; When entered:
375 ; r4 = destination
376 ; r5 = length (>0)
377 ; r6 = source
378 ; r12 = (dest - source)
379 ; cr5 = noncache flag
380
381 copyit32: ; WARNING! can drop down to this label
382 cmplw cr1,r12,r5 ; must move reverse if (dest-source)<length
383 cntlzw r11,r5 ; get magnitude of length
384 dcbt 0,r6 ; start to touch in source
385 lis r10,hi16(0x80000000) ; get 0x80000000
386 neg r9,r4 ; start to get alignment for destination
387 dcbtst 0,r4 ; start to touch in destination
388 sraw r8,r10,r11 ; get mask based on operand length, to limit alignment
389 blt- cr1,reverse32bit ; reverse move required
390
391 ; Forward moves on 32-bit machines, also word aligned uncached ops on 64-bit machines.
392 ; NOTE: we never do an unaligned access if the source and destination are "relatively"
393 ; word aligned. We depend on this in the uncached case on 64-bit processors.
394 ; r4 = destination
395 ; r5 = length (>0)
396 ; r6 = source
397 ; r8 = inverse of largest mask smaller than operand length
398 ; r9 = neg(dest), used to compute alignment
399 ; cr5 = noncache flag
400
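;
; How the limit mask in r8 works, as a hedged C sketch (illustrative only; r8 is
; computed above by "cntlzw r11,r5" and "sraw r8,r10,r11"):
;
;   #include <stdint.h>
;
;   /* r8 has the top (clz(len)+1) bits set, so ~r8 keeps only the bits strictly
;      below the highest set bit of len.  ANDing the bytes-to-align count with ~r8
;      (the "andc" below) guarantees the front-end move never exceeds the length. */
;   static uint32_t limit_alignment(uint32_t bytes_to_align, uint32_t len) {
;       uint32_t clz  = (uint32_t)__builtin_clz(len);     /* len > 0 here  */
;       uint32_t mask = ~0u << (31 - clz);                /* r8            */
;       return bytes_to_align & ~mask;                    /* andc r0,r7,r8 */
;   }
;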
401 forward32bit: ; enter from 64-bit CPUs with word aligned uncached operands
402 rlwinm r7,r9,0,0x1F ; get bytes to 32-byte-align destination
403 andc. r0,r7,r8 ; limit to the maximum front end move
404 mtcrf 0x01,r0 ; move length to cr6 and cr7 one cr at a time...
405 beq alline ; Already on a line...
406
407 mtcrf 0x02,r0 ; ...since moving more than one is slower on G4 and G5
408 sub r5,r5,r0 ; Set the length left to move
409
410 bf 31,alhalf ; No single byte to do...
411 lbz r7,0(r6) ; Get the byte
412 addi r6,r6,1 ; Point to the next
413 stb r7,0(r4) ; Save the single
414 addi r4,r4,1 ; Bump sink
415
416 ; Sink is halfword aligned here
417
418 alhalf: bf 30,alword ; No halfword to do...
419 lhz r7,0(r6) ; Get the halfword
420 addi r6,r6,2 ; Point to the next
421 sth r7,0(r4) ; Save the halfword
422 addi r4,r4,2 ; Bump sink
423
424 ; Sink is word aligned here
425
426 alword: bf 29,aldouble ; No word to do...
427 lwz r7,0(r6) ; Get the word
428 addi r6,r6,4 ; Point to the next
429 stw r7,0(r4) ; Save the word
430 addi r4,r4,4 ; Bump sink
431
432 ; Sink is double aligned here
433
434 aldouble: bf 28,alquad ; No double to do...
435 lwz r7,0(r6) ; Get the first word
436 lwz r8,4(r6) ; Get the second word
437 addi r6,r6,8 ; Point to the next
438 stw r7,0(r4) ; Save the first word
439 stw r8,4(r4) ; Save the second word
440 addi r4,r4,8 ; Bump sink
441
442 ; Sink is quadword aligned here
443
444 alquad: bf 27,alline ; No quad to do...
445 lwz r7,0(r6) ; Get the first word
446 lwz r8,4(r6) ; Get the second word
447 lwz r9,8(r6) ; Get the third word
448 stw r7,0(r4) ; Save the first word
449 lwz r11,12(r6) ; Get the fourth word
450 addi r6,r6,16 ; Point to the next
451 stw r8,4(r4) ; Save the second word
452 stw r9,8(r4) ; Save the third word
453 stw r11,12(r4) ; Save the fourth word
454 addi r4,r4,16 ; Bump sink
455
456 ; Sink is line aligned here
457
458 alline: rlwinm. r0,r5,27,5,31 ; Get the number of full lines to move
459 mtcrf 0x02,r5 ; move length to cr6 and cr7 one cr at a time...
460 mtcrf 0x01,r5 ; ...since moving more than one is slower on G4 and G5
461 beq- backend ; No full lines to move
462
463 mtctr r0 ; set up loop count
464 li r0,96 ; Stride for touch ahead
465 b nxtline
466
467 .align 4
468 nxtline:
469 lwz r2,0(r6) ; Get the first word
470 lwz r5,4(r6) ; Get the second word
471 lwz r7,8(r6) ; Get the third word
472 lwz r8,12(r6) ; Get the fourth word
473 lwz r9,16(r6) ; Get the fifth word
474 lwz r10,20(r6) ; Get the sixth word
475 lwz r11,24(r6) ; Get the seventh word
476 lwz r12,28(r6) ; Get the eighth word
477 bt- noncache,skipz ; Skip if we are not cached...
478 dcbz 0,r4 ; Blow away the whole line because we are replacing it
479 dcbt r6,r0 ; Touch ahead a bit
480 skipz:
481 addi r6,r6,32 ; Point to the next
482 stw r2,0(r4) ; Save the first word
483 stw r5,4(r4) ; Save the second word
484 stw r7,8(r4) ; Save the third word
485 stw r8,12(r4) ; Save the fourth word
486 stw r9,16(r4) ; Save the fifth word
487 stw r10,20(r4) ; Save the sixth word
488 stw r11,24(r4) ; Save the seventh word
489 stw r12,28(r4) ; Save the eighth word
490 addi r4,r4,32 ; Bump sink
491 bdnz+ nxtline ; Do the next line, if any...
492
493
494 ; Move backend quadword
495
496 backend: ; Join here from "shortcopy" for forward moves <32 bytes
497 bf 27,noquad ; No quad to do...
498 lwz r7,0(r6) ; Get the first word
499 lwz r8,4(r6) ; Get the second word
500 lwz r9,8(r6) ; Get the third word
501 lwz r11,12(r6) ; Get the fourth word
502 stw r7,0(r4) ; Save the first word
503 addi r6,r6,16 ; Point to the next
504 stw r8,4(r4) ; Save the second word
505 stw r9,8(r4) ; Save the third word
506 stw r11,12(r4) ; Save the fourth word
507 addi r4,r4,16 ; Bump sink
508
509 ; Move backend double
510
511 noquad: bf 28,nodouble ; No double to do...
512 lwz r7,0(r6) ; Get the first word
513 lwz r8,4(r6) ; Get the second word
514 addi r6,r6,8 ; Point to the next
515 stw r7,0(r4) ; Save the first word
516 stw r8,4(r4) ; Save the second word
517 addi r4,r4,8 ; Bump sink
518
519 ; Move backend word
520
521 nodouble: bf 29,noword ; No word to do...
522 lwz r7,0(r6) ; Get the word
523 addi r6,r6,4 ; Point to the next
524 stw r7,0(r4) ; Save the word
525 addi r4,r4,4 ; Bump sink
526
527 ; Move backend halfword
528
529 noword: bf 30,nohalf ; No halfword to do...
530 lhz r7,0(r6) ; Get the halfword
531 addi r6,r6,2 ; Point to the next
532 sth r7,0(r4) ; Save the halfword
533 addi r4,r4,2 ; Bump sink
534
535 ; Move backend byte
536
537 nohalf: bflr 31 ; Leave cuz we are all done...
538 lbz r7,0(r6) ; Get the byte
539 stb r7,0(r4) ; Save the single
540 blr
541
542
543 ; Reverse moves on 32-bit machines, also reverse word aligned uncached moves on 64-bit machines.
544 ; NOTE: we never do an unaligned access if the source and destination are "relatively"
545 ; word aligned. We depend on this in the uncached case on 64-bit processors.
546 ; These are slower because we don't bother with dcbz. Fortunately, reverse moves are uncommon.
547 ; r4 = destination
548 ; r5 = length (>0)
549 ; r6 = source
550 ; r8 = inverse of largest mask smaller than operand length
551 ; cr5 = noncache flag (but we don't dcbz anyway)
552
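;
; Shape of the reverse path, as a hedged C sketch (illustrative only; the real code
; below also aligns the sink and moves whole cache lines):
;
;   #include <stddef.h>
;   #include <stdint.h>
;
;   static void copy_reverse_bytes(uint8_t *dst, const uint8_t *src, size_t n) {
;       dst += n;                          /* point one past the last byte, as below */
;       src += n;
;       while (n--)
;           *--dst = *--src;               /* descend so overlapping tails are safe  */
;   }
;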
553 reverse32bit: ; here from 64-bit code with word aligned uncached operands
554 add r4,r5,r4 ; Point past the last sink byte
555 add r6,r5,r6 ; Point past the last source byte
556 rlwinm r7,r4,0,0x1F ; Calculate the length to align dest on cache boundary
557 li r12,-1 ; Make sure we touch in the actual line
558 andc. r0,r7,r8 ; Apply movement limit
559 dcbt r12,r6 ; Touch in the last line of source
560 mtcrf 0x01,r0 ; move length to cr6 and cr7 one cr at a time...
561 dcbtst r12,r4 ; Touch in the last line of the sink
562 mtcrf 0x02,r0 ; ...since moving more than one is slower on G4 and G5
563 beq- balline ; Already on cache line boundary (or too short to bother)
564
565 sub r5,r5,r0 ; Precalculate move length left after alignment
566
567 bf 31,balhalf ; No single byte to do...
568 lbz r7,-1(r6) ; Get the byte
569 subi r6,r6,1 ; Point to the next
570 stb r7,-1(r4) ; Save the single
571 subi r4,r4,1 ; Bump sink
572
573 ; Sink is halfword aligned here
574
575 balhalf: bf 30,balword ; No halfword to do...
576 lhz r7,-2(r6) ; Get the halfword
577 subi r6,r6,2 ; Point to the next
578 sth r7,-2(r4) ; Save the halfword
579 subi r4,r4,2 ; Bump sink
580
581 ; Sink is word aligned here
582
583 balword: bf 29,baldouble ; No word to do...
584 lwz r7,-4(r6) ; Get the word
585 subi r6,r6,4 ; Point to the next
586 stw r7,-4(r4) ; Save the word
587 subi r4,r4,4 ; Bump sink
588
589 ; Sink is double aligned here
590
591 baldouble: bf 28,balquad ; No double to do...
592 lwz r7,-8(r6) ; Get the first word
593 lwz r8,-4(r6) ; Get the second word
594 subi r6,r6,8 ; Point to the next
595 stw r7,-8(r4) ; Save the first word
596 stw r8,-4(r4) ; Save the second word
597 subi r4,r4,8 ; Bump sink
598
599 ; Sink is quadword aligned here
600
601 balquad: bf 27,balline ; No quad to do...
602 lwz r7,-16(r6) ; Get the first word
603 lwz r8,-12(r6) ; Get the second word
604 lwz r9,-8(r6) ; Get the third word
605 lwz r11,-4(r6) ; Get the fourth word
606 stw r7,-16(r4) ; Save the first word
607 subi r6,r6,16 ; Point to the next
608 stw r8,-12(r4) ; Save the second word
609 stw r9,-8(r4) ; Save the third word
610 stw r11,-4(r4) ; Save the fourth word
611 subi r4,r4,16 ; Bump sink
612
613 ; Sink is line aligned here
614
615 balline: rlwinm. r0,r5,27,5,31 ; Get the number of full lines to move
616 mtcrf 0x02,r5 ; move length to cr6 and cr7 one cr at a time...
617 mtcrf 0x01,r5 ; ...since moving more than one is slower on G4 and G5
618 beq- bbackend ; No full lines to move
619 mtctr r0 ; set up loop count
620 b bnxtline
621
622 .align 4
623 bnxtline:
624 lwz r7,-32(r6) ; Get the first word
625 lwz r5,-28(r6) ; Get the second word
626 lwz r2,-24(r6) ; Get the third word
627 lwz r12,-20(r6) ; Get the fourth word
628 lwz r11,-16(r6) ; Get the fifth word
629 lwz r10,-12(r6) ; Get the sixth word
630 lwz r9,-8(r6) ; Get the seventh word
631 lwz r8,-4(r6) ; Get the eighth word
632 subi r6,r6,32 ; Point to the next
633
634 stw r7,-32(r4) ; Save the first word
635 stw r5,-28(r4) ; Save the second word
636 stw r2,-24(r4) ; Save the third word
637 stw r12,-20(r4) ; Save the fourth word
638 stw r11,-16(r4) ; Save the fifth word
639 stw r10,-12(r4) ; Save the sixth word
640 stw r9,-8(r4) ; Save the seventh word
641 stw r8,-4(r4) ; Save the eighth word
642 subi r4,r4,32 ; Bump sink
643
644 bdnz+ bnxtline ; Do the next line, if any...
645
646 ;
647 ; Note: We touched these lines in at the beginning
648 ;
649
650 ; Move backend quadword
651
652 bbackend: ; Join here from "shortcopy" for reverse moves of <32 bytes
653 bf 27,bnoquad ; No quad to do...
654 lwz r7,-16(r6) ; Get the first word
655 lwz r8,-12(r6) ; Get the second word
656 lwz r9,-8(r6) ; Get the third word
657 lwz r11,-4(r6) ; Get the fourth word
658 stw r7,-16(r4) ; Save the first word
659 subi r6,r6,16 ; Point to the next
660 stw r8,-12(r4) ; Save the second word
661 stw r9,-8(r4) ; Save the third word
662 stw r11,-4(r4) ; Save the fourth word
663 subi r4,r4,16 ; Bump sink
664
665 ; Move backend double
666
667 bnoquad: bf 28,bnodouble ; No double to do...
668 lwz r7,-8(r6) ; Get the first word
669 lwz r8,-4(r6) ; Get the second word
670 subi r6,r6,8 ; Point to the next
671 stw r7,-8(r4) ; Save the first word
672 stw r8,-4(r4) ; Save the second word
673 subi r4,r4,8 ; Bump sink
674
675 ; Move backend word
676
677 bnodouble: bf 29,bnoword ; No word to do...
678 lwz r7,-4(r6) ; Get the word
679 subi r6,r6,4 ; Point to the next
680 stw r7,-4(r4) ; Save the word
681 subi r4,r4,4 ; Bump sink
682
683 ; Move backend halfword
684
685 bnoword: bf 30,bnohalf ; No halfword to do...
686 lhz r7,-2(r6) ; Get the halfword
687 subi r6,r6,2 ; Point to the next
688 sth r7,-2(r4) ; Save the halfword
689 subi r4,r4,2 ; Bump sink
690
691 ; Move backend byte
692
693 bnohalf: bflr 31 ; Leave cuz we are all done...
694 lbz r7,-1(r6) ; Get the byte
695 stb r7,-1(r4) ; Save the single
696 blr
697
698
699 // Here on 64-bit processors, which have a 128-byte cache line. This can be
700 // called either in 32 or 64-bit mode, which makes the test for reverse moves
701 // a little tricky. We've already filtered out the (source==dest) and (len==0)
702 // special cases.
703 //
704 // When entered:
705 // r4 = destination (32 or 64-bit ptr)
706 // r5 = length (always 32 bits)
707 // r6 = source (32 or 64-bit ptr)
708 // r12 = (dest - source), reverse move required if (dest-source)<length
709 // cr5 = noncache flag
710
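//
// The mode-independent direction test, as a hedged C sketch (illustrative only):
//
//   #include <stdint.h>
//
//   // r12 = dest - source (full register width), r7 = the 32-bit length.  subc/addze
//   // below recover the carry bit: carry is set iff (dest - source) >= length as an
//   // unsigned compare, i.e. iff a forward move is safe.  In C terms:
//   static int forward_is_safe(uint64_t dest, uint64_t source, uint32_t length) {
//       return (dest - source) >= (uint64_t)length;
//   }
//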
711 .align 5
712 copyit64:
713 rlwinm r7,r5,0,0,31 // truncate length to 32-bit, in case we're running in 64-bit mode
714 cntlzw r11,r5 // get magnitude of length
715 dcbt 0,r6 // touch in 1st block of source
716 dcbtst 0,r4 // touch in 1st destination cache block
717 subc r7,r12,r7 // set Carry if (dest-source)>=length, in mode-independent way
718 li r0,0 // get a 0
719 lis r10,hi16(0x80000000)// get 0x80000000
720 addze. r0,r0 // set cr0 on carry bit (beq if reverse move required)
721 neg r9,r4 // start to get alignment for destination
722 sraw r8,r10,r11 // get mask based on operand length, to limit alignment
723 bt-- noncache,c64uncached// skip if uncached
724 beq-- c64rdouble // handle cached reverse moves
725
726
727 // Forward, cached or doubleword aligned uncached. This is the common case.
728 // NOTE: we never do an unaligned access if the source and destination are "relatively"
729 // doubleword aligned. We depend on this in the uncached case.
730 // r4 = destination
731 // r5 = length (>0)
732 // r6 = source
733 // r8 = inverse of largest mask smaller than operand length
734 // r9 = neg(dest), used to compute alignment
735 // cr5 = noncache flag
736
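//
// Overall shape of this path, as a hedged C sketch (illustrative only; the real code
// below also limits the front-end moves by operand length, software-pipelines the
// loads and stores, and uses dcbz128 when it is safe to do so):
//
//   #include <stdint.h>
//   #include <stddef.h>
//
//   static void copy_forward64_sketch(uint8_t *dst, const uint8_t *src, size_t n) {
//       while (n && ((uintptr_t)dst & 7))  { *dst++ = *src++; n--; }     // byte-align dest
//       while (n >= 8 && ((uintptr_t)dst & 127)) {                       // dword-align to 128
//           *(uint64_t *)dst = *(const uint64_t *)src;
//           dst += 8; src += 8; n -= 8;
//       }
//       while (n >= 128) {                                               // 128-byte chunks
//           for (int i = 0; i < 16; i++)
//               ((uint64_t *)dst)[i] = ((const uint64_t *)src)[i];
//           dst += 128; src += 128; n -= 128;
//       }
//       while (n >= 8) {                                                 // leftover doublewords
//           *(uint64_t *)dst = *(const uint64_t *)src;
//           dst += 8; src += 8; n -= 8;
//       }
//       while (n--) { *dst++ = *src++; }                                 // leftover bytes
//   }
//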
737 c64double:
738 rlwinm r7,r9,0,0x7F // get #bytes to 128-byte align destination
739 andc r7,r7,r8 // limit by operand length
740 andi. r8,r7,7 // r8 <- #bytes to doubleword align
741 srwi r9,r7,3 // r9 <- #doublewords to 128-byte align
742 sub r5,r5,r7 // adjust length remaining
743 cmpwi cr1,r9,0 // any doublewords to move to cache align?
744 srwi r10,r5,7 // r10 <- 128-byte chunks to xfer after aligning dest
745 cmpwi cr7,r10,0 // set cr7 on chunk count
746 beq c64double2 // dest already doubleword aligned
747 mtctr r8
748 b c64double1
749
750 .align 5 // align inner loops
751 c64double1: // copy bytes until dest is doubleword aligned
752 lbz r0,0(r6)
753 addi r6,r6,1
754 stb r0,0(r4)
755 addi r4,r4,1
756 bdnz c64double1
757
758 c64double2: // r9/cr1=doublewords, r10/cr7=128-byte chunks
759 beq cr1,c64double4 // no doublewords to xfer in order to cache align
760 mtctr r9
761 b c64double3
762
763 .align 5 // align inner loops
764 c64double3: // copy doublewords until dest is 128-byte aligned
765 ld r7,0(r6)
766 addi r6,r6,8
767 std r7,0(r4)
768 addi r4,r4,8
769 bdnz c64double3
770
771 // Here to xfer 128-byte chunks, if any. Since we only have 8 GPRs for
772 // data (64 bytes), we load/store each twice per 128-byte chunk.
773
774 c64double4: // r10/cr7=128-byte chunks
775 rlwinm r0,r5,29,28,31 // r0 <- count of leftover doublewords, after moving chunks
776 cmpwi cr1,r0,0 // set cr1 on leftover doublewords
777 beq cr7,c64double7 // no 128-byte chunks
778
779 ; We must check for (source-dest)<128 in a mode-independent way. If within 128 bytes,
780 ; turn on "noncache" because we cannot use dcbz128 even if operands are cacheable.
781
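;
; As a hedged C sketch of that test (illustrative only): dcbz128 zeroes the whole
; 128-byte destination line, so if the source starts inside that line its data
; would be destroyed before it is loaded.
;
;   static int dcbz_would_clobber_source(uint64_t dest, uint64_t source) {
;       return (uint64_t)(source - dest) < 128;   /* source lies in the line we zero */
;   }
;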
782 sub r8,r6,r4 // r8 <- (source - dest)
783 rldicr. r0,r8,0,63-7 // zero low 7 bits and check for 0, mode independent
784 cror noncache,cr0_eq,noncache // turn on "noncache" flag if (source-dest)<128
785 mtctr r10
786 b c64InnerLoop
787
788 .align 5 // align inner loop
789 c64InnerLoop: // loop copying 128-byte cache lines to 128-aligned destination
790 ld r0,0(r6) // start pipe: load 1st half-line
791 ld r2,8(r6)
792 ld r7,16(r6)
793 ld r8,24(r6)
794 ld r9,32(r6)
795 ld r10,40(r6)
796 ld r11,48(r6)
797 ld r12,56(r6)
798 bt noncache,c64InnerLoop1 // skip if uncached or overlap
799 dcbz128 0,r4 // avoid prefetch of next cache line
800 c64InnerLoop1:
801
802 std r0,0(r4)
803 std r2,8(r4)
804 std r7,16(r4)
805 std r8,24(r4)
806 std r9,32(r4)
807 std r10,40(r4)
808 std r11,48(r4)
809 std r12,56(r4)
810
811 ld r0,64(r6) // load 2nd half of chunk
812 ld r2,72(r6)
813 ld r7,80(r6)
814 ld r8,88(r6)
815 ld r9,96(r6)
816 ld r10,104(r6)
817 ld r11,112(r6)
818 ld r12,120(r6)
819 addi r6,r6,128
820
821 std r0,64(r4)
822 std r2,72(r4)
823 std r7,80(r4)
824 std r8,88(r4)
825 std r9,96(r4)
826 std r10,104(r4)
827 std r11,112(r4)
828 std r12,120(r4)
829 addi r4,r4,128 // advance to next dest chunk
830
831 bdnz c64InnerLoop // loop if more chunks
832
833
834 c64double7: // r5 <- leftover bytes, cr1 set on doubleword count
835 rlwinm r0,r5,29,28,31 // r0 <- count of leftover doublewords (0-15)
836 andi. r5,r5,7 // r5/cr0 <- count of leftover bytes (0-7)
837 beq cr1,c64byte // no leftover doublewords
838 mtctr r0
839 b c64double8
840
841 .align 5 // align inner loop
842 c64double8: // loop copying leftover doublewords
843 ld r0,0(r6)
844 addi r6,r6,8
845 std r0,0(r4)
846 addi r4,r4,8
847 bdnz c64double8
848
849
850 // Forward byte loop.
851
852 c64byte: // r5/cr0 <- byte count (can be big if unaligned uncached)
853 beqlr // done if no leftover bytes
854 mtctr r5
855 b c64byte1
856
857 .align 5 // align inner loop
858 c64byte1:
859 lbz r0,0(r6)
860 addi r6,r6,1
861 stb r0,0(r4)
862 addi r4,r4,1
863 bdnz c64byte1
864
865 blr
866
867
868 // Uncached copies. We must avoid unaligned accesses, since they always take alignment
869 // exceptions on uncached memory on 64-bit processors. This may mean we copy long operands
870 // a byte at a time, but that is still much faster than alignment exceptions.
871 // r4 = destination
872 // r5 = length (>0)
873 // r6 = source
874 // r8 = inverse of largest mask smaller than operand length
875 // r9 = neg(dest), used to compute alignment
876 // r12 = (dest-source), used to test relative alignment
877 // cr0 = beq if reverse move required
878 // cr5 = noncache flag
879
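//
// The relative-alignment dispatch, as a hedged C sketch (illustrative only):
//
//   #include <stdint.h>
//
//   // delta = dest - source.  Only *relative* alignment matters: once the front end
//   // aligns the destination, the source has the same low bits, so if the low 3 (or 2)
//   // bits of delta are zero the copy can use doubleword (or word) loads and stores
//   // without ever making an unaligned access.
//   static int relatively_dword_aligned(uint64_t delta) { return (delta & 7) == 0; }
//   static int relatively_word_aligned (uint64_t delta) { return (delta & 3) == 0; }
//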
880 c64uncached:
881 rlwinm r10,r12,0,29,31 // relatively doubleword aligned?
882 rlwinm r11,r12,0,30,31 // relatively word aligned?
883 cmpwi cr7,r10,0 // set cr7 beq if doubleword aligned
884 cmpwi cr1,r11,0 // set cr1 beq if word aligned
885 beq-- c64reverseUncached
886
887 beq cr7,c64double // doubleword aligned
888 beq cr1,forward32bit // word aligned, use G3/G4 code
889 cmpwi r5,0 // set cr0 on byte count
890 b c64byte // unaligned operands
891
892 c64reverseUncached:
893 beq cr7,c64rdouble // doubleword aligned so can use LD/STD
894 beq cr1,reverse32bit // word aligned, use G3/G4 code
895 add r6,r6,r5 // point to (end+1) of source and dest
896 add r4,r4,r5
897 cmpwi r5,0 // set cr0 on length
898 b c64rbyte // copy a byte at a time
899
900
901
902 // Reverse doubleword copies. This is used for all cached reverse copies, and for
903 // doubleword-aligned uncached reverse copies.
904 // r4 = destination
905 // r5 = length (>0)
906 // r6 = source
907 // r8 = inverse of largest mask of low-order 1s smaller than operand length
908 // cr5 = noncache flag
909
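//
// Shape of this path, as a hedged C sketch (illustrative only; the real code below
// additionally unrolls the doubleword loop into 64-byte chunks):
//
//   #include <stdint.h>
//   #include <stddef.h>
//
//   static void copy_reverse64_sketch(uint8_t *dst, const uint8_t *src, size_t n) {
//       dst += n; src += n;                                           // point one past the end
//       while (n && ((uintptr_t)dst & 7)) { *--dst = *--src; n--; }   // dword-align dest
//       while (n >= 8) {                                              // descend by doublewords
//           dst -= 8; src -= 8; n -= 8;
//           *(uint64_t *)dst = *(const uint64_t *)src;
//       }
//       while (n--) *--dst = *--src;                                  // leftover bytes
//   }
//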
910 c64rdouble:
911 add r6,r6,r5 // point to (end+1) of source and dest
912 add r4,r4,r5
913 rlwinm r7,r4,0,29,31 // r7 <- #bytes to doubleword align dest
914 andc. r7,r7,r8 // limit by operand length
915 sub r5,r5,r7 // adjust length
916 srwi r8,r5,6 // r8 <- 64-byte chunks to xfer
917 cmpwi cr1,r8,0 // any chunks?
918 beq c64rd2 // dest already doubleword aligned
919 mtctr r7
920
921 c64rd1: // copy bytes until dest is doubleword aligned
922 lbzu r0,-1(r6)
923 stbu r0,-1(r4)
924 bdnz c64rd1
925
926 c64rd2: // r8/cr1 <- count of 64-byte chunks
927 rlwinm r0,r5,29,29,31 // r0 <- count of leftover doublewords
928 andi. r5,r5,7 // r5/cr0 <- count of leftover bytes
929 cmpwi cr7,r0,0 // leftover doublewords?
930 beq cr1,c64rd4 // no chunks to xfer
931 mtctr r8
932 b c64rd3
933
934 .align 5 // align inner loop
935 c64rd3: // loop copying 64-byte chunks
936 ld r7,-8(r6)
937 ld r8,-16(r6)
938 ld r9,-24(r6)
939 ld r10,-32(r6)
940 ld r11,-40(r6)
941 ld r12,-48(r6)
942 std r7,-8(r4)
943 std r8,-16(r4)
944 ld r7,-56(r6)
945 ldu r8,-64(r6)
946 std r9,-24(r4)
947 std r10,-32(r4)
948 std r11,-40(r4)
949 std r12,-48(r4)
950 std r7,-56(r4)
951 stdu r8,-64(r4)
952 bdnz c64rd3
953
954 c64rd4: // r0/cr7 = leftover doublewords r5/cr0 = leftover bytes
955 beq cr7,c64rbyte // no leftover doublewords
956 mtctr r0
957
958 c64rd5: // loop copying leftover doublewords
959 ldu r0,-8(r6)
960 stdu r0,-8(r4)
961 bdnz c64rd5
962
963
964 // Reverse byte loop.
965
966 c64rbyte: // r5/cr0 <- byte count (can be big if unaligned uncached)
967 beqlr // done if no leftover bytes
968 mtctr r5
969
970 c64rbyte1:
971 lbzu r0,-1(r6)
972 stbu r0,-1(r4)
973 bdnz c64rbyte1
974
975 blr
976