1 /*
2 * Copyright (c) 2002-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the
10 * License may not be used to create, or enable the creation or
11 * redistribution of, unlawful or unlicensed copies of an Apple operating
12 * system, or to circumvent, violate, or enable the circumvention or
13 * violation of, any terms of an Apple operating system software license
14 * agreement.
15 *
16 * Please obtain a copy of the License at
17 * http://www.opensource.apple.com/apsl/ and read it before using this
18 * file.
19 *
20 * The Original Code and all software distributed under the License are
21 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
22 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
23 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
24 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
25 * Please see the License for the specific language governing rights and
26 * limitations under the License.
27 *
28 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
29 */
30 ;
31 ; Copy bytes of data around. Handles overlapped data.
32 ;
33 ;
34 #include <ppc/asm.h>
35 #include <ppc/proc_reg.h>
36 #include <assym.s>
37
38 ; These routines use CR5 for certain flags:
39 ; Use CR5_lt to indicate non-cached (in bcopy and memcpy)
40 #define noncache 20
41
42
43 ; The bcopy_phys variants use a stack frame so they can call bcopy as a subroutine.
44 #define BCOPY_SF_SIZE 32 // total size
45 #define BCOPY_SF_MSR 16 // we save caller's MSR here (possibly minus VEC and FP)
46
47
48 #define kShort 32 // short operands are special cased
49
50
51 ; void bcopy_physvir_32(from, to, nbytes)
52 ;
53 ; Attempt to copy physically addressed memory with translation on if conditions are met.
54 ; Otherwise do a normal bcopy_phys. This routine is used because some 32-bit processors
55 ; are very slow doing real-mode (translation off) copies, so we set up temporary BATs
56 ; for the passed phys addrs and do the copy with translation on.
57 ;
58 ; Rules are: - neither source nor destination can cross a page.
59 ; - Interrupts must be disabled when this routine is called.
60 ; - Translation must be on when called.
61 ;
62 ; To do the copy, we build a 128KB DBAT for both the source and the sink. If both are in the
63 ; same block, only one is loaded. We do not touch the IBATs, so there is no issue if either physical page
64 ; address is the same as the virtual address of the instructions we are executing.
65 ;
66 ; At the end, we invalidate the used DBATs.
67 ;
68 ; Note that the address parameters are long longs. We will transform these to 64-bit
69 ; values. Note that on 32-bit architectures this will ignore the high half of the
70 ; passed-in value. This should be OK, since addresses wider than 32 bits cannot occur
71 ; there anyhow.
72 ;
73 ; Note also that this routine is used only on 32-bit machines. If you're contemplating use
74 ; on a 64-bit processor, use the physical memory window instead; please refer to copypv()
75 ; for an example of how this is done.
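;
; (Added note, a rough C-level sketch rather than part of the original description: the 32-bit
; ABI passes each 64-bit paddr as a hi/lo register pair, and the rlwinm/rlwimi pairs below
; simply reassemble each pair into a single register, roughly
;
;       paddr = ((uint64_t)hi << 32) | lo;
;
; On a 32-bit processor the same instructions just keep the low word, which is why the high
; half is ignored there.)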
76
77 .align 5
78 .globl EXT(bcopy_physvir_32)
79
80 LEXT(bcopy_physvir_32)
81 mflr r0 ; get return address
82 rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg
83 mfsprg r8,2 ; get processor feature flags
84 stw r0,8(r1) ; save return address
85 rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits
86 stwu r1,-BCOPY_SF_SIZE(r1) ; push on a stack frame so we can call bcopy
87 mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test
88 subi r0,r7,1 ; get length - 1
89 rlwinm r4,r5,0,1,0 ; Duplicate high half of long long paddr into top of reg
90 add r11,r3,r0 ; Point to last byte of sink
91 mr r5,r7 ; Get the length into the right register
92 rlwimi r4,r6,0,0,31 ; Combine bottom of long long to full 64-bits
93
94 ; This test for page overflow may not work if the length is negative. Negative lengths are invalid input
95 ; to bcopy_physvir_32() on 32-bit machines, and will result in a panic.
96
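; (Added worked example: with 4KB pages, XORing the first and last byte addresses leaves a
; nonzero bit above the 12-bit page offset exactly when an operand crosses a page; the
; rlwinm. below masks off the offset bits -- keeping bits 0-19 -- to test just that.)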
97 add r12,r4,r0 ; Point to last byte of source
98 xor r7,r11,r3 ; See if we went to next page
99 xor r8,r12,r4 ; See if we went to next page
100 or r0,r7,r8 ; Combine wrap
101
102 // li r9,((PTE_WIMG_CB_CACHED_COHERENT<<3)|2) ; Set default attributes
103 li r9,((2<<3)|2) ; Set default attributes
104 rlwinm. r0,r0,0,0,19 ; Did we overflow a page?
105 li r7,2 ; Set validity flags
106 li r8,2 ; Set validity flags
107 bne- bcopy_phys1 ; Overflowed page, do normal physical copy...
108
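; (Added note on the DBAT values built below; the field layout is summarized from the standard
; 32-bit BAT format, not from this file: the lower DBAT gets the 128KB-aligned physical block
; number plus (2<<3)|2, i.e. WIMG=0b0010 (coherent, cacheable) and PP=0b10 (read/write); the
; upper DBAT gets the same block number as the effective block index plus BL=0 (128KB block)
; and Vs=1. The net effect is an identity mapping of each 128KB block onto itself, so bcopy
; can run with translation on using the physical addresses directly.)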
109 rlwimi r11,r9,0,15,31 ; Set sink lower DBAT value
110 rlwimi r12,r9,0,15,31 ; Set source lower DBAT value
111 rlwimi r7,r11,0,0,14 ; Set sink upper DBAT value
112 rlwimi r8,r12,0,0,14 ; Set source upper DBAT value
113 cmplw cr1,r11,r12 ; See if sink and source are same block
114
115 sync
116
117 mtdbatl 0,r11 ; Set sink lower DBAT
118 mtdbatu 0,r7 ; Set sink upper DBAT
119
120 beq- cr1,bcpvsame ; Source and sink are in same block
121
122 mtdbatl 1,r12 ; Set source lower DBAT
123 mtdbatu 1,r8 ; Set source upper DBAT
124
125 bcpvsame:
126 sync ; wait for the BATs to stabilize
127 isync
128
129 bl EXT(bcopy) ; BATs set up, args in r3-r5, so do the copy with DR on
130
131 li r0,0 ; Get set to invalidate upper half of BATs
132 sync ; Make sure all is well
133 mtdbatu 0,r0 ; Clear sink upper DBAT
134 mtdbatu 1,r0 ; Clear source upper DBAT
135 sync
136 isync
137
138 lwz r0,BCOPY_SF_SIZE+8(r1) ; get return address
139 addi r1,r1,BCOPY_SF_SIZE ; pop off stack frame
140 mtlr r0
141 blr
142
143
144 ; void bcopy_phys(from, to, nbytes)
145 ;
146 ; Turns off data translation before the copy. This one will not work in user state.
147 ; This routine is used on 32 and 64-bit machines.
148 ;
149 ; Note that the address parameters are long longs. We will transform these to 64-bit
150 ; values. Note that on 32-bit architectures this will ignore the high half of the
151 ; passed-in value. This should be OK, since addresses wider than 32 bits cannot occur
152 ; there anyhow.
153 ;
154 ; Also note that you probably will not be happy if either the sink or source spans across the
155 ; boundary between RAM and I/O space. Good chance of hanging the machine and this code
156 ; will not check, so be careful.
157 ;
158 ; NOTE: when called, translation must be on, and we must be in 32-bit mode.
159 ; Interrupts may or may not be disabled.
160
161 .align 5
162 .globl EXT(bcopy_phys)
163
164 LEXT(bcopy_phys)
165 mflr r0 ; get return address
166 rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg
167 stw r0,8(r1) ; save
168 mfsprg r8,2 ; get processor feature flags
169 stwu r1,-BCOPY_SF_SIZE(r1) ; push on a stack frame so we can call bcopy
170 rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits
171 rlwinm r4,r5,0,1,0 ; Duplicate high half of long long paddr into top of reg
172 mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test
173 rlwimi r4,r6,0,0,31 ; Combine bottom of long long to full 64-bits
174 mr r5,r7 ; Get the length into the right register
175
176 bcopy_phys1: ; enter from bcopy_physvir with pf64Bit in cr6 and parms in r3-r5
177 mfmsr r9 ; Get the MSR
178 lis r6,hi16(MASK(MSR_VEC)) ; Get vector enable
179 ori r6,r6,lo16(MASK(MSR_FP)|MASK(MSR_DR)) ; Add in FP and DR
180 andc r9,r9,r6 ; unconditionally turn DR, VEC, and FP off
181 bt++ pf64Bitb,bcopy_phys64 ; skip if 64-bit (only they take hint)
182
183 ; 32-bit CPUs
184
185 mtmsr r9 ; turn DR, FP, and VEC off
186 isync ; Wait for it
187
188 bl EXT(bcopy) ; do the copy with translation off and caching on
189
190 mfmsr r9 ; Get the MSR
191 ori r9,r9,lo16(MASK(MSR_DR)) ; turn translation back on (but leave VEC and FP off)
192 mtmsr r9 ; restore msr
193 isync ; wait for it to happen
194 lwz r0,BCOPY_SF_SIZE+8(r1) ; get return address once translation is back on
195 mtlr r0
196 addi r1,r1,BCOPY_SF_SIZE ; pop off stack frame
197 blr
198
199
200 ; 64-bit: turn DR off and SF on.
201
202 bcopy_phys64: ; r9 = MSR with DR, VEC, and FP off
203 ori r8,r9,lo16(MASK(MSR_DR)) ; make a copy with DR back on... this is what we return to caller
204 srdi r2,r3,31 ; Get a 1 if source is in I/O memory
205 li r0,1 ; Note - we use this in a couple places below
206 srdi r10,r4,31 ; Get a 1 if sink is in I/O memory
207 std r8,BCOPY_SF_MSR(r1) ; save caller's MSR so we remember whether EE was on
208 rldimi r9,r0,63,MSR_SF_BIT ; set SF on in MSR we will copy with
209 cmpldi cr0,r2,1 ; Is source in I/O memory?
210 cmpldi cr7,r10,1 ; Is sink in I/O memory?
211 mtmsrd r9 ; turn 64-bit addressing on, data translation off
212 isync ; wait for it to happen
213 cror cr7_eq,cr0_eq,cr7_eq ; See if either source or sink is in I/O area
214 beq-- cr7,io_space_real_mode_copy ; an operand is in I/O space
215
216 bl EXT(bcopy) ; do copy with DR off and SF on, cache enabled
217
218 bcopy_phys64x:
219 mfmsr r9 ; Get the MSR we used to copy
220 rldicl r9,r9,0,MSR_SF_BIT+1 ; clear SF
221 ori r9,r9,lo16(MASK(MSR_DR)) ; turn translation back on
222 mtmsrd r9 ; turn 64-bit mode off, translation back on
223 isync ; wait for it to happen
224 lwz r0,BCOPY_SF_SIZE+8(r1) ; get return address once translation is back on
225 ld r8,BCOPY_SF_MSR(r1) ; get caller's MSR once translation is back on
226 mtlr r0
227 mtmsrd r8,1 ; turn EE back on if necessary
228 addi r1,r1,BCOPY_SF_SIZE ; pop off stack frame
229 blr
230
231 ; We need to copy with DR off, but one of the operands is in I/O space. To avoid wedging U3,
232 ; which cannot handle a cache burst in I/O space, we must turn caching off for the real memory access.
233 ; This can only be done by setting bits in HID4. We cannot lose control and execute random code in
234 ; this state, so we have to disable interrupts as well. This is an unpleasant hack.
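; (Added note, an assumption based on the 970 documentation rather than on this file: the bit
; set below, 1 << (32+8), is presumably the 970's real-mode cache-inhibit control in HID4,
; which forces real-mode data accesses to be treated as cache-inhibited.)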
235
236 io_space_real_mode_copy: ; r0=1, r9=MSR we want to copy with
237 sldi r11,r0,31-MSR_EE_BIT ; Get a mask for the EE bit
238 sldi r0,r0,32+8 ; Get the right bit to turn off caching
239 andc r9,r9,r11 ; Turn off EE bit
240 mfspr r2,hid4 ; Get HID4
241 mtmsrd r9,1 ; Force off EE
242 or r2,r2,r0 ; Set bit to make real accesses cache-inhibited
243 sync ; Sync up
244 mtspr hid4,r2 ; Make real accesses cache-inhibited
245 isync ; Toss prefetches
246
247 lis r12,0xE000 ; Get the unlikeliest ESID possible
248 srdi r12,r12,1 ; Make 0x7FFFFFFFF0000000
249 slbie r12 ; Make sure the ERAT is cleared
250
251 sync
252 isync
253
254 bl EXT(bcopy_nc) ; copy with SF on and EE, DR, VEC, and FP off, cache inhibited
255
256 li r0,1 ; Get a 1
257 sldi r0,r0,32+8 ; Get the right bit to turn off caching
258 mfspr r2,hid4 ; Get HID4
259 andc r2,r2,r0 ; Clear bit to make real accesses cache-inhibited
260 sync ; Sync up
261 mtspr hid4,r2 ; Make real accesses not cache-inhibited
262 isync ; Toss prefetches
263
264 lis r12,0xE000 ; Get the unlikeliest ESID possible
265 srdi r12,r12,1 ; Make 0x7FFFFFFFF0000000
266 slbie r12 ; Make sure the ERAT is cleared
267 b bcopy_phys64x
268
269
270 ;
271 ; shortcopy
272 ;
273 ; Special case short operands (<32 bytes), which are very common. Note that the check for
274 ; reverse vs normal moves isn't quite correct in 64-bit mode; in rare cases we will move in
275 ; reverse when it wasn't necessary to do so. This is OK, since performance of the two cases
276 ; is similar. We do get the direction right when it counts (i.e., when the operands overlap).
277 ; Also note that we use the G3/G4 "backend" code, even on G5. This is OK too, since G5 has
278 ; plenty of load/store dispatch bandwidth in this case, the extra ops are hidden by latency,
279 ; and using word instead of doubleword moves reduces the possibility of unaligned accesses,
280 ; which cost about 20 cycles if they cross a 32-byte boundary on G5. Finally, because we
281 ; might do unaligned accesses this code cannot be called from bcopy_nc().
282 ; r4 = destination
283 ; r5 = length (<32)
284 ; r6 = source
285 ; r12 = (dest - source)
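;
; (Added note: the two mtcrf ops below scatter the low five bits of the length into CR6/CR7,
; so the backend code can branch on them directly -- CR bit 27 is the 16s bit of the length,
; 28 the 8s, 29 the 4s, 30 the 2s, and 31 the 1s bit. Two single-field mtcrf ops are used
; because a multi-field mtcrf is slower on G4/G5, as the code comments note.)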
286
287 .align 5
288 shortcopy:
289 cmplw r12,r5 ; must move reverse if (dest-source)<length
290 mtcrf 2,r5 ; move length to cr6 and cr7 one at a time...
291 mtcrf 1,r5 ; ...which is faster on G4 and G5
292 bge++ backend ; handle forward moves (most common case)
293 add r6,r6,r5 ; point one past end of operands in reverse moves
294 add r4,r4,r5
295 b bbackend ; handle reverse moves
296
297 ;
298 ; void bcopy(from, to, nbytes)
299 ;
300 ; NOTE: bcopy is called from copyin and copyout etc with the "thread_recover" ptr set.
301 ; This means bcopy must not set up a stack frame or touch non-volatile registers, and also means that it
302 ; cannot rely on turning off interrupts, because we expect to get DSIs and have execution aborted by a "longjmp"
303 ; to the thread_recover routine. What this means is that it would be hard to use vector or floating point
304 ; registers to accelerate the copy.
305 ;
306 ; NOTE: this code can be called in any of three "modes":
307 ; - on 32-bit processors (32-byte cache line)
308 ; - on 64-bit processors running in 32-bit mode (128-byte cache line)
309 ; - on 64-bit processors running in 64-bit mode (128-byte cache line)
310
311 .align 5
312 .globl EXT(bcopy)
313 .globl EXT(bcopy_nop_if_32bit)
314
315 LEXT(bcopy)
316 cmplwi cr1,r5,kShort ; less than 32 bytes?
317 sub. r12,r4,r3 ; test for to==from in mode-independent way, start fwd/rev check
318 mr r6,r3 ; Set source (must preserve r3 for memcpy return)
319 blt cr1,shortcopy ; special case short operands
320 crclr noncache ; Set cached
321 LEXT(bcopy_nop_if_32bit)
322 bne++ copyit64 ; handle 64-bit processor (patched to NOP if 32-bit processor)
323 bne+ copyit32 ; handle 32-bit processor
324 blr ; to==from so nothing to do
325
326 ;
327 ; bcopy_nc(from, to, nbytes)
328 ;
329 ; bcopy_nc() operates on non-cached memory so we can not use any kind of cache instructions.
330 ; Furthermore, we must avoid all unaligned accesses on 64-bit machines, since they take
331 ; alignment exceptions. Thus we cannot use "shortcopy", which could do unaligned lwz/stw.
332 ; Like bcopy(), bcopy_nc() can be called both in 32- and 64-bit mode.
333
334 .align 5
335 .globl EXT(bcopy_nc)
336 .globl EXT(bcopy_nc_nop_if_32bit)
337
338 LEXT(bcopy_nc)
339 cmpwi cr1,r5,0 ; Check if we have a 0 length
340 sub. r12,r4,r3 ; test for to==from in mode-independent way, start fwd/rev check
341 mr r6,r3 ; Set source (must preserve r3 for memcpy return)
342 crset noncache ; Set non-cached
343 cror cr0_eq,cr1_eq,cr0_eq ; set cr0 beq if either length zero or to==from
344 LEXT(bcopy_nc_nop_if_32bit)
345 bne++ copyit64 ; handle 64-bit processor (patched to NOP if 32-bit processor)
346 bne+ copyit32 ; handle 32-bit processor
347 blr ; either zero length or to==from
348
349 ;
350 ; void* memcpy(to, from, nbytes)
351 ; void* memmove(to, from, nbytes)
352 ;
353 ; memcpy() and memmove() are only called in 32-bit mode, albeit on both 32- and 64-bit processors.
354 ; However, they would work correctly if called in 64-bit mode.
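; (Added note: both entry points share the same code, and the (dest - source) vs. length test
; below selects a reverse copy when the operands overlap, so this memcpy is effectively
; memmove-safe for overlapping operands as well.)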
355
356 .align 5
357 .globl EXT(memcpy)
358 .globl EXT(memmove)
359 .globl EXT(memcpy_nop_if_32bit)
360
361 LEXT(memcpy)
362 LEXT(memmove)
363 cmplwi cr1,r5,kShort ; less than 32 bytes?
364 sub. r12,r3,r4 ; test for to==from in mode-independent way, start fwd/rev check
365 mr r6,r4 ; Set source
366 mr r4,r3 ; Set the "to" (must preserve r3 for return value)
367 blt cr1,shortcopy ; special case short operands
368 crclr noncache ; Set cached
369 LEXT(memcpy_nop_if_32bit)
370 bne++ copyit64 ; handle 64-bit processor (patched to NOP if 32-bit processor)
371 beqlr- ; exit if to==from
372
373
374 ; Here to copy on 32-bit processors.
375 ;
376 ; When we move the memory, forward overlap must be handled. We
377 ; also cannot use the cache instructions if we were entered from bcopy_nc.
378 ; We need to preserve r3 because it is the return value of memcpy.
379 ; We can be interrupted and lose control here.
380 ;
381 ; When entered:
382 ; r4 = destination
383 ; r5 = length (>0)
384 ; r6 = source
385 ; r12 = (dest - source)
386 ; cr5 = noncache flag
387
388 copyit32: ; WARNING! can drop down to this label
389 cmplw cr1,r12,r5 ; must move reverse if (dest-source)<length
390 cntlzw r11,r5 ; get magnitude of length
391 dcbt 0,r6 ; start to touch in source
392 lis r10,hi16(0x80000000) ; get 0x80000000
393 neg r9,r4 ; start to get alignment for destination
394 dcbtst 0,r4 ; start to touch in destination
395 sraw r8,r10,r11 ; get mask based on operand length, to limit alignment
396 blt- cr1,reverse32bit ; reverse move required
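; (Added note on the mask trick above: cntlzw gives the number of leading zeros of the length,
; and the sraw of 0x80000000 by that count yields a mask of every bit at or above the length's
; highest set bit. andc'ing the bytes-to-align count with that mask limits the front-end
; alignment moves so they can never exceed the total length.)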
397
398 ; Forward moves on 32-bit machines, also word aligned uncached ops on 64-bit machines.
399 ; NOTE: we never do an unaligned access if the source and destination are "relatively"
400 ; word aligned. We depend on this in the uncached case on 64-bit processors.
401 ; r4 = destination
402 ; r5 = length (>0)
403 ; r6 = source
404 ; r8 = inverse of largest mask smaller than operand length
405 ; r9 = neg(dest), used to compute alignment
406 ; cr5 = noncache flag
407
408 forward32bit: ; enter from 64-bit CPUs with word aligned uncached operands
409 rlwinm r7,r9,0,0x1F ; get bytes to 32-byte-align destination
410 andc. r0,r7,r8 ; limit to the maximum front end move
411 mtcrf 0x01,r0 ; move length to cr6 and cr7 one cr at a time...
412 beq alline ; Already on a line...
413
414 mtcrf 0x02,r0 ; ...since moving more than one is slower on G4 and G5
415 sub r5,r5,r0 ; Set the length left to move
416
417 bf 31,alhalf ; No single byte to do...
418 lbz r7,0(r6) ; Get the byte
419 addi r6,r6,1 ; Point to the next
420 stb r7,0(r4) ; Save the single
421 addi r4,r4,1 ; Bump sink
422
423 ; Sink is halfword aligned here
424
425 alhalf: bf 30,alword ; No halfword to do...
426 lhz r7,0(r6) ; Get the halfword
427 addi r6,r6,2 ; Point to the next
428 sth r7,0(r4) ; Save the halfword
429 addi r4,r4,2 ; Bump sink
430
431 ; Sink is word aligned here
432
433 alword: bf 29,aldouble ; No word to do...
434 lwz r7,0(r6) ; Get the word
435 addi r6,r6,4 ; Point to the next
436 stw r7,0(r4) ; Save the word
437 addi r4,r4,4 ; Bump sink
438
439 ; Sink is double aligned here
440
441 aldouble: bf 28,alquad ; No double to do...
442 lwz r7,0(r6) ; Get the first word
443 lwz r8,4(r6) ; Get the second word
444 addi r6,r6,8 ; Point to the next
445 stw r7,0(r4) ; Save the first word
446 stw r8,4(r4) ; Save the second word
447 addi r4,r4,8 ; Bump sink
448
449 ; Sink is quadword aligned here
450
451 alquad: bf 27,alline ; No quad to do...
452 lwz r7,0(r6) ; Get the first word
453 lwz r8,4(r6) ; Get the second word
454 lwz r9,8(r6) ; Get the third word
455 stw r7,0(r4) ; Save the first word
456 lwz r11,12(r6) ; Get the fourth word
457 addi r6,r6,16 ; Point to the next
458 stw r8,4(r4) ; Save the second word
459 stw r9,8(r4) ; Save the third word
460 stw r11,12(r4) ; Save the fourth word
461 addi r4,r4,16 ; Bump sink
462
463 ; Sink is line aligned here
464
465 alline: rlwinm. r0,r5,27,5,31 ; Get the number of full lines to move
466 mtcrf 0x02,r5 ; move length to cr6 and cr7 one cr at a time...
467 mtcrf 0x01,r5 ; ...since moving more than one is slower on G4 and G5
468 beq- backend ; No full lines to move
469
470 mtctr r0 ; set up loop count
471 li r0,96 ; Stride for touch ahead
472 b nxtline
473
474 .align 4
475 nxtline:
476 lwz r2,0(r6) ; Get the first word
477 lwz r5,4(r6) ; Get the second word
478 lwz r7,8(r6) ; Get the third word
479 lwz r8,12(r6) ; Get the fourth word
480 lwz r9,16(r6) ; Get the fifth word
481 lwz r10,20(r6) ; Get the sixth word
482 lwz r11,24(r6) ; Get the seventh word
483 lwz r12,28(r6) ; Get the eighth word
484 bt- noncache,skipz ; Skip if we are not cached...
485 dcbz 0,r4 ; Blow away the whole line because we are replacing it
486 dcbt r6,r0 ; Touch ahead a bit
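; (Added note: the dcbz is skipped in the noncached case because, per the architecture, a dcbz
; to cache-inhibited or write-through storage takes an alignment interrupt instead of zeroing
; the line.)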
487 skipz:
488 addi r6,r6,32 ; Point to the next
489 stw r2,0(r4) ; Save the first word
490 stw r5,4(r4) ; Save the second word
491 stw r7,8(r4) ; Save the third word
492 stw r8,12(r4) ; Save the fourth word
493 stw r9,16(r4) ; Save the fifth word
494 stw r10,20(r4) ; Save the sixth word
495 stw r11,24(r4) ; Save the seventh word
496 stw r12,28(r4) ; Save the eighth word
497 addi r4,r4,32 ; Bump sink
498 bdnz+ nxtline ; Do the next line, if any...
499
500
501 ; Move backend quadword
502
503 backend: ; Join here from "shortcopy" for forward moves <32 bytes
504 bf 27,noquad ; No quad to do...
505 lwz r7,0(r6) ; Get the first word
506 lwz r8,4(r6) ; Get the second word
507 lwz r9,8(r6) ; Get the third word
508 lwz r11,12(r6) ; Get the fourth word
509 stw r7,0(r4) ; Save the first word
510 addi r6,r6,16 ; Point to the next
511 stw r8,4(r4) ; Save the second word
512 stw r9,8(r4) ; Save the third word
513 stw r11,12(r4) ; Save the fourth word
514 addi r4,r4,16 ; Bump sink
515
516 ; Move backend double
517
518 noquad: bf 28,nodouble ; No double to do...
519 lwz r7,0(r6) ; Get the first word
520 lwz r8,4(r6) ; Get the second word
521 addi r6,r6,8 ; Point to the next
522 stw r7,0(r4) ; Save the first word
523 stw r8,4(r4) ; Save the second word
524 addi r4,r4,8 ; Bump sink
525
526 ; Move backend word
527
528 nodouble: bf 29,noword ; No word to do...
529 lwz r7,0(r6) ; Get the word
530 addi r6,r6,4 ; Point to the next
531 stw r7,0(r4) ; Save the word
532 addi r4,r4,4 ; Bump sink
533
534 ; Move backend halfword
535
536 noword: bf 30,nohalf ; No halfword to do...
537 lhz r7,0(r6) ; Get the halfword
538 addi r6,r6,2 ; Point to the next
539 sth r7,0(r4) ; Save the halfword
540 addi r4,r4,2 ; Bump sink
541
542 ; Move backend byte
543
544 nohalf: bflr 31 ; Leave cuz we are all done...
545 lbz r7,0(r6) ; Get the byte
546 stb r7,0(r4) ; Save the single
547 blr
548
549
550 ; Reverse moves on 32-bit machines, also reverse word aligned uncached moves on 64-bit machines.
551 ; NOTE: we never do an unaligned access if the source and destination are "relatively"
552 ; word aligned. We depend on this in the uncached case on 64-bit processors.
553 ; These are slower because we don't bother with dcbz. Fortunately, reverse moves are uncommon.
554 ; r4 = destination
555 ; r5 = length (>0)
556 ; r6 = source
557 ; r8 = inverse of largest mask smaller than operand length
558 ; cr5 = noncache flag (but we don't dcbz anyway)
559
560 reverse32bit: ; here from 64-bit code with word aligned uncached operands
561 add r4,r5,r4 ; Point past the last sink byte
562 add r6,r5,r6 ; Point past the last source byte
563 rlwinm r7,r4,0,0x1F ; Calculate the length to align dest on cache boundary
564 li r12,-1 ; Make sure we touch in the actual line
565 andc. r0,r7,r8 ; Apply movement limit
566 dcbt r12,r6 ; Touch in the last line of source
567 mtcrf 0x01,r0 ; move length to cr6 and cr7 one cr at a time...
568 dcbtst r12,r4 ; Touch in the last line of the sink
569 mtcrf 0x02,r0 ; ...since moving more than one is slower on G4 and G5
570 beq- balline ; Already on cache line boundary (or too short to bother)
571
572 sub r5,r5,r0 ; Precalculate move length left after alignment
573
574 bf 31,balhalf ; No single byte to do...
575 lbz r7,-1(r6) ; Get the byte
576 subi r6,r6,1 ; Point to the next
577 stb r7,-1(r4) ; Save the single
578 subi r4,r4,1 ; Bump sink
579
580 ; Sink is halfword aligned here
581
582 balhalf: bf 30,balword ; No halfword to do...
583 lhz r7,-2(r6) ; Get the halfword
584 subi r6,r6,2 ; Point to the next
585 sth r7,-2(r4) ; Save the halfword
586 subi r4,r4,2 ; Bump sink
587
588 ; Sink is word aligned here
589
590 balword: bf 29,baldouble ; No word to do...
591 lwz r7,-4(r6) ; Get the word
592 subi r6,r6,4 ; Point to the next
593 stw r7,-4(r4) ; Save the word
594 subi r4,r4,4 ; Bump sink
595
596 ; Sink is double aligned here
597
598 baldouble: bf 28,balquad ; No double to do...
599 lwz r7,-8(r6) ; Get the first word
600 lwz r8,-4(r6) ; Get the second word
601 subi r6,r6,8 ; Point to the next
602 stw r7,-8(r4) ; Save the first word
603 stw r8,-4(r4) ; Save the second word
604 subi r4,r4,8 ; Bump sink
605
606 ; Sink is quadword aligned here
607
608 balquad: bf 27,balline ; No quad to do...
609 lwz r7,-16(r6) ; Get the first word
610 lwz r8,-12(r6) ; Get the second word
611 lwz r9,-8(r6) ; Get the third word
612 lwz r11,-4(r6) ; Get the fourth word
613 stw r7,-16(r4) ; Save the first word
614 subi r6,r6,16 ; Point to the next
615 stw r8,-12(r4) ; Save the second word
616 stw r9,-8(r4) ; Save the third word
617 stw r11,-4(r4) ; Save the fourth word
618 subi r4,r4,16 ; Bump sink
619
620 ; Sink is line aligned here
621
622 balline: rlwinm. r0,r5,27,5,31 ; Get the number of full lines to move
623 mtcrf 0x02,r5 ; move length to cr6 and cr7 one cr at a time...
624 mtcrf 0x01,r5 ; ...since moving more than one is slower on G4 and G5
625 beq- bbackend ; No full lines to move
626 mtctr r0 ; set up loop count
627 b bnxtline
628
629 .align 4
630 bnxtline:
631 lwz r7,-32(r6) ; Get the first word
632 lwz r5,-28(r6) ; Get the second word
633 lwz r2,-24(r6) ; Get the third word
634 lwz r12,-20(r6) ; Get the fourth word
635 lwz r11,-16(r6) ; Get the fifth word
636 lwz r10,-12(r6) ; Get the sixth word
637 lwz r9,-8(r6) ; Get the seventh word
638 lwz r8,-4(r6) ; Get the eighth word
639 subi r6,r6,32 ; Point to the next
640
641 stw r7,-32(r4) ; Save the first word
642 stw r5,-28(r4) ; Save the second word
643 stw r2,-24(r4) ; Save the third word
644 stw r12,-20(r4) ; Save the fourth word
645 stw r11,-16(r4) ; Save the fifth word
646 stw r10,-12(r4) ; Save the sixth word
647 stw r9,-8(r4) ; Save the seventh word
648 stw r8,-4(r4) ; Save the eighth word
649 subi r4,r4,32 ; Bump sink
650
651 bdnz+ bnxtline ; Do the next line, if any...
652
653 ;
654 ; Note: We touched these lines in at the beginning
655 ;
656
657 ; Move backend quadword
658
659 bbackend: ; Join here from "shortcopy" for reverse moves of <32 bytes
660 bf 27,bnoquad ; No quad to do...
661 lwz r7,-16(r6) ; Get the first word
662 lwz r8,-12(r6) ; Get the second word
663 lwz r9,-8(r6) ; Get the third word
664 lwz r11,-4(r6) ; Get the fourth word
665 stw r7,-16(r4) ; Save the first word
666 subi r6,r6,16 ; Point to the next
667 stw r8,-12(r4) ; Save the second word
668 stw r9,-8(r4) ; Save the third word
669 stw r11,-4(r4) ; Save the fourth word
670 subi r4,r4,16 ; Bump sink
671
672 ; Move backend double
673
674 bnoquad: bf 28,bnodouble ; No double to do...
675 lwz r7,-8(r6) ; Get the first word
676 lwz r8,-4(r6) ; Get the second word
677 subi r6,r6,8 ; Point to the next
678 stw r7,-8(r4) ; Save the first word
679 stw r8,-4(r4) ; Save the second word
680 subi r4,r4,8 ; Bump sink
681
682 ; Move backend word
683
684 bnodouble: bf 29,bnoword ; No word to do...
685 lwz r7,-4(r6) ; Get the word
686 subi r6,r6,4 ; Point to the next
687 stw r7,-4(r4) ; Save the word
688 subi r4,r4,4 ; Bump sink
689
690 ; Move backend halfword
691
692 bnoword: bf 30,bnohalf ; No halfword to do...
693 lhz r7,-2(r6) ; Get the halfword
694 subi r6,r6,2 ; Point to the next
695 sth r7,-2(r4) ; Save the halfword
696 subi r4,r4,2 ; Bump sink
697
698 ; Move backend byte
699
700 bnohalf: bflr 31 ; Leave cuz we are all done...
701 lbz r7,-1(r6) ; Get the byte
702 stb r7,-1(r4) ; Save the single
703 blr
704
705
706 // Here on 64-bit processors, which have a 128-byte cache line. This can be
707 // called either in 32 or 64-bit mode, which makes the test for reverse moves
708 // a little tricky. We've already filtered out the (source==dest) and (len==0)
709 // special cases.
710 //
711 // When entered:
712 // r4 = destination (32 or 64-bit ptr)
713 // r5 = length (always 32 bits)
714 // r6 = source (32 or 64-bit ptr)
715 // r12 = (dest - source), reverse move required if (dest-source)<length
716 // cr5 = noncache flag
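//
// (Added note on the reverse-move test below: subc computes (dest - source) - length and sets
// CA when there is no borrow, i.e. when (dest - source) >= length as an unsigned compare;
// addze. then copies CA into cr0, so "beq" means a borrow occurred and a reverse move is
// required. This works identically in 32- and 64-bit mode.)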
717
718 .align 5
719 copyit64:
720 rlwinm r7,r5,0,0,31 // truncate length to 32-bit, in case we're running in 64-bit mode
721 cntlzw r11,r5 // get magnitude of length
722 dcbt 0,r6 // touch in 1st block of source
723 dcbtst 0,r4 // touch in 1st destination cache block
724 subc r7,r12,r7 // set Carry if (dest-source)>=length, in mode-independent way
725 li r0,0 // get a 0
726 lis r10,hi16(0x80000000)// get 0x80000000
727 addze. r0,r0 // set cr0 on carry bit (beq if reverse move required)
728 neg r9,r4 // start to get alignment for destination
729 sraw r8,r10,r11 // get mask based on operand length, to limit alignment
730 bt-- noncache,c64uncached// skip if uncached
731 beq-- c64rdouble // handle cached reverse moves
732
733
734 // Forward, cached or doubleword aligned uncached. This is the common case.
735 // NOTE: we never do an unaligned access if the source and destination are "relatively"
736 // doubleword aligned. We depend on this in the uncached case.
737 // r4 = destination
738 // r5 = length (>0)
739 // r6 = source
740 // r8 = inverse of largest mask smaller than operand length
741 // r9 = neg(dest), used to compute alignment
742 // cr5 = noncache flag
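//
// (Added worked example of the alignment arithmetic below, with illustrative values and
// assuming the length is large enough not to limit it: for a destination ending in 0x38,
// neg(dest) ends in 0xC8, so r7 = 0xC8 & 0x7F = 0x48 (72 bytes to the next 128-byte boundary),
// giving r8 = 0 odd bytes and r9 = 9 doublewords.)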
743
744 c64double:
745 rlwinm r7,r9,0,0x7F // get #bytes to 128-byte align destination
746 andc r7,r7,r8 // limit by operand length
747 andi. r8,r7,7 // r8 <- #bytes to doubleword align
748 srwi r9,r7,3 // r9 <- #doublewords to 128-byte align
749 sub r5,r5,r7 // adjust length remaining
750 cmpwi cr1,r9,0 // any doublewords to move to cache align?
751 srwi r10,r5,7 // r10 <- 128-byte chunks to xfer after aligning dest
752 cmpwi cr7,r10,0 // set cr7 on chunk count
753 beq c64double2 // dest already doubleword aligned
754 mtctr r8
755 b c64double1
756
757 .align 5 // align inner loops
758 c64double1: // copy bytes until dest is doubleword aligned
759 lbz r0,0(r6)
760 addi r6,r6,1
761 stb r0,0(r4)
762 addi r4,r4,1
763 bdnz c64double1
764
765 c64double2: // r9/cr1=doublewords, r10/cr7=128-byte chunks
766 beq cr1,c64double4 // no doublewords to xfer in order to cache align
767 mtctr r9
768 b c64double3
769
770 .align 5 // align inner loops
771 c64double3: // copy doublewords until dest is 128-byte aligned
772 ld r7,0(r6)
773 addi r6,r6,8
774 std r7,0(r4)
775 addi r4,r4,8
776 bdnz c64double3
777
778 // Here to xfer 128-byte chunks, if any. Since we only have 8 GPRs for
779 // data (64 bytes), we load/store each twice per 128-byte chunk.
780
781 c64double4: // r10/cr7=128-byte chunks
782 rlwinm r0,r5,29,28,31 // r0 <- count of leftover doublewords, after moving chunks
783 cmpwi cr1,r0,0 // set cr1 on leftover doublewords
784 beq cr7,c64double7 // no 128-byte chunks
785
786 ; We must check for (source-dest)<128 in a mode-independent way. If within 128 bytes,
787 ; turn on "noncache" because we cannot use dcbz128 even if operands are cacheable.
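; (Added note: if the source lies 0..127 bytes above the destination, a dcbz128 of a destination
; line would zero source bytes that have not been loaded yet, so the cached path must fall back
; to plain stores in that case.)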
788
789 sub r8,r6,r4 // r8 <- (source - dest)
790 rldicr. r0,r8,0,63-7 // zero low 7 bits and check for 0, mode independent
791 cror noncache,cr0_eq,noncache // turn on "noncache" flag if (source-dest)<128
792 mtctr r10
793 b c64InnerLoop
794
795 .align 5 // align inner loop
796 c64InnerLoop: // loop copying 128-byte cache lines to 128-aligned destination
797 ld r0,0(r6) // start pipe: load 1st half-line
798 ld r2,8(r6)
799 ld r7,16(r6)
800 ld r8,24(r6)
801 ld r9,32(r6)
802 ld r10,40(r6)
803 ld r11,48(r6)
804 ld r12,56(r6)
805 bt noncache,c64InnerLoop1 // skip if uncached or overlap
806 dcbz128 0,r4 // avoid prefetch of next cache line
807 c64InnerLoop1:
808
809 std r0,0(r4)
810 std r2,8(r4)
811 std r7,16(r4)
812 std r8,24(r4)
813 std r9,32(r4)
814 std r10,40(r4)
815 std r11,48(r4)
816 std r12,56(r4)
817
818 ld r0,64(r6) // load 2nd half of chunk
819 ld r2,72(r6)
820 ld r7,80(r6)
821 ld r8,88(r6)
822 ld r9,96(r6)
823 ld r10,104(r6)
824 ld r11,112(r6)
825 ld r12,120(r6)
826 addi r6,r6,128
827
828 std r0,64(r4)
829 std r2,72(r4)
830 std r7,80(r4)
831 std r8,88(r4)
832 std r9,96(r4)
833 std r10,104(r4)
834 std r11,112(r4)
835 std r12,120(r4)
836 addi r4,r4,128 // advance to next dest chunk
837
838 bdnz c64InnerLoop // loop if more chunks
839
840
841 c64double7: // r5 <- leftover bytes, cr1 set on doubleword count
842 rlwinm r0,r5,29,28,31 // r0 <- count of leftover doublewords (0-15)
843 andi. r5,r5,7 // r5/cr0 <- count of leftover bytes (0-7)
844 beq cr1,c64byte // no leftover doublewords
845 mtctr r0
846 b c64double8
847
848 .align 5 // align inner loop
849 c64double8: // loop copying leftover doublewords
850 ld r0,0(r6)
851 addi r6,r6,8
852 std r0,0(r4)
853 addi r4,r4,8
854 bdnz c64double8
855
856
857 // Forward byte loop.
858
859 c64byte: // r5/cr0 <- byte count (can be big if unaligned uncached)
860 beqlr // done if no leftover bytes
861 mtctr r5
862 b c64byte1
863
864 .align 5 // align inner loop
865 c64byte1:
866 lbz r0,0(r6)
867 addi r6,r6,1
868 stb r0,0(r4)
869 addi r4,r4,1
870 bdnz c64byte1
871
872 blr
873
874
875 // Uncached copies. We must avoid unaligned accesses, since they always take alignment
876 // exceptions on uncached memory on 64-bit processors. This may mean we copy long operands
877 // a byte at a time, but that is still much faster than alignment exceptions.
878 // r4 = destination
879 // r5 = length (>0)
880 // r6 = source
881 // r8 = inverse of largest mask smaller than operand length
882 // r9 = neg(dest), used to compute alignment
883 // r12 = (dest-source), used to test relative alignment
884 // cr0 = beq if reverse move required
885 // cr5 = noncache flag
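//
// (Added note: only the relative alignment matters here -- if (dest - source) is a multiple of
// 8 the operands can be doubleword-aligned together and copied with ld/std; if it is only a
// multiple of 4, the word-at-a-time G3/G4 code is reused; otherwise everything moves a byte at
// a time to avoid alignment interrupts on the uncached mapping.)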
886
887 c64uncached:
888 rlwinm r10,r12,0,29,31 // relatively doubleword aligned?
889 rlwinm r11,r12,0,30,31 // relatively word aligned?
890 cmpwi cr7,r10,0 // set cr7 beq if doubleword aligned
891 cmpwi cr1,r11,0 // set cr1 beq if word aligned
892 beq-- c64reverseUncached
893
894 beq cr7,c64double // doubleword aligned
895 beq cr1,forward32bit // word aligned, use G3/G4 code
896 cmpwi r5,0 // set cr0 on byte count
897 b c64byte // unaligned operands
898
899 c64reverseUncached:
900 beq cr7,c64rdouble // doubleword aligned so can use LD/STD
901 beq cr1,reverse32bit // word aligned, use G3/G4 code
902 add r6,r6,r5 // point to (end+1) of source and dest
903 add r4,r4,r5
904 cmpwi r5,0 // set cr0 on length
905 b c64rbyte // copy a byte at a time
906
907
908
909 // Reverse doubleword copies. This is used for all cached copies, and doubleword
910 // aligned uncached copies.
911 // r4 = destination
912 // r5 = length (>0)
913 // r6 = source
914 // r8 = inverse of largest mask of low-order 1s smaller than operand length
915 // cr5 = noncache flag
916
917 c64rdouble:
918 add r6,r6,r5 // point to (end+1) of source and dest
919 add r4,r4,r5
920 rlwinm r7,r4,0,29,31 // r7 <- #bytes to doubleword align dest
921 andc. r7,r7,r8 // limit by operand length
922 sub r5,r5,r7 // adjust length
923 srwi r8,r5,6 // r8 <- 64-byte chunks to xfer
924 cmpwi cr1,r8,0 // any chunks?
925 beq c64rd2 // dest already doubleword aligned
926 mtctr r7
927
928 c64rd1: // copy bytes until dest is doubleword aligned
929 lbzu r0,-1(r6)
930 stbu r0,-1(r4)
931 bdnz c64rd1
932
933 c64rd2: // r8/cr1 <- count of 64-byte chunks
934 rlwinm r0,r5,29,29,31 // r0 <- count of leftover doublewords
935 andi. r5,r5,7 // r5/cr0 <- count of leftover bytes
936 cmpwi cr7,r0,0 // leftover doublewords?
937 beq cr1,c64rd4 // no chunks to xfer
938 mtctr r8
939 b c64rd3
940
941 .align 5 // align inner loop
942 c64rd3: // loop copying 64-byte chunks
943 ld r7,-8(r6)
944 ld r8,-16(r6)
945 ld r9,-24(r6)
946 ld r10,-32(r6)
947 ld r11,-40(r6)
948 ld r12,-48(r6)
949 std r7,-8(r4)
950 std r8,-16(r4)
951 ld r7,-56(r6)
952 ldu r8,-64(r6)
953 std r9,-24(r4)
954 std r10,-32(r4)
955 std r11,-40(r4)
956 std r12,-48(r4)
957 std r7,-56(r4)
958 stdu r8,-64(r4)
959 bdnz c64rd3
960
961 c64rd4: // r0/cr7 = leftover doublewords r5/cr0 = leftover bytes
962 beq cr7,c64rbyte // no leftover doublewords
963 mtctr r0
964
965 c64rd5: // loop copying leftover doublewords
966 ldu r0,-8(r6)
967 stdu r0,-8(r4)
968 bdnz c64rd5
969
970
971 // Reverse byte loop.
972
973 c64rbyte: // r5/cr0 <- byte count (can be big if unaligned uncached)
974 beqlr // done if no leftover bytes
975 mtctr r5
976
977 c64rbyte1:
978 lbzu r0,-1(r6)
979 stbu r0,-1(r4)
980 bdnz c64rbyte1
981
982 blr
983