/*
 * Copyright (c) 2002-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
;
; Copy bytes of data around. Handles overlapped data.
;
;
#include <ppc/asm.h>
#include <ppc/proc_reg.h>
#include <assym.s>

; These routines use CR5 for certain flags:
; Use CR5_lt to indicate non-cached (in bcopy and memcpy)
#define noncache 20


; The bcopy_phys variants use a stack frame so they can call bcopy as a subroutine.
#define BCOPY_SF_SIZE 32 // total size
#define BCOPY_SF_MSR 16 // we save caller's MSR here (possibly minus VEC and FP)


#define kShort 32 // short operands are special cased


; void bcopy_physvir_32(from, to, nbytes)
;
; Attempt to copy physically addressed memory with translation on if conditions are met.
; Otherwise do a normal bcopy_phys. This routine is used because some 32-bit processors
; are very slow doing real-mode (translation off) copies, so we set up temporary BATs
; for the passed phys addrs and do the copy with translation on.
;
; Rules are: - neither source nor destination can cross a page.
;            - Interrupts must be disabled when this routine is called.
;            - Translation must be on when called.
;
; To do the copy, we build a 128KB DBAT for both the source and sink. If both fall in the
; same block, only one is loaded. We do not touch the IBATs, so there is no issue if either
; physical page address is the same as the virtual address of the instructions we are executing.
;
; At the end, we invalidate the used DBATs.
;
; Note that the address parameters are long longs. We will transform these to 64-bit
; values. On 32-bit architectures the high half of the passed-in value is ignored,
; which should be fine since addresses there cannot be bigger than 32 bits anyhow.
;
; Note also that this routine is used only on 32-bit machines. If you're contemplating use
; on a 64-bit processor, use the physical memory window instead; please refer to copypv()
; for an example of how this is done.

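; A rough C model of the page-crossing test below (illustrative only; "from", "to",
; and "nbytes" stand for the reassembled 64-bit arguments, not literal registers):
;
;     uint64_t last_a = from + nbytes - 1, last_b = to + nbytes - 1;
;     if (((from ^ last_a) | (to ^ last_b)) & ~0xFFFULL)
;         /* an operand crosses a 4KB page: fall back to the normal bcopy_phys path */
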
        .align 5
        .globl EXT(bcopy_physvir_32)

LEXT(bcopy_physvir_32)
        mflr r0 ; get return address
        rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg
        mfsprg r8,2 ; get processor feature flags
        stw r0,8(r1) ; save return address
        rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits
        stwu r1,-BCOPY_SF_SIZE(r1) ; push on a stack frame so we can call bcopy
        mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test
        subi r0,r7,1 ; get length - 1
        rlwinm r4,r5,0,1,0 ; Duplicate high half of long long paddr into top of reg
        add r11,r3,r0 ; Point to last byte of sink
        mr r5,r7 ; Get the length into the right register
        rlwimi r4,r6,0,0,31 ; Combine bottom of long long to full 64-bits

; This test for page overflow may not work if the length is negative. Negative lengths are invalid input
; to bcopy_physvir() on 32-bit machines, and will result in a panic.

        add r12,r4,r0 ; Point to last byte of source
        xor r7,r11,r3 ; See if we went to next page
        xor r8,r12,r4 ; See if we went to next page
        or r0,r7,r8 ; Combine wrap

//      li r9,((PTE_WIMG_CB_CACHED_COHERENT<<3)|2) ; Set default attributes
        li r9,((2<<3)|2) ; Set default attributes
        rlwinm. r0,r0,0,0,19 ; Did we overflow a page?
        li r7,2 ; Set validity flags
        li r8,2 ; Set validity flags
        bne- bcopy_phys1 ; Overflowed page, do normal physical copy...

        rlwimi r11,r9,0,15,31 ; Set sink lower DBAT value
        rlwimi r12,r9,0,15,31 ; Set source lower DBAT value
        rlwimi r7,r11,0,0,14 ; Set sink upper DBAT value
        rlwimi r8,r12,0,0,14 ; Set source upper DBAT value
        cmplw cr1,r11,r12 ; See if sink and source are same block

        sync

        mtdbatl 0,r11 ; Set sink lower DBAT
        mtdbatu 0,r7 ; Set sink upper DBAT

        beq- cr1,bcpvsame ; Source and sink are in same block

        mtdbatl 1,r12 ; Set source lower DBAT
        mtdbatu 1,r8 ; Set source upper DBAT

bcpvsame:
        sync ; wait for the BATs to stabilize
        isync

        bl EXT(bcopy) ; BATs set up, args in r3-r5, so do the copy with DR on

        li r0,0 ; Get set to invalidate upper half of BATs
        sync ; Make sure all is well
        mtdbatu 0,r0 ; Clear sink upper DBAT
        mtdbatu 1,r0 ; Clear source upper DBAT
        sync
        isync

        lwz r0,BCOPY_SF_SIZE+8(r1) ; get return address
        addi r1,r1,BCOPY_SF_SIZE ; pop off stack frame
        mtlr r0
        blr


; void bcopy_phys(from, to, nbytes)
;
; Turns off data translation before the copy. This one will not work in user state.
; This routine is used on 32- and 64-bit machines.
;
; Note that the address parameters are long longs. We will transform these to 64-bit
; values. On 32-bit architectures the high half of the passed-in value is ignored,
; which should be fine since addresses there cannot be bigger than 32 bits anyhow.
;
; Also note that you probably will not be happy if either the sink or source spans the
; boundary between RAM and I/O space: there is a good chance of hanging the machine,
; and this code does not check for it, so be careful.
;
; NOTE: when called, translation must be on, and we must be in 32-bit mode.
; Interrupts may or may not be disabled.

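; In rough C terms, the sequence below amounts to the following (a sketch of the
; intent, not the literal code; MSR_* stand for the usual mask macros):
;
;     msr_copy = mfmsr() & ~(MSR_VEC | MSR_FP | MSR_DR);   // vectors, FP, translation off
;     /* 64-bit parts also set MSR_SF and remember the caller's MSR */
;     /* install msr_copy, call bcopy(), then restore DR (and SF/EE as needed) */
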
        .align 5
        .globl EXT(bcopy_phys)

LEXT(bcopy_phys)
        mflr r0 ; get return address
        rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg
        stw r0,8(r1) ; save
        mfsprg r8,2 ; get processor feature flags
        stwu r1,-BCOPY_SF_SIZE(r1) ; push on a stack frame so we can call bcopy
        rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits
        rlwinm r4,r5,0,1,0 ; Duplicate high half of long long paddr into top of reg
        mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test
        rlwimi r4,r6,0,0,31 ; Combine bottom of long long to full 64-bits
        mr r5,r7 ; Get the length into the right register

bcopy_phys1: ; enter from bcopy_physvir with pf64Bit in cr6 and parms in r3-r5
        mfmsr r9 ; Get the MSR
        lis r6,hi16(MASK(MSR_VEC)) ; Get vector enable
        ori r6,r6,lo16(MASK(MSR_FP)|MASK(MSR_DR)) ; Add in FP and DR
        andc r9,r9,r6 ; unconditionally turn DR, VEC, and FP off
        bt++ pf64Bitb,bcopy_phys64 ; skip if 64-bit (only they take hint)

; 32-bit CPUs

        mtmsr r9 ; turn DR, FP, and VEC off
        isync ; Wait for it

        bl EXT(bcopy) ; do the copy with translation off and caching on

        mfmsr r9 ; Get the MSR
        ori r9,r9,lo16(MASK(MSR_DR)) ; turn translation back on (but leave VEC and FP off)
        mtmsr r9 ; restore msr
        isync ; wait for it to happen
        lwz r0,BCOPY_SF_SIZE+8(r1) ; get return address once translation is back on
        mtlr r0
        addi r1,r1,BCOPY_SF_SIZE ; pop off stack frame
        blr


; 64-bit: turn DR off and SF on.

bcopy_phys64: ; r9 = MSR with DR, VEC, and FP off
        ori r8,r9,lo16(MASK(MSR_DR)) ; make a copy with DR back on... this is what we return to caller
        srdi r2,r3,31 ; Get a 1 if source is in I/O memory
        li r0,1 ; Note - we use this in a couple places below
        srdi r10,r4,31 ; Get a 1 if sink is in I/O memory
        std r8,BCOPY_SF_MSR(r1) ; save caller's MSR so we remember whether EE was on
        rldimi r9,r0,63,MSR_SF_BIT ; set SF on in MSR we will copy with
        cmpldi cr0,r2,1 ; Is source in I/O memory?
        cmpldi cr7,r10,1 ; Is sink in I/O memory?
        mtmsrd r9 ; turn 64-bit addressing on, data translation off
        isync ; wait for it to happen
        cror cr7_eq,cr0_eq,cr7_eq ; See if either source or sink is in I/O area
        beq-- cr7,io_space_real_mode_copy ; an operand is in I/O space

        bl EXT(bcopy) ; do copy with DR off and SF on, cache enabled

bcopy_phys64x:
        mfmsr r9 ; Get the MSR we used to copy
        rldicl r9,r9,0,MSR_SF_BIT+1 ; clear SF
        ori r9,r9,lo16(MASK(MSR_DR)) ; turn translation back on
        mtmsrd r9 ; turn 64-bit mode off, translation back on
        isync ; wait for it to happen
        lwz r0,BCOPY_SF_SIZE+8(r1) ; get return address once translation is back on
        ld r8,BCOPY_SF_MSR(r1) ; get caller's MSR once translation is back on
        mtlr r0
        mtmsrd r8,1 ; turn EE back on if necessary
        addi r1,r1,BCOPY_SF_SIZE ; pop off stack frame
        blr

; We need to copy with DR off, but one of the operands is in I/O space. To avoid wedging U3,
; which cannot handle a cache burst in I/O space, we must turn caching off for the real memory access.
; This can only be done by setting bits in HID4. We cannot lose control and execute random code in
; this state, so we have to disable interrupts as well. This is an unpleasant hack.

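; In effect, this is (a sketch; the bit position comes from the sldi below, not a
; named HID4 field, and "hid4" here is just shorthand for the SPR):
;
;     msr &= ~MSR_EE;            // no interrupts while caching is disabled
;     hid4 |= (1ULL << 40);      // real-mode data accesses become cache-inhibited
;     bcopy_nc(...);             // copy without touching the cache
;     hid4 &= ~(1ULL << 40);     // restore normal caching
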
io_space_real_mode_copy: ; r0=1, r9=MSR we want to copy with
        sldi r11,r0,31-MSR_EE_BIT ; Get a mask for the EE bit
        sldi r0,r0,32+8 ; Get the right bit to turn off caching
        andc r9,r9,r11 ; Turn off EE bit
        mfspr r2,hid4 ; Get HID4
        mtmsrd r9,1 ; Force off EE
        or r2,r2,r0 ; Set bit to make real accesses cache-inhibited
        sync ; Sync up
        mtspr hid4,r2 ; Make real accesses cache-inhibited
        isync ; Toss prefetches

        lis r12,0xE000 ; Get the unlikeliest ESID possible
        srdi r12,r12,1 ; Make 0x7FFFFFFFF0000000
        slbie r12 ; Make sure the ERAT is cleared

        sync
        isync

        bl EXT(bcopy_nc) ; copy with SF on and EE, DR, VEC, and FP off, cache inhibited

        li r0,1 ; Get a 1
        sldi r0,r0,32+8 ; Get the right bit to turn off caching
        mfspr r2,hid4 ; Get HID4
        andc r2,r2,r0 ; Clear bit to make real accesses cache-inhibited
        sync ; Sync up
        mtspr hid4,r2 ; Make real accesses not cache-inhibited
        isync ; Toss prefetches

        lis r12,0xE000 ; Get the unlikeliest ESID possible
        srdi r12,r12,1 ; Make 0x7FFFFFFFF0000000
        slbie r12 ; Make sure the ERAT is cleared
        b bcopy_phys64x


;
; shortcopy
;
; Special case short operands (<32 bytes), which are very common. Note that the check for
; reverse vs normal moves isn't quite correct in 64-bit mode; in rare cases we will move in
; reverse when it wasn't necessary to do so. This is OK, since performance of the two cases
; is similar. We do get the direction right when it counts (i.e., when the operands overlap).
; Also note that we use the G3/G4 "backend" code, even on G5. This is OK too, since G5 has
; plenty of load/store dispatch bandwidth in this case, the extra ops are hidden by latency,
; and using word instead of doubleword moves reduces the possibility of unaligned accesses,
; which cost about 20 cycles if they cross a 32-byte boundary on G5. Finally, because we
; might do unaligned accesses, this code cannot be called from bcopy_nc().
; r4 = destination
; r5 = length (<32)
; r6 = source
; r12 = (dest - source)

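; A rough C model of the direction test below (illustrative only; the cmplw compares
; the low 32 bits, which is why the 64-bit-mode caveat above exists):
;
;     if ((uint32_t)(dest - source) < (uint32_t)length)
;         /* copy backwards: a forward copy would overwrite unread source bytes */
;     else
;         /* copy forwards (also taken when dest precedes source) */
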
        .align 5
shortcopy:
        cmplw r12,r5 ; must move reverse if (dest-source)<length
        mtcrf 2,r5 ; move length to cr6 and cr7 one at a time...
        mtcrf 1,r5 ; ...which is faster on G4 and G5
        bge++ backend ; handle forward moves (most common case)
        add r6,r6,r5 ; point one past end of operands in reverse moves
        add r4,r4,r5
        b bbackend ; handle reverse moves

;
; void bcopy(from, to, nbytes)
;
; NOTE: bcopy is called from copyin and copyout etc with the "thread_recover" ptr set.
; This means bcopy must not set up a stack frame or touch non-volatile registers, and also means that it
; cannot rely on turning off interrupts, because we expect to get DSIs and have execution aborted by a "longjmp"
; to the thread_recover routine. What this means is that it would be hard to use vector or floating point
; registers to accelerate the copy.
;
; NOTE: this code can be called in any of three "modes":
; - on 32-bit processors (32-byte cache line)
; - on 64-bit processors running in 32-bit mode (128-byte cache line)
; - on 64-bit processors running in 64-bit mode (128-byte cache line)

        .align 5
        .globl EXT(bcopy)
        .globl EXT(bcopy_nop_if_32bit)

LEXT(bcopy)
        cmplwi cr1,r5,kShort ; less than 32 bytes?
        sub. r12,r4,r3 ; test for to==from in mode-independent way, start fwd/rev check
        mr r6,r3 ; Set source (must preserve r3 for memcpy return)
        blt cr1,shortcopy ; special case short operands
        crclr noncache ; Set cached
LEXT(bcopy_nop_if_32bit)
        bne++ copyit64 ; handle 64-bit processor (patched to NOP if 32-bit processor)
        bne+ copyit32 ; handle 32-bit processor
        blr ; to==from so nothing to do

;
; bcopy_nc(from, to, nbytes)
;
; bcopy_nc() operates on non-cached memory, so we cannot use any kind of cache instructions.
; Furthermore, we must avoid all unaligned accesses on 64-bit machines, since they take
; alignment exceptions. Thus we cannot use "shortcopy", which could do unaligned lwz/stw.
; Like bcopy(), bcopy_nc() can be called both in 32- and 64-bit mode.

        .align 5
        .globl EXT(bcopy_nc)
        .globl EXT(bcopy_nc_nop_if_32bit)

LEXT(bcopy_nc)
        cmpwi cr1,r5,0 ; Check if we have a 0 length
        sub. r12,r4,r3 ; test for to==from in mode-independent way, start fwd/rev check
        mr r6,r3 ; Set source (must preserve r3 for memcpy return)
        crset noncache ; Set non-cached
        cror cr0_eq,cr1_eq,cr0_eq ; set cr0 beq if either length zero or to==from
LEXT(bcopy_nc_nop_if_32bit)
        bne++ copyit64 ; handle 64-bit processor (patched to NOP if 32-bit processor)
        bne+ copyit32 ; handle 32-bit processor
        blr ; either zero length or to==from

;
; void* memcpy(to, from, nbytes)
; void* memmove(to, from, nbytes)
;
; memcpy() and memmove() are only called in 32-bit mode, albeit on both 32- and 64-bit processors.
; However, they would work correctly if called in 64-bit mode.

        .align 5
        .globl EXT(memcpy)
        .globl EXT(memmove)
        .globl EXT(memcpy_nop_if_32bit)

LEXT(memcpy)
LEXT(memmove)
        cmplwi cr1,r5,kShort ; less than 32 bytes?
        sub. r12,r3,r4 ; test for to==from in mode-independent way, start fwd/rev check
        mr r6,r4 ; Set source
        mr r4,r3 ; Set the "to" (must preserve r3 for return value)
        blt cr1,shortcopy ; special case short operands
        crclr noncache ; Set cached
LEXT(memcpy_nop_if_32bit)
        bne++ copyit64 ; handle 64-bit processor (patched to NOP if 32-bit processor)
        beqlr- ; exit if to==from


; Here to copy on 32-bit processors.
;
; When we move the memory, forward overlaps must be handled. We also
; cannot use the cache instructions if we came from bcopy_nc().
; We need to preserve r3 because it needs to be returned for memcpy.
; We can be interrupted and lose control here.
;
; When entered:
; r4 = destination
; r5 = length (>0)
; r6 = source
; r12 = (dest - source)
; cr5 = noncache flag

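; A rough C model of the alignment limiter computed below (illustrative only; clz()
; stands for a count-leading-zeros primitive, as done by cntlzw):
;
;     int32_t  mask  = (int32_t)0x80000000 >> clz(length);  // 1s from bit 31 down through length's top bit
;     uint32_t front = ((uint32_t)-dest & 31) & ~mask;       // bytes to 32-byte align dest,
;                                                            // clamped so it never exceeds length
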
copyit32: ; WARNING! can drop down to this label
        cmplw cr1,r12,r5 ; must move reverse if (dest-source)<length
        cntlzw r11,r5 ; get magnitude of length
        dcbt 0,r6 ; start to touch in source
        lis r10,hi16(0x80000000) ; get 0x80000000
        neg r9,r4 ; start to get alignment for destination
        dcbtst 0,r4 ; start to touch in destination
        sraw r8,r10,r11 ; get mask based on operand length, to limit alignment
        blt- cr1,reverse32bit ; reverse move required

; Forward moves on 32-bit machines, also word aligned uncached ops on 64-bit machines.
; NOTE: we never do an unaligned access if the source and destination are "relatively"
; word aligned. We depend on this in the uncached case on 64-bit processors.
; r4 = destination
; r5 = length (>0)
; r6 = source
; r8 = inverse of largest mask smaller than operand length
; r9 = neg(dest), used to compute alignment
; cr5 = noncache flag

forward32bit: ; enter from 64-bit CPUs with word aligned uncached operands
        rlwinm r7,r9,0,0x1F ; get bytes to 32-byte-align destination
        andc. r0,r7,r8 ; limit to the maximum front end move
        mtcrf 0x01,r0 ; move length to cr6 and cr7 one cr at a time...
        beq alline ; Already on a line...

        mtcrf 0x02,r0 ; ...since moving more than one is slower on G4 and G5
        sub r5,r5,r0 ; Set the length left to move

        bf 31,alhalf ; No single byte to do...
        lbz r7,0(r6) ; Get the byte
        addi r6,r6,1 ; Point to the next
        stb r7,0(r4) ; Save the single
        addi r4,r4,1 ; Bump sink

; Sink is halfword aligned here

alhalf: bf 30,alword ; No halfword to do...
        lhz r7,0(r6) ; Get the halfword
        addi r6,r6,2 ; Point to the next
        sth r7,0(r4) ; Save the halfword
        addi r4,r4,2 ; Bump sink

; Sink is word aligned here

alword: bf 29,aldouble ; No word to do...
        lwz r7,0(r6) ; Get the word
        addi r6,r6,4 ; Point to the next
        stw r7,0(r4) ; Save the word
        addi r4,r4,4 ; Bump sink

; Sink is double aligned here

aldouble: bf 28,alquad ; No double to do...
        lwz r7,0(r6) ; Get the first word
        lwz r8,4(r6) ; Get the second word
        addi r6,r6,8 ; Point to the next
        stw r7,0(r4) ; Save the first word
        stw r8,4(r4) ; Save the second word
        addi r4,r4,8 ; Bump sink

; Sink is quadword aligned here

alquad: bf 27,alline ; No quad to do...
        lwz r7,0(r6) ; Get the first word
        lwz r8,4(r6) ; Get the second word
        lwz r9,8(r6) ; Get the third word
        stw r7,0(r4) ; Save the first word
        lwz r11,12(r6) ; Get the fourth word
        addi r6,r6,16 ; Point to the next
        stw r8,4(r4) ; Save the second word
        stw r9,8(r4) ; Save the third word
        stw r11,12(r4) ; Save the fourth word
        addi r4,r4,16 ; Bump sink

; Sink is line aligned here

alline: rlwinm. r0,r5,27,5,31 ; Get the number of full lines to move
        mtcrf 0x02,r5 ; move length to cr6 and cr7 one cr at a time...
        mtcrf 0x01,r5 ; ...since moving more than one is slower on G4 and G5
        beq- backend ; No full lines to move

        mtctr r0 ; set up loop count
        li r0,96 ; Stride for touch ahead
        b nxtline

        .align 4
nxtline:
        lwz r2,0(r6) ; Get the first word
        lwz r5,4(r6) ; Get the second word
        lwz r7,8(r6) ; Get the third word
        lwz r8,12(r6) ; Get the fourth word
        lwz r9,16(r6) ; Get the fifth word
        lwz r10,20(r6) ; Get the sixth word
        lwz r11,24(r6) ; Get the seventh word
        lwz r12,28(r6) ; Get the eighth word
        bt- noncache,skipz ; Skip if we are not cached...
        dcbz 0,r4 ; Blow away the whole line because we are replacing it
        dcbt r6,r0 ; Touch ahead a bit
skipz:
        addi r6,r6,32 ; Point to the next
        stw r2,0(r4) ; Save the first word
        stw r5,4(r4) ; Save the second word
        stw r7,8(r4) ; Save the third word
        stw r8,12(r4) ; Save the fourth word
        stw r9,16(r4) ; Save the fifth word
        stw r10,20(r4) ; Save the sixth word
        stw r11,24(r4) ; Save the seventh word
        stw r12,28(r4) ; Save the eighth word
        addi r4,r4,32 ; Bump sink
        bdnz+ nxtline ; Do the next line, if any...


; Move backend quadword

backend: ; Join here from "shortcopy" for forward moves <32 bytes
        bf 27,noquad ; No quad to do...
        lwz r7,0(r6) ; Get the first word
        lwz r8,4(r6) ; Get the second word
        lwz r9,8(r6) ; Get the third word
        lwz r11,12(r6) ; Get the fourth word
        stw r7,0(r4) ; Save the first word
        addi r6,r6,16 ; Point to the next
        stw r8,4(r4) ; Save the second word
        stw r9,8(r4) ; Save the third word
        stw r11,12(r4) ; Save the fourth word
        addi r4,r4,16 ; Bump sink

; Move backend double

noquad: bf 28,nodouble ; No double to do...
        lwz r7,0(r6) ; Get the first word
        lwz r8,4(r6) ; Get the second word
        addi r6,r6,8 ; Point to the next
        stw r7,0(r4) ; Save the first word
        stw r8,4(r4) ; Save the second word
        addi r4,r4,8 ; Bump sink

; Move backend word

nodouble: bf 29,noword ; No word to do...
        lwz r7,0(r6) ; Get the word
        addi r6,r6,4 ; Point to the next
        stw r7,0(r4) ; Save the word
        addi r4,r4,4 ; Bump sink

; Move backend halfword

noword: bf 30,nohalf ; No halfword to do...
        lhz r7,0(r6) ; Get the halfword
        addi r6,r6,2 ; Point to the next
        sth r7,0(r4) ; Save the halfword
        addi r4,r4,2 ; Bump sink

; Move backend byte

nohalf: bflr 31 ; Leave cuz we are all done...
        lbz r7,0(r6) ; Get the byte
        stb r7,0(r4) ; Save the single
        blr


; Reverse moves on 32-bit machines, also reverse word aligned uncached moves on 64-bit machines.
; NOTE: we never do an unaligned access if the source and destination are "relatively"
; word aligned. We depend on this in the uncached case on 64-bit processors.
; These are slower because we don't bother with dcbz. Fortunately, reverse moves are uncommon.
; r4 = destination
; r5 = length (>0)
; r6 = source
; r8 = inverse of largest mask smaller than operand length
; cr5 = noncache flag (but we don't dcbz anyway)

reverse32bit: ; here from 64-bit code with word aligned uncached operands
        add r4,r5,r4 ; Point past the last sink byte
        add r6,r5,r6 ; Point past the last source byte
        rlwinm r7,r4,0,0x1F ; Calculate the length to align dest on cache boundary
        li r12,-1 ; Make sure we touch in the actual line
        andc. r0,r7,r8 ; Apply movement limit
        dcbt r12,r6 ; Touch in the last line of source
        mtcrf 0x01,r0 ; move length to cr6 and cr7 one cr at a time...
        dcbtst r12,r4 ; Touch in the last line of the sink
        mtcrf 0x02,r0 ; ...since moving more than one is slower on G4 and G5
        beq- balline ; Already on cache line boundary (or too short to bother)

        sub r5,r5,r0 ; Precalculate move length left after alignment

        bf 31,balhalf ; No single byte to do...
        lbz r7,-1(r6) ; Get the byte
        subi r6,r6,1 ; Point to the next
        stb r7,-1(r4) ; Save the single
        subi r4,r4,1 ; Bump sink

; Sink is halfword aligned here

balhalf: bf 30,balword ; No halfword to do...
        lhz r7,-2(r6) ; Get the halfword
        subi r6,r6,2 ; Point to the next
        sth r7,-2(r4) ; Save the halfword
        subi r4,r4,2 ; Bump sink

; Sink is word aligned here

balword: bf 29,baldouble ; No word to do...
        lwz r7,-4(r6) ; Get the word
        subi r6,r6,4 ; Point to the next
        stw r7,-4(r4) ; Save the word
        subi r4,r4,4 ; Bump sink

; Sink is double aligned here

baldouble: bf 28,balquad ; No double to do...
        lwz r7,-8(r6) ; Get the first word
        lwz r8,-4(r6) ; Get the second word
        subi r6,r6,8 ; Point to the next
        stw r7,-8(r4) ; Save the first word
        stw r8,-4(r4) ; Save the second word
        subi r4,r4,8 ; Bump sink

; Sink is quadword aligned here

balquad: bf 27,balline ; No quad to do...
        lwz r7,-16(r6) ; Get the first word
        lwz r8,-12(r6) ; Get the second word
        lwz r9,-8(r6) ; Get the third word
        lwz r11,-4(r6) ; Get the fourth word
        stw r7,-16(r4) ; Save the first word
        subi r6,r6,16 ; Point to the next
        stw r8,-12(r4) ; Save the second word
        stw r9,-8(r4) ; Save the third word
        stw r11,-4(r4) ; Save the fourth word
        subi r4,r4,16 ; Bump sink

; Sink is line aligned here

balline: rlwinm. r0,r5,27,5,31 ; Get the number of full lines to move
        mtcrf 0x02,r5 ; move length to cr6 and cr7 one cr at a time...
        mtcrf 0x01,r5 ; ...since moving more than one is slower on G4 and G5
        beq- bbackend ; No full lines to move
        mtctr r0 ; set up loop count
        b bnxtline

        .align 4
bnxtline:
        lwz r7,-32(r6) ; Get the first word
        lwz r5,-28(r6) ; Get the second word
        lwz r2,-24(r6) ; Get the third word
        lwz r12,-20(r6) ; Get the fourth word
        lwz r11,-16(r6) ; Get the fifth word
        lwz r10,-12(r6) ; Get the sixth word
        lwz r9,-8(r6) ; Get the seventh word
        lwz r8,-4(r6) ; Get the eighth word
        subi r6,r6,32 ; Point to the next

        stw r7,-32(r4) ; Save the first word
        stw r5,-28(r4) ; Save the second word
        stw r2,-24(r4) ; Save the third word
        stw r12,-20(r4) ; Save the fourth word
        stw r11,-16(r4) ; Save the fifth word
        stw r10,-12(r4) ; Save the sixth word
        stw r9,-8(r4) ; Save the seventh word
        stw r8,-4(r4) ; Save the eighth word
        subi r4,r4,32 ; Bump sink

        bdnz+ bnxtline ; Do the next line, if any...

;
; Note: We touched these lines in at the beginning
;

; Move backend quadword

bbackend: ; Join here from "shortcopy" for reverse moves of <32 bytes
        bf 27,bnoquad ; No quad to do...
        lwz r7,-16(r6) ; Get the first word
        lwz r8,-12(r6) ; Get the second word
        lwz r9,-8(r6) ; Get the third word
        lwz r11,-4(r6) ; Get the fourth word
        stw r7,-16(r4) ; Save the first word
        subi r6,r6,16 ; Point to the next
        stw r8,-12(r4) ; Save the second word
        stw r9,-8(r4) ; Save the third word
        stw r11,-4(r4) ; Save the fourth word
        subi r4,r4,16 ; Bump sink

; Move backend double

bnoquad: bf 28,bnodouble ; No double to do...
        lwz r7,-8(r6) ; Get the first word
        lwz r8,-4(r6) ; Get the second word
        subi r6,r6,8 ; Point to the next
        stw r7,-8(r4) ; Save the first word
        stw r8,-4(r4) ; Save the second word
        subi r4,r4,8 ; Bump sink

; Move backend word

bnodouble: bf 29,bnoword ; No word to do...
        lwz r7,-4(r6) ; Get the word
        subi r6,r6,4 ; Point to the next
        stw r7,-4(r4) ; Save the word
        subi r4,r4,4 ; Bump sink

; Move backend halfword

bnoword: bf 30,bnohalf ; No halfword to do...
        lhz r7,-2(r6) ; Get the halfword
        subi r6,r6,2 ; Point to the next
        sth r7,-2(r4) ; Save the halfword
        subi r4,r4,2 ; Bump sink

; Move backend byte

bnohalf: bflr 31 ; Leave cuz we are all done...
        lbz r7,-1(r6) ; Get the byte
        stb r7,-1(r4) ; Save the single
        blr


// Here on 64-bit processors, which have a 128-byte cache line. This can be
// called either in 32- or 64-bit mode, which makes the test for reverse moves
// a little tricky. We've already filtered out the (source==dest) and (len==0)
// special cases.
//
// When entered:
// r4 = destination (32 or 64-bit ptr)
// r5 = length (always 32 bits)
// r6 = source (32 or 64-bit ptr)
// r12 = (dest - source), reverse move required if (dest-source)<length
// cr5 = noncache flag

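// A rough C model of the carry trick used below to pick the copy direction in a
// mode-independent way (illustrative only):
//
//     carry = ((uintptr_t)(dest - source) >= (uint32_t)length);  // subc sets CA on no-borrow
//     if (!carry)          // addze then leaves 0, so cr0 "eq" => reverse move required
//         /* use the reverse (descending) copy */
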
        .align 5
copyit64:
        rlwinm r7,r5,0,0,31 // truncate length to 32-bit, in case we're running in 64-bit mode
        cntlzw r11,r5 // get magnitude of length
        dcbt 0,r6 // touch in 1st block of source
        dcbtst 0,r4 // touch in 1st destination cache block
        subc r7,r12,r7 // set Carry if (dest-source)>=length, in mode-independent way
        li r0,0 // get a 0
        lis r10,hi16(0x80000000) // get 0x80000000
        addze. r0,r0 // set cr0 on carry bit (beq if reverse move required)
        neg r9,r4 // start to get alignment for destination
        sraw r8,r10,r11 // get mask based on operand length, to limit alignment
        bt-- noncache,c64uncached // skip if uncached
        beq-- c64rdouble // handle cached reverse moves


// Forward, cached or doubleword aligned uncached. This is the common case.
// NOTE: we never do an unaligned access if the source and destination are "relatively"
// doubleword aligned. We depend on this in the uncached case.
// r4 = destination
// r5 = length (>0)
// r6 = source
// r8 = inverse of largest mask smaller than operand length
// r9 = neg(dest), used to compute alignment
// cr5 = noncache flag

c64double:
        rlwinm r7,r9,0,0x7F // get #bytes to 128-byte align destination
        andc r7,r7,r8 // limit by operand length
        andi. r8,r7,7 // r8 <- #bytes to doubleword align
        srwi r9,r7,3 // r9 <- #doublewords to 128-byte align
        sub r5,r5,r7 // adjust length remaining
        cmpwi cr1,r9,0 // any doublewords to move to cache align?
        srwi r10,r5,7 // r10 <- 128-byte chunks to xfer after aligning dest
        cmpwi cr7,r10,0 // set cr7 on chunk count
        beq c64double2 // dest already doubleword aligned
        mtctr r8
        b c64double1

        .align 5 // align inner loops
c64double1: // copy bytes until dest is doubleword aligned
        lbz r0,0(r6)
        addi r6,r6,1
        stb r0,0(r4)
        addi r4,r4,1
        bdnz c64double1

c64double2: // r9/cr1=doublewords, r10/cr7=128-byte chunks
        beq cr1,c64double4 // no doublewords to xfer in order to cache align
        mtctr r9
        b c64double3

        .align 5 // align inner loops
c64double3: // copy doublewords until dest is 128-byte aligned
        ld r7,0(r6)
        addi r6,r6,8
        std r7,0(r4)
        addi r4,r4,8
        bdnz c64double3

// Here to xfer 128-byte chunks, if any. Since we only have 8 GPRs for
// data (64 bytes), we load/store each twice per 128-byte chunk.

c64double4: // r10/cr7=128-byte chunks
        rlwinm r0,r5,29,28,31 // r0 <- count of leftover doublewords, after moving chunks
        cmpwi cr1,r0,0 // set cr1 on leftover doublewords
        beq cr7,c64double7 // no 128-byte chunks

; We must check for (source-dest)<128 in a mode-independent way. If within 128 bytes,
; turn on "noncache" because we cannot use dcbz128 even if operands are cacheable.

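; Roughly:  if (((uintptr_t)(source - dest) & ~(uintptr_t)127) == 0) noncache = 1;
; i.e. if the source starts less than 128 bytes past the destination, a dcbz128 of the
; destination line would zero source bytes we have not read yet, so we skip dcbz128.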
        sub r8,r6,r4 // r8 <- (source - dest)
        rldicr. r0,r8,0,63-7 // zero low 7 bits and check for 0, mode independent
        cror noncache,cr0_eq,noncache // turn on "noncache" flag if (source-dest)<128
        mtctr r10
        b c64InnerLoop

        .align 5 // align inner loop
c64InnerLoop: // loop copying 128-byte cache lines to 128-aligned destination
        ld r0,0(r6) // start pipe: load 1st half-line
        ld r2,8(r6)
        ld r7,16(r6)
        ld r8,24(r6)
        ld r9,32(r6)
        ld r10,40(r6)
        ld r11,48(r6)
        ld r12,56(r6)
        bt noncache,c64InnerLoop1 // skip if uncached or overlap
        dcbz128 0,r4 // avoid prefetch of next cache line
c64InnerLoop1:

        std r0,0(r4)
        std r2,8(r4)
        std r7,16(r4)
        std r8,24(r4)
        std r9,32(r4)
        std r10,40(r4)
        std r11,48(r4)
        std r12,56(r4)

        ld r0,64(r6) // load 2nd half of chunk
        ld r2,72(r6)
        ld r7,80(r6)
        ld r8,88(r6)
        ld r9,96(r6)
        ld r10,104(r6)
        ld r11,112(r6)
        ld r12,120(r6)
        addi r6,r6,128

        std r0,64(r4)
        std r2,72(r4)
        std r7,80(r4)
        std r8,88(r4)
        std r9,96(r4)
        std r10,104(r4)
        std r11,112(r4)
        std r12,120(r4)
        addi r4,r4,128 // advance to next dest chunk

        bdnz c64InnerLoop // loop if more chunks


c64double7: // r5 <- leftover bytes, cr1 set on doubleword count
        rlwinm r0,r5,29,28,31 // r0 <- count of leftover doublewords (0-15)
        andi. r5,r5,7 // r5/cr0 <- count of leftover bytes (0-7)
        beq cr1,c64byte // no leftover doublewords
        mtctr r0
        b c64double8

        .align 5 // align inner loop
c64double8: // loop copying leftover doublewords
        ld r0,0(r6)
        addi r6,r6,8
        std r0,0(r4)
        addi r4,r4,8
        bdnz c64double8


// Forward byte loop.

c64byte: // r5/cr0 <- byte count (can be big if unaligned uncached)
        beqlr // done if no leftover bytes
        mtctr r5
        b c64byte1

        .align 5 // align inner loop
c64byte1:
        lbz r0,0(r6)
        addi r6,r6,1
        stb r0,0(r4)
        addi r4,r4,1
        bdnz c64byte1

        blr


// Uncached copies. We must avoid unaligned accesses, since they always take alignment
// exceptions on uncached memory on 64-bit processors. This may mean we copy long operands
// a byte at a time, but that is still much faster than alignment exceptions.
// r4 = destination
// r5 = length (>0)
// r6 = source
// r8 = inverse of largest mask smaller than operand length
// r9 = neg(dest), used to compute alignment
// r12 = (dest-source), used to test relative alignment
// cr0 = beq if reverse move required
// cr5 = noncache flag

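// The relative-alignment dispatch below is roughly (illustrative C):
//
//     if      (((dest - source) & 7) == 0)  /* doubleword (ld/std) loops */
//     else if (((dest - source) & 3) == 0)  /* word-at-a-time 32-bit code */
//     else                                  /* byte-at-a-time loop */
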
c64uncached:
        rlwinm r10,r12,0,29,31 // relatively doubleword aligned?
        rlwinm r11,r12,0,30,31 // relatively word aligned?
        cmpwi cr7,r10,0 // set cr7 beq if doubleword aligned
        cmpwi cr1,r11,0 // set cr1 beq if word aligned
        beq-- c64reverseUncached

        beq cr7,c64double // doubleword aligned
        beq cr1,forward32bit // word aligned, use G3/G4 code
        cmpwi r5,0 // set cr0 on byte count
        b c64byte // unaligned operands

c64reverseUncached:
        beq cr7,c64rdouble // doubleword aligned so can use LD/STD
        beq cr1,reverse32bit // word aligned, use G3/G4 code
        add r6,r6,r5 // point to (end+1) of source and dest
        add r4,r4,r5
        cmpwi r5,0 // set cr0 on length
        b c64rbyte // copy a byte at a time



// Reverse doubleword copies. This is used for all cached copies, and doubleword
// aligned uncached copies.
// r4 = destination
// r5 = length (>0)
// r6 = source
// r8 = inverse of largest mask of low-order 1s smaller than operand length
// cr5 = noncache flag

c64rdouble:
        add r6,r6,r5 // point to (end+1) of source and dest
        add r4,r4,r5
        rlwinm r7,r4,0,29,31 // r7 <- #bytes to doubleword align dest
        andc. r7,r7,r8 // limit by operand length
        sub r5,r5,r7 // adjust length
        srwi r8,r5,6 // r8 <- 64-byte chunks to xfer
        cmpwi cr1,r8,0 // any chunks?
        beq c64rd2 // dest already doubleword aligned
        mtctr r7

c64rd1: // copy bytes until dest is doubleword aligned
        lbzu r0,-1(r6)
        stbu r0,-1(r4)
        bdnz c64rd1

c64rd2: // r8/cr1 <- count of 64-byte chunks
        rlwinm r0,r5,29,29,31 // r0 <- count of leftover doublewords
        andi. r5,r5,7 // r5/cr0 <- count of leftover bytes
        cmpwi cr7,r0,0 // leftover doublewords?
        beq cr1,c64rd4 // no chunks to xfer
        mtctr r8
        b c64rd3

        .align 5 // align inner loop
c64rd3: // loop copying 64-byte chunks
        ld r7,-8(r6)
        ld r8,-16(r6)
        ld r9,-24(r6)
        ld r10,-32(r6)
        ld r11,-40(r6)
        ld r12,-48(r6)
        std r7,-8(r4)
        std r8,-16(r4)
        ld r7,-56(r6)
        ldu r8,-64(r6)
        std r9,-24(r4)
        std r10,-32(r4)
        std r11,-40(r4)
        std r12,-48(r4)
        std r7,-56(r4)
        stdu r8,-64(r4)
        bdnz c64rd3

c64rd4: // r0/cr7 = leftover doublewords, r5/cr0 = leftover bytes
        beq cr7,c64rbyte // no leftover doublewords
        mtctr r0

c64rd5: // loop copying leftover doublewords
        ldu r0,-8(r6)
        stdu r0,-8(r4)
        bdnz c64rd5


// Reverse byte loop.

c64rbyte: // r5/cr0 <- byte count (can be big if unaligned uncached)
        beqlr // done if no leftover bytes
        mtctr r5

c64rbyte1:
        lbzu r0,-1(r6)
        stbu r0,-1(r4)
        bdnz c64rbyte1

        blr
