/*
 * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
;
; Copy bytes of data around. Handles overlapped data.
;
; Change this to use Altivec later on, and maybe floating point.
;
;
#include <ppc/asm.h>
#include <ppc/proc_reg.h>
#include <assym.s>

; Use CR5_lt to indicate non-cached
#define noncache 20

; Use CR5_gt to indicate that we need to turn data translation back on
#define fixxlate 21

; Use CR5_eq to indicate that we need to invalidate BATs (if 32-bit) or turn off
; 64-bit mode (if 64-bit) before returning to our caller.  We overload the
; bit to reduce the number of conditional branches at bcopy exit.
#define restorex 22

; Use CR5_so to indicate that we need to restore real-mode cacheability
; Only needed on 64-bit machines
#define flipcache 23

;
; bcopy_nc(from, to, nbytes)
;
; bcopy_nc operates on non-cached memory, so we cannot use any
; cache instructions.
;
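; From C, the entry points in this file look roughly like the sketch below.
; This is only an illustration based on the parameter comments in this file;
; the argument types are assumptions, and the authoritative prototypes live
; in the kernel headers, not here.
;
;       extern void  bcopy_nc(const void *from, void *to, unsigned int nbytes);
;       extern void  bcopy(const void *from, void *to, unsigned int nbytes);
;       extern void *memcpy(void *to, const void *from, unsigned int nbytes);
;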

        .align 5
        .globl EXT(bcopy_nc)

LEXT(bcopy_nc)

        crset noncache ; Set non-cached
        b bcpswap

;
; void bcopy_physvir(from, to, nbytes)
; Attempt to copy physically addressed memory with translation on if conditions are met.
; Otherwise do a normal bcopy_phys.  This routine is used because some 32-bit processors
; are very slow doing real-mode (translation off) copies, so we set up temporary BATs
; for the passed phys addrs and do the copy with translation on.
;
; Rules are: neither source nor destination can cross a page.
;
; Interrupts must be disabled throughout the copy when this is called.
; To do this, we build a 128 KB DBAT for both the source and sink.  If both are in the
; same block, only one DBAT is loaded.  We do not touch the IBATs, so there is no issue
; if either physical page address is the same as the virtual address of the instructions
; we are executing.
;
; At the end, we invalidate the used DBATs.
;
; Note that the address parameters are long longs.  We will transform these to 64-bit
; values.  Note that on 32-bit architectures this will ignore the high half of the
; passed-in value.  This should be OK since we cannot have addresses bigger than 32 bits
; there anyhow.
;
; Note, this one will not work in user state
;
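; A rough C sketch of the "neither operand may cross a page" check performed
; below (illustrative only; the variable names are not from this file):
;
;       last_src  = src  + nbytes - 1;
;       last_sink = sink + nbytes - 1;
;       if ((((last_src ^ src) | (last_sink ^ sink)) & ~0xFFFUL) != 0)
;           /* an operand crosses a 4 KB page: fall back to bcopy_phys */
;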

        .align 5
        .globl EXT(bcopy_physvir)

LEXT(bcopy_physvir)

        crclr flipcache ; (HACK) No cache flip needed
        mfsprg r8,2 ; get processor feature flags
        rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg
        addic. r0,r7,-1 ; Get length - 1
        rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits
        add r11,r3,r0 ; Point to last byte of sink
        rlwinm r4,r5,0,1,0 ; Duplicate high half of long long paddr into top of reg
        mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test
        rlwimi r4,r6,0,0,31 ; Combine bottom of long long to full 64-bits
        mr r5,r7 ; Get the length into the right register
        cmplw cr1,r3,r4 ; Does source == sink?
        bt++ pf64Bitb,bcopy_phys1 ; if 64-bit processor, use standard routine (no BATs)
        add r12,r4,r0 ; Point to last byte of source
        bltlr- ; Bail if length is 0 or way too big
        xor r7,r11,r3 ; See if we went to next page
        xor r8,r12,r4 ; See if we went to next page
        or r0,r7,r8 ; Combine wrap

//      li r9,((PTE_WIMG_CB_CACHED_COHERENT<<3)|2) ; Set default attributes
        li r9,((2<<3)|2) ; Set default attributes
        rlwinm. r0,r0,0,0,19 ; Did we overflow a page?
        li r7,2 ; Set validity flags
        li r8,2 ; Set validity flags
        bne- bcopy_phys1 ; Overflowed page, do normal physical copy...

        crset restorex ; Remember to trash BATs on the way out
        rlwimi r11,r9,0,15,31 ; Set sink lower DBAT value
        rlwimi r12,r9,0,15,31 ; Set source lower DBAT value
        rlwimi r7,r11,0,0,14 ; Set sink upper DBAT value
        rlwimi r8,r12,0,0,14 ; Set source upper DBAT value
        cmplw cr1,r11,r12 ; See if sink and source are same block

        sync

        mtdbatl 0,r11 ; Set sink lower DBAT
        mtdbatu 0,r7 ; Set sink upper DBAT

        beq- cr1,bcpvsame ; Source and sink are in same block

        mtdbatl 1,r12 ; Set source lower DBAT
        mtdbatu 1,r8 ; Set source upper DBAT

bcpvsame:
        sync ; wait for BAT to stabilize
        isync
        mr r6,r3 ; Set source
        crclr noncache ; Set cached
        crclr fixxlate ; Set translation already ok

        b copyit32 ; Go copy it...

;
; void bcopy_phys(from, to, nbytes)
; Turns off data translation before the copy.  Note, this one will
; not work in user state.  This routine is used on 32 and 64-bit
; machines.
;
; Note that the address parameters are long longs.  We will transform these to 64-bit
; values.  Note that on 32-bit architectures this will ignore the high half of the
; passed-in value.  This should be OK since we cannot have addresses bigger than 32 bits
; there anyhow.
;
; Also note that you probably will not be happy if either the sink or source spans across
; the boundary between RAM and I/O space.  There is a good chance of hanging the machine,
; and this code does not check for that, so be careful.
;
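; A rough C equivalent of how each long long parameter is folded into a single
; 64-bit physical address below (sketch only; names are illustrative):
;
;       phys = ((uint64_t)arg_hi << 32) | arg_lo;   /* done for both from and to */
;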

        .align 5
        .globl EXT(bcopy_phys)

LEXT(bcopy_phys)
        crclr flipcache ; (HACK) No cache flip needed
        rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg
        mfsprg r8,2 ; get processor feature flags
        rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits
        rlwinm r4,r5,0,1,0 ; Duplicate high half of long long paddr into top of reg
        mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test
        rlwimi r4,r6,0,0,31 ; Combine bottom of long long to full 64-bits
        mr r5,r7 ; Get the length into the right register

bcopy_phys1: ; enter from bcopy_physvir with pf64Bit already in cr6
        mfmsr r9 ; Get the MSR
        crclr noncache ; Set cached
        bt++ pf64Bitb,bcopy_phys64 ; skip if 64-bit (only they take hint)

; 32-bit CPUs

        sub. r0,r3,r4 ; to==from?
        rlwinm r8,r9,0,MSR_DR_BIT,MSR_DR_BIT ; was translation on?
        cmpwi cr1,r8,0 ; set cr1 beq if translation was off
        oris r8,r8,hi16(MASK(MSR_VEC)) ; Get vector enable
        cmplwi cr7,r5,0 ; Check if we have a 0 length
        beqlr- ; bail if to==from
        ori r8,r8,lo16(MASK(MSR_FP)) ; Get FP
        mr r6,r3 ; Set source
        andc r9,r9,r8 ; Turn off translation if it is on (should be) and FP, VEC
        beqlr- cr7 ; Bail if length is 0

        crclr restorex ; Make sure we do not trash BATs on the way out
        mtmsr r9 ; Set DR translation off
        isync ; Wait for it

        crnot fixxlate,cr1_eq ; Remember to turn on translation if it was
        b copyit32 ; Go copy it...

; 64-bit: turn DR off and SF on, remember if we need to restore on way out.

bcopy_phys64: ; r9 = MSR

        srdi r2,r3,31 ; (HACK) Get a 1 if source is in I/O memory
        srdi. r0,r9,63-MSR_SF_BIT ; set cr0 beq on if SF was off when we were called
        rlwinm r8,r9,MSR_DR_BIT+1,31,31 ; r8 <- DR bit right justified
        cmpld cr1,r3,r4 ; to==from?
        li r0,1 ; Note - we use this in a couple places below
        lis r6,hi16(MASK(MSR_VEC)) ; Get vector enable
        cmpwi cr7,r5,0 ; length==0 ?
        ori r6,r6,lo16(MASK(MSR_FP)|MASK(MSR_DR)) ; Add in FP and DR
        beqlr-- cr1 ; bail if to==from
        srdi r10,r4,31 ; (HACK) Get a 1 if sink is in I/O memory
        rldimi r9,r0,63,MSR_SF_BIT ; set SF on
        beqlr-- cr7 ; bail if length==0
        andc r9,r9,r6 ; turn DR, VEC, FP off
        cmpwi cr1,r8,0 ; was DR on?
        crmove restorex,cr0_eq ; if SF was off, remember to turn back off before we return
        mtmsrd r9 ; turn 64-bit addressing on, data translation off
        cmpldi cr0,r2,1 ; (HACK) Is source in I/O memory?
        isync ; wait for it to happen
        mr r6,r3 ; Set source
        cmpldi cr7,r10,1 ; (HACK) Is sink in I/O memory?
        crnot fixxlate,cr1_eq ; if DR was on, remember to turn back on before we return

        cror flipcache,cr0_eq,cr7_eq ; (HACK) See if either source or sink is in I/O area

        rlwinm r10,r9,MSR_EE_BIT+1,31,31 ; (HACK GLORIOUS HACK) Isolate the EE bit
        sldi r11,r0,31-MSR_EE_BIT ; (HACK GLORIOUS HACK) Get a mask for the EE bit
        sldi r0,r0,32+8 ; (HACK) Get the right bit to turn off caching
        bf++ flipcache,copyit64 ; (HACK) No need to mess with caching...

;
; HACK GLORIOUS HACK - when we force off caching, we also need to force off
; interruptions.  We are out of CR bits, so we need to stash the entry EE
; somewhere.  It is in the XER....  We NEED to change this!!!!
;

        mtxer r10 ; (HACK GLORIOUS HACK) Remember EE
        andc r9,r9,r11 ; (HACK GLORIOUS HACK) Turn off EE bit
        mfspr r2,hid4 ; (HACK) Get HID4
        crset noncache ; (HACK) Set non-cached
        mtmsrd r9 ; (HACK GLORIOUS HACK) Force off EE
        or r2,r2,r0 ; (HACK) Set bit to make real accesses cache-inhibited
        sync ; (HACK) Sync up
        li r0,1
        mtspr hid4,r2 ; (HACK) Make real accesses cache-inhibited
        isync ; (HACK) Toss prefetches

        lis r12,0xE000 ; (HACK) Get the unlikeliest ESID possible
        srdi r12,r12,1 ; (HACK) Make 0x7FFFFFFFF0000000
        slbie r12 ; (HACK) Make sure the ERAT is cleared

        sync ; (HACK)
        isync ; (HACK)

        b copyit64

;
; void bcopy(from, to, nbytes)
;

        .align 5
        .globl EXT(bcopy)

LEXT(bcopy)

        crclr noncache ; Set cached

bcpswap:
        crclr flipcache ; (HACK) No cache flip needed
        mfsprg r8,2 ; get processor feature flags
        sub. r0,r4,r3 ; test for to==from in mode-independent way
        mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test
        cmpwi cr1,r5,0 ; Check if we have a 0 length
        crclr restorex ; Make sure we do not trash BATs on the way out
        mr r6,r3 ; Set source
        crclr fixxlate ; Set translation already ok
        beqlr- ; Bail if "to" and "from" are the same
        beqlr- cr1 ; Bail if length is 0
        bt++ pf64Bitb,copyit64 ; handle 64-bit processor
        b copyit32 ; Go copy it...

;
; When we move the memory, forward overlaps must be handled.  We
; also cannot use the cache instructions if we are from bcopy_nc.
; We need to preserve R3 because it needs to be returned for memcpy.
; We can be interrupted and lose control here.
;
; There is no stack, so in order to use vectors, we would
; need to take the vector exception.  Any potential gains by using vectors
; would be more than eaten up by this.
;
; NOTE: this code is called in three "modes":
;  - on 32-bit processors (32-byte cache line)
;  - on 64-bit processors running in 32-bit mode (128-byte cache line)
;  - on 64-bit processors running in 64-bit mode (128-byte cache line)
;
; ALSO NOTE: bcopy is called from copyin and copyout etc
; with the "thread_recover" ptr set.  This means bcopy must not set up a
; stack frame or touch non-volatile registers, and also means that it
; cannot rely on turning off interrupts, because we expect to get DSIs
; and have execution aborted by a "longjmp" to the thread_recover
; routine.
;

        .align 5
        .globl EXT(memcpy)
; NB: memcpy is only called in 32-bit mode, albeit on both 32- and 64-bit
; processors...
LEXT(memcpy)
        crclr flipcache ; (HACK) No cache flip needed
        mfsprg r8,2 ; get processor feature flags
        cmplw cr1,r3,r4 ; "to" and "from" the same?
        mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test
        mr r6,r4 ; Set the "from"
        mr. r5,r5 ; Length zero?
        crclr noncache ; Set cached
        mr r4,r3 ; Set the "to"
        crclr fixxlate ; Set translation already ok
        beqlr- cr1 ; "to" and "from" are the same
        beqlr- ; Length is 0
        crclr restorex ; Make sure we do not trash BATs on the way out
        bt++ pf64Bitb,copyit64 ; handle 64-bit processors

copyit32:   sub r12,r4,r6 ; Get potential overlap (negative if backward move)
        lis r8,0x7FFF ; Start up a mask
        srawi r11,r12,31 ; Propagate the sign bit
        dcbt br0,r6 ; Touch in the first source line
        cntlzw r7,r5 ; Get the highest power of 2 factor of the length
        ori r8,r8,0xFFFF ; Make limit 0x7FFFFFFF
        xor r9,r12,r11 ; If sink - source was negative, invert bits
        srw r8,r8,r7 ; Get move length limitation
        sub r9,r9,r11 ; If sink - source was negative, add 1 and get absolute value
        cmplw r12,r5 ; See if we actually forward overlap
        cmplwi cr7,r9,32 ; See if at least a line between source and sink
        dcbtst br0,r4 ; Touch in the first sink line
        cmplwi cr1,r5,32 ; Are we moving more than a line?
        cror noncache,noncache,cr7_lt ; Set to not DCBZ output line if not enough space
        blt- fwdovrlap ; This is a forward overlapping area, handle it...

;
; R4 = sink
; R5 = length
; R6 = source
;

;
; Here we figure out how much we have to move to get the sink onto a
; cache boundary.  If we can, and there are still more than 32 bytes
; left to move, we can really speed things up by DCBZing the sink line.
; We cannot do this if noncache is set because we will take an
; alignment exception.
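;
; Roughly, in C (a sketch only; the limit mask was computed above in copyit32,
; and the names here are illustrative, not from this file):
;
;       lead  = (-(uintptr_t)sink) & 31;               /* bytes to the next 32-byte boundary */
;       lead &= 0x7FFFFFFF >> __builtin_clz(len);      /* never align past the operand       */
;       len  -= lead;                                  /* then move "lead" bytes piecemeal   */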

G4word: ; enter from 64-bit case with word aligned uncached operands
        neg r0,r4 ; Get the number of bytes to move to align to a line boundary
        rlwinm. r0,r0,0,27,31 ; Clean it up and test it
        and r0,r0,r8 ; limit to the maximum front end move
        mtcrf 3,r0 ; Make branch mask for partial moves
        sub r5,r5,r0 ; Set the length left to move
        beq alline ; Already on a line...

        bf 31,alhalf ; No single byte to do...
        lbz r7,0(r6) ; Get the byte
        addi r6,r6,1 ; Point to the next
        stb r7,0(r4) ; Save the single
        addi r4,r4,1 ; Bump sink

; Sink is halfword aligned here

alhalf: bf 30,alword ; No halfword to do...
        lhz r7,0(r6) ; Get the halfword
        addi r6,r6,2 ; Point to the next
        sth r7,0(r4) ; Save the halfword
        addi r4,r4,2 ; Bump sink

; Sink is word aligned here

alword: bf 29,aldouble ; No word to do...
        lwz r7,0(r6) ; Get the word
        addi r6,r6,4 ; Point to the next
        stw r7,0(r4) ; Save the word
        addi r4,r4,4 ; Bump sink

; Sink is double aligned here

aldouble: bf 28,alquad ; No double to do...
        lwz r7,0(r6) ; Get the first word
        lwz r8,4(r6) ; Get the second word
        addi r6,r6,8 ; Point to the next
        stw r7,0(r4) ; Save the first word
        stw r8,4(r4) ; Save the second word
        addi r4,r4,8 ; Bump sink

; Sink is quadword aligned here

alquad: bf 27,alline ; No quad to do...
        lwz r7,0(r6) ; Get the first word
        lwz r8,4(r6) ; Get the second word
        lwz r9,8(r6) ; Get the third word
        stw r7,0(r4) ; Save the first word
        lwz r11,12(r6) ; Get the fourth word
        addi r6,r6,16 ; Point to the next
        stw r8,4(r4) ; Save the second word
        stw r9,8(r4) ; Save the third word
        stw r11,12(r4) ; Save the fourth word
        addi r4,r4,16 ; Bump sink

; Sink is line aligned here

alline: rlwinm. r0,r5,27,5,31 ; Get the number of full lines to move
        mtcrf 3,r5 ; Make branch mask for backend partial moves
        rlwinm r11,r5,0,0,26 ; Get number of bytes we are going to move
        beq- backend ; No full lines to move

        sub r5,r5,r11 ; Calculate the residual
        li r10,96 ; Stride for touch ahead

nxtline: subic. r0,r0,1 ; Account for the line now

        bt- noncache,skipz ; Skip if we are not cached...
        dcbz br0,r4 ; Blow away the whole line because we are replacing it
        dcbt r6,r10 ; Touch ahead a bit

skipz:  lwz r7,0(r6) ; Get the first word
        lwz r8,4(r6) ; Get the second word
        lwz r9,8(r6) ; Get the third word
        stw r7,0(r4) ; Save the first word
        lwz r11,12(r6) ; Get the fourth word
        stw r8,4(r4) ; Save the second word
        lwz r7,16(r6) ; Get the fifth word
        stw r9,8(r4) ; Save the third word
        lwz r8,20(r6) ; Get the sixth word
        stw r11,12(r4) ; Save the fourth word
        lwz r9,24(r6) ; Get the seventh word
        stw r7,16(r4) ; Save the fifth word
        lwz r11,28(r6) ; Get the eighth word
        addi r6,r6,32 ; Point to the next
        stw r8,20(r4) ; Save the sixth word
        stw r9,24(r4) ; Save the seventh word
        stw r11,28(r4) ; Save the eighth word
        addi r4,r4,32 ; Bump sink
        bgt+ nxtline ; Do the next line, if any...

; Move backend quadword

backend: bf 27,noquad ; No quad to do...
        lwz r7,0(r6) ; Get the first word
        lwz r8,4(r6) ; Get the second word
        lwz r9,8(r6) ; Get the third word
        lwz r11,12(r6) ; Get the fourth word
        stw r7,0(r4) ; Save the first word
        addi r6,r6,16 ; Point to the next
        stw r8,4(r4) ; Save the second word
        stw r9,8(r4) ; Save the third word
        stw r11,12(r4) ; Save the fourth word
        addi r4,r4,16 ; Bump sink

; Move backend double

noquad: bf 28,nodouble ; No double to do...
        lwz r7,0(r6) ; Get the first word
        lwz r8,4(r6) ; Get the second word
        addi r6,r6,8 ; Point to the next
        stw r7,0(r4) ; Save the first word
        stw r8,4(r4) ; Save the second word
        addi r4,r4,8 ; Bump sink

; Move backend word

nodouble: bf 29,noword ; No word to do...
        lwz r7,0(r6) ; Get the word
        addi r6,r6,4 ; Point to the next
        stw r7,0(r4) ; Save the word
        addi r4,r4,4 ; Bump sink

; Move backend halfword

noword: bf 30,nohalf ; No halfword to do...
        lhz r7,0(r6) ; Get the halfword
        addi r6,r6,2 ; Point to the next
        sth r7,0(r4) ; Save the halfword
        addi r4,r4,2 ; Bump sink

; Move backend byte

nohalf: bf 31,bcpydone ; Leave cuz we are all done...
        lbz r7,0(r6) ; Get the byte
        stb r7,0(r4) ; Save the single

bcpydone:
        mfmsr r9 ; Get the MSR
        bf++ flipcache,bcpydone0 ; (HACK) No need to mess with caching...

        li r0,1 ; (HACK) Get a 1
        mfxer r10 ; (HACK GLORIOUS HACK) Get the entry EE
        sldi r0,r0,32+8 ; (HACK) Get the right bit to turn off caching
        mfspr r2,hid4 ; (HACK) Get HID4
        rlwinm r10,r10,31-MSR_EE_BIT,MSR_EE_BIT,MSR_EE_BIT ; (HACK GLORIOUS HACK) Set the EE bit
        andc r2,r2,r0 ; (HACK) Clear the bit that makes real accesses cache-inhibited
        or r9,r9,r10 ; (HACK GLORIOUS HACK) Set the EE in MSR
        sync ; (HACK) Sync up
        mtspr hid4,r2 ; (HACK) Make real accesses not cache-inhibited
        isync ; (HACK) Toss prefetches

        lis r12,0xE000 ; (HACK) Get the unlikeliest ESID possible
        srdi r12,r12,1 ; (HACK) Make 0x7FFFFFFFF0000000
        slbie r12 ; (HACK) Make sure the ERAT is cleared

        mtmsr r9 ; (HACK GLORIOUS HACK) Set EE properly

bcpydone0:
        lis r0,hi16(MASK(MSR_VEC)) ; Get the vector bit
        ori r0,r0,lo16(MASK(MSR_FP)) ; Get the float bit
        bf++ fixxlate,bcpydone1 ; skip if we do not need to fix translation...
        ori r9,r9,lo16(MASK(MSR_DR)) ; Turn data translation on
        andc r9,r9,r0 ; Make sure that FP and VEC are off
        mtmsr r9 ; Just do it
        isync ; Hang in there

bcpydone1:
        bflr++ restorex ; done if we do not have to fix up addressing
        mfsprg r8,2 ; get the feature flags again
        mtcrf 0x02,r8 ; put pf64Bit where we can test it
        bt++ pf64Bitb,bcpydone2 ; skip if 64-bit processor

; 32-bit processor, so clear out the BATs we set up for bcopy_physvir

        li r0,0 ; Get set to invalidate upper half
        sync ; Make sure all is well
        mtdbatu 0,r0 ; Clear sink upper DBAT
        mtdbatu 1,r0 ; Clear source upper DBAT
        sync
        isync
        blr

; 64-bit processor, so turn off 64-bit mode we turned on to do bcopy_phys

bcpydone2:
        mfmsr r9 ; get MSR again
        andc r9,r9,r0 ; Make sure that FP and VEC are off
        rldicl r9,r9,0,MSR_SF_BIT+1 ; clear SF
        mtmsrd r9
        isync
        blr


;
; 0123456789ABCDEF0123456789ABCDEF
; 0123456789ABCDEF0123456789ABCDEF
; F
; DE
; 9ABC
; 12345678
; 123456789ABCDEF0
; 0

;
; Here is where we handle a forward overlapping move.  These will be slow
; because we cannot kill the cache of the destination until after we have
; loaded/saved the source area.  Also, reading memory backwards is slower
; when the cache line needs to be loaded, because the critical doubleword
; is loaded first, i.e., the last one in the line; the fill then wraps back
; to the first doubleword and continues in order.  That means that when we
; are at the second-to-last DW we have to wait until the whole line is in
; cache before we can proceed.
;
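; The choice between the forward and backward loops amounts to the usual
; memmove-style test done in copyit32 above, roughly (a C sketch only;
; names are illustrative):
;
;       if ((uintptr_t)(sink - source) < len)
;           /* sink overlaps the tail of source: copy from the last byte down */
;       else
;           /* safe to copy ascending */
;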

G4reverseWord: ; here from 64-bit code with word aligned uncached operands
fwdovrlap:  add r4,r5,r4 ; Point past the last sink byte
        add r6,r5,r6 ; Point past the last source byte
        and r0,r4,r8 ; Apply movement limit
        li r12,-1 ; Make sure we touch in the actual line
        mtcrf 3,r0 ; Figure out the best way to move backwards
        dcbt r12,r6 ; Touch in the last line of source
        rlwinm. r0,r0,0,27,31 ; Calculate the length to adjust to cache boundary
        dcbtst r12,r4 ; Touch in the last line of the sink
        beq- balline ; Already on cache line boundary

        sub r5,r5,r0 ; Precalculate move length left after alignment

        bf 31,balhalf ; No single byte to do...
        lbz r7,-1(r6) ; Get the byte
        subi r6,r6,1 ; Point to the next
        stb r7,-1(r4) ; Save the single
        subi r4,r4,1 ; Bump sink

; Sink is halfword aligned here

balhalf: bf 30,balword ; No halfword to do...
        lhz r7,-2(r6) ; Get the halfword
        subi r6,r6,2 ; Point to the next
        sth r7,-2(r4) ; Save the halfword
        subi r4,r4,2 ; Bump sink

; Sink is word aligned here

balword: bf 29,baldouble ; No word to do...
        lwz r7,-4(r6) ; Get the word
        subi r6,r6,4 ; Point to the next
        stw r7,-4(r4) ; Save the word
        subi r4,r4,4 ; Bump sink

; Sink is double aligned here

baldouble: bf 28,balquad ; No double to do...
        lwz r7,-8(r6) ; Get the first word
        lwz r8,-4(r6) ; Get the second word
        subi r6,r6,8 ; Point to the next
        stw r7,-8(r4) ; Save the first word
        stw r8,-4(r4) ; Save the second word
        subi r4,r4,8 ; Bump sink

; Sink is quadword aligned here

balquad: bf 27,balline ; No quad to do...
        lwz r7,-16(r6) ; Get the first word
        lwz r8,-12(r6) ; Get the second word
        lwz r9,-8(r6) ; Get the third word
        lwz r11,-4(r6) ; Get the fourth word
        stw r7,-16(r4) ; Save the first word
        subi r6,r6,16 ; Point to the next
        stw r8,-12(r4) ; Save the second word
        stw r9,-8(r4) ; Save the third word
        stw r11,-4(r4) ; Save the fourth word
        subi r4,r4,16 ; Bump sink

; Sink is line aligned here

balline: rlwinm. r0,r5,27,5,31 ; Get the number of full lines to move
        mtcrf 3,r5 ; Make branch mask for backend partial moves
        beq- bbackend ; No full lines to move


; Registers in use: R0, R1, R3, R4, R5, R6
; Registers not in use: R2, R7, R8, R9, R10, R11, R12 - Ok, we can make another free for 8 of them

bnxtline: subic. r0,r0,1 ; Account for the line now

        lwz r7,-32(r6) ; Get the first word
        lwz r5,-28(r6) ; Get the second word
        lwz r2,-24(r6) ; Get the third word
        lwz r12,-20(r6) ; Get the fourth word
        lwz r11,-16(r6) ; Get the fifth word
        lwz r10,-12(r6) ; Get the sixth word
        lwz r9,-8(r6) ; Get the seventh word
        lwz r8,-4(r6) ; Get the eighth word
        subi r6,r6,32 ; Point to the next

        stw r7,-32(r4) ; Save the first word
        ble- bnotouch ; Last time, skip touch of source...
        dcbt br0,r6 ; Touch in next source line

bnotouch: stw r5,-28(r4) ; Save the second word
        stw r2,-24(r4) ; Save the third word
        stw r12,-20(r4) ; Save the fourth word
        stw r11,-16(r4) ; Save the fifth word
        stw r10,-12(r4) ; Save the sixth word
        stw r9,-8(r4) ; Save the seventh word
        stw r8,-4(r4) ; Save the eighth word
        subi r4,r4,32 ; Bump sink

        bgt+ bnxtline ; Do the next line, if any...

;
; Note: We touched these lines in at the beginning
;

; Move backend quadword

bbackend: bf 27,bnoquad ; No quad to do...
        lwz r7,-16(r6) ; Get the first word
        lwz r8,-12(r6) ; Get the second word
        lwz r9,-8(r6) ; Get the third word
        lwz r11,-4(r6) ; Get the fourth word
        stw r7,-16(r4) ; Save the first word
        subi r6,r6,16 ; Point to the next
        stw r8,-12(r4) ; Save the second word
        stw r9,-8(r4) ; Save the third word
        stw r11,-4(r4) ; Save the fourth word
        subi r4,r4,16 ; Bump sink

; Move backend double

bnoquad: bf 28,bnodouble ; No double to do...
        lwz r7,-8(r6) ; Get the first word
        lwz r8,-4(r6) ; Get the second word
        subi r6,r6,8 ; Point to the next
        stw r7,-8(r4) ; Save the first word
        stw r8,-4(r4) ; Save the second word
        subi r4,r4,8 ; Bump sink

; Move backend word

bnodouble: bf 29,bnoword ; No word to do...
        lwz r7,-4(r6) ; Get the word
        subi r6,r6,4 ; Point to the next
        stw r7,-4(r4) ; Save the word
        subi r4,r4,4 ; Bump sink

; Move backend halfword

bnoword: bf 30,bnohalf ; No halfword to do...
        lhz r7,-2(r6) ; Get the halfword
        subi r6,r6,2 ; Point to the next
        sth r7,-2(r4) ; Save the halfword
        subi r4,r4,2 ; Bump sink

; Move backend byte

bnohalf: bf 31,bcpydone ; Leave cuz we are all done...
        lbz r7,-1(r6) ; Get the byte
        stb r7,-1(r4) ; Save the single

        b bcpydone ; Go exit cuz we are all done...


// Here on 64-bit processors, which have a 128-byte cache line.  This can be
// called either in 32 or 64-bit mode, which makes the test for reverse moves
// a little tricky.  We've already filtered out the (source==dest) and (len==0)
// special cases.
//
// When entered:
//      r4 = destination (32 or 64-bit ptr)
//      r5 = length (always 32 bits)
//      r6 = source (32 or 64-bit ptr)
//      cr5 = noncache, fixxlate, flipcache, and restorex flags set
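//
// Rough C-level intent of the mode check and pointer clean-up at the top of
// copyit64 (a sketch only; "running in 32-bit mode" is detected below via the
// CR0 result of "add.", which reflects only the low 32 bits in 32-bit mode):
//
//      if (running_in_32_bit_mode) {
//          dest   &= 0xFFFFFFFF;       /* ignore stale upper register halves */
//          source &= 0xFFFFFFFF;
//          length &= 0xFFFFFFFF;
//      }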

        .align 5
copyit64:
        lis r2,0x4000 // r2 = 0x00000000 40000000
        neg r12,r4 // start to compute #bytes to align dest
        bt-- noncache,noncache1 // (HACK) Do not even try anything cached...
        dcbt 0,r6 // touch in 1st block of source
noncache1:
        add. r2,r2,r2 // if 0x00000000 80000000 < 0, we are in 32-bit mode
        cntlzw r9,r5 // get highest power-of-2 in length
        rlwinm r7,r12,0,25,31 // r7 <- bytes to 128-byte align dest
        bt-- noncache,noncache2 // (HACK) Do not even try anything cached...
        dcbtst 0,r4 // touch in 1st destination cache block
noncache2:
        sraw r2,r2,r9 // get mask with 1s for leading 0s in length, plus 1 more 1-bit
        bge copyit64a // skip if we are running in 64-bit mode
        rlwinm r4,r4,0,0,31 // running in 32-bit mode, so truncate ptrs and lengths to 32 bits
        rlwinm r5,r5,0,0,31
        rlwinm r6,r6,0,0,31
copyit64a: // now we can use 64-bit compares even if running in 32-bit mode
        sub r8,r4,r6 // get (dest-source)
        andc r7,r7,r2 // limit bytes to align by operand length
        cmpld cr1,r8,r5 // if (dest-source)<length, must move reverse
        bt-- noncache,c64uncached // skip if uncached
        blt-- cr1,c64rdouble // handle cached reverse moves


// Forward, cached or doubleword aligned uncached.  This is the common case.
//   r4-r6 = dest, length, source (as above)
//   r7 = #bytes 128-byte align dest (limited by copy length)
//   cr5 = flags, as above

c64double:
        andi. r8,r7,7 // r8 <- #bytes to doubleword align
        srwi r9,r7,3 // r9 <- #doublewords to 128-byte align
        sub r5,r5,r7 // adjust length remaining
        cmpwi cr1,r9,0 // any doublewords to move to cache align?
        srwi r10,r5,7 // r10 <- 128-byte chunks to xfer after aligning dest
        cmpwi cr7,r10,0 // set cr7 on chunk count
        beq c64double2 // dest already doubleword aligned
        mtctr r8
        b c64double1

        .align 5 // align inner loops
c64double1: // copy bytes until dest is doubleword aligned
        lbz r0,0(r6)
        addi r6,r6,1
        stb r0,0(r4)
        addi r4,r4,1
        bdnz c64double1

c64double2: // r9/cr1=doublewords, r10=128-byte chunks, cr7=blt if r5==0
        beq cr1,c64double4 // no doublewords to xfer in order to cache align
        mtctr r9
        b c64double3

        .align 5 // align inner loops
c64double3: // copy doublewords until dest is 128-byte aligned
        ld r7,0(r6)
        addi r6,r6,8
        std r7,0(r4)
        addi r4,r4,8
        bdnz c64double3

// Here to xfer 128-byte chunks, if any.  Because the IBM 970 cannot issue two stores/cycle,
// we pipeline the inner loop so we can pair loads and stores.  Since we only have 8 GPRs for
// data (64 bytes), we load/store each twice per 128-byte chunk.
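//
// The loop below is software pipelined; a rough pseudo-C sketch of its shape
// (illustrative only, with eight doublewords making up each half-line):
//
//      load 1st half of chunk 0;                           /* prologue */
//      store 1st half of chunk 0; load 2nd half of chunk 0;
//      for (n = 1; n < chunks; n++) {
//          store 2nd half of chunk n-1; load 1st half of chunk n;
//          store 1st half of chunk n;   load 2nd half of chunk n;
//      }
//      store 2nd half of the last chunk;                   /* epilogue */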

c64double4: // r10/cr7=128-byte chunks
        rlwinm r0,r5,29,28,31 // r0 <- count of leftover doublewords, after moving chunks
        cmpwi cr1,r0,0 // set cr1 on leftover doublewords
        beq cr7,c64double7 // no 128-byte chunks
        sub r8,r6,r4 // r8 <- (source - dest)
        li r9,128 // start at next cache line (we've already touched in 1st line)
        cmpldi cr7,r8,128 // if (source-dest)<128, cannot use dcbz128 because of overlap
        cror noncache,cr7_lt,noncache // turn on "noncache" flag if (source-dest)<128
        bt-- noncache,noncache3 // (HACK) Skip cache touch if noncachable
        dcbt128 r9,r6,1 // start forward stream
noncache3:
        mtctr r10

        ld r0,0(r6) // start pipe: load 1st half-line
        ld r2,8(r6)
        ld r7,16(r6)
        ld r8,24(r6)
        ld r9,32(r6)
        ld r10,40(r6)
        ld r11,48(r6)
        ld r12,56(r6)
        b c64InnerLoopEntryPt

        .align 5 // align inner loop
c64InnerLoop: // loop copying 128-byte cache lines to 128-aligned destination
        std r0,64(r4) // store 2nd half of chunk n
        ld r0,0(r6) // load 1st half of chunk n+1
        std r2,72(r4)
        ld r2,8(r6)
        std r7,80(r4)
        ld r7,16(r6)
        std r8,88(r4)
        ld r8,24(r6)
        std r9,96(r4)
        ld r9,32(r6)
        std r10,104(r4)
        ld r10,40(r6)
        std r11,112(r4)
        ld r11,48(r6)
        std r12,120(r4)
        ld r12,56(r6)
        addi r4,r4,128 // advance to next dest chunk
c64InnerLoopEntryPt: // initial entry into loop, with 1st halfline loaded
        bt noncache,c64InnerLoop1 // skip if uncached or overlap
        dcbz128 0,r4 // avoid prefetch of next cache line
c64InnerLoop1:
        std r0,0(r4) // store 1st half of chunk n
        ld r0,64(r6) // load 2nd half of chunk n
        std r2,8(r4)
        ld r2,72(r6)
        std r7,16(r4)
        ld r7,80(r6)
        std r8,24(r4)
        ld r8,88(r6)
        std r9,32(r4)
        ld r9,96(r6)
        std r10,40(r4)
        ld r10,104(r6)
        std r11,48(r4)
        ld r11,112(r6)
        std r12,56(r4)
        ld r12,120(r6)
        addi r6,r6,128 // advance to next source chunk if any
        bdnz c64InnerLoop // loop if more chunks

        std r0,64(r4) // store 2nd half of last chunk
        std r2,72(r4)
        std r7,80(r4)
        std r8,88(r4)
        std r9,96(r4)
        std r10,104(r4)
        std r11,112(r4)
        std r12,120(r4)
        addi r4,r4,128 // advance to next dest chunk

c64double7: // r5 <- leftover bytes, cr1 set on doubleword count
        rlwinm r0,r5,29,28,31 // r0 <- count of leftover doublewords (0-15)
        andi. r5,r5,7 // r5/cr0 <- count of leftover bytes (0-7)
        beq cr1,c64byte // no leftover doublewords
        mtctr r0
        b c64double8

        .align 5 // align inner loop
c64double8: // loop copying leftover doublewords
        ld r0,0(r6)
        addi r6,r6,8
        std r0,0(r4)
        addi r4,r4,8
        bdnz c64double8


// Forward byte loop.

c64byte: // r5/cr0 <- byte count (can be big if unaligned uncached)
        beq bcpydone // done if no leftover bytes
        mtctr r5
        b c64byte1

        .align 5 // align inner loop
c64byte1:
        lbz r0,0(r6)
        addi r6,r6,1
        stb r0,0(r4)
        addi r4,r4,1
        bdnz c64byte1

        b bcpydone


// Uncached copies.  We must avoid unaligned accesses, since they always take alignment
// exceptions on uncached memory on 64-bit processors.  This may mean we copy long operands
// a byte at a time, but that is still much faster than alignment exceptions.
//   r4-r6 = dest, length, source (as above)
//   r2 = mask of 1s for leading 0s in length, plus 1 extra 1
//   r7 = #bytes to copy to 128-byte align dest (limited by operand length)
//   cr1 = blt if reverse move required
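//
// The dispatch below keys off the relative alignment of the two operands,
// roughly (a C sketch only; names are illustrative):
//
//      rel = (uintptr_t)source ^ (uintptr_t)dest;
//      if      ((rel & 7) == 0)  /* copy by doublewords            */
//      else if ((rel & 3) == 0)  /* copy by words, via G3/G4 code  */
//      else                      /* copy by bytes                  */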

c64uncached:
        xor r0,r6,r4 // get relative alignment
        rlwinm r10,r0,0,29,31 // relatively doubleword aligned?
        rlwinm r11,r0,0,30,31 // relatively word aligned?
        not r8,r2 // get mask to limit initial length of copy for G4word
        blt cr1,c64reverseUncached

        cmpwi cr0,r10,0 // set cr0 beq if doubleword aligned
        cmpwi cr1,r11,0 // set cr1 beq if word aligned
        beq cr0,c64double // doubleword aligned
        beq cr1,G4word // word aligned, use G3/G4 code
        cmpwi r5,0 // set cr0 on byte count
        b c64byte // unaligned operands

c64reverseUncached:
        cmpwi cr0,r10,0 // set cr0 beq if doubleword aligned
        cmpwi cr1,r11,0 // set cr1 beq if word aligned
        beq cr0,c64rdouble // doubleword aligned so can use LD/STD
        beq cr1,G4reverseWord // word aligned, use G3/G4 code
        add r6,r6,r5 // point to (end+1) of source and dest
        add r4,r4,r5
        cmpwi r5,0 // set cr0 on length
        b c64rbyte // copy a byte at a time


// Reverse doubleword copies.  This is used for all cached copies, and doubleword
// aligned uncached copies.
//   r4 = destination (32 or 64-bit ptr)
//   r5 = length (always 32 bits)
//   r6 = source (32 or 64-bit ptr)
//   cr5 = noncache, fixxlate, and restorex flags set

c64rdouble:
        add r6,r6,r5 // point to (end+1) of source and dest
        add r4,r4,r5
        rlwinm. r7,r4,0,29,31 // r7 <- #bytes to doubleword align dest
        cmplw cr1,r7,r5 // operand long enough to doubleword align?
        blt cr1,c64rd0 // yes
        mr r7,r5 // no
c64rd0:
        sub r5,r5,r7 // adjust length
        srwi r8,r5,6 // r8 <- 64-byte chunks to xfer
        cmpwi cr1,r8,0 // any chunks?
        beq c64rd2 // dest already doubleword aligned
        mtctr r7

c64rd1: // copy bytes until dest is doubleword aligned
        lbzu r0,-1(r6)
        stbu r0,-1(r4)
        bdnz c64rd1

c64rd2: // r8/cr1 <- count of 64-byte chunks
        rlwinm r0,r5,29,29,31 // r0 <- count of leftover doublewords
        andi. r5,r5,7 // r5/cr0 <- count of leftover bytes
        cmpwi cr7,r0,0 // leftover doublewords?
        beq cr1,c64rd4 // no chunks to xfer
        li r9,-128 // start at next cache line
        mtctr r8
        bt noncache,c64rd3 // (HACK) Do not start a stream if noncachable...
        dcbt128 r9,r6,3 // start reverse stream
        b c64rd3

        .align 5 // align inner loop
c64rd3: // loop copying 64-byte chunks
        ld r7,-8(r6)
        ld r8,-16(r6)
        ld r9,-24(r6)
        ld r10,-32(r6)
        ld r11,-40(r6)
        ld r12,-48(r6)
        std r7,-8(r4)
        std r8,-16(r4)
        ld r7,-56(r6)
        ldu r8,-64(r6)
        std r9,-24(r4)
        std r10,-32(r4)
        std r11,-40(r4)
        std r12,-48(r4)
        std r7,-56(r4)
        stdu r8,-64(r4)
        bdnz c64rd3

c64rd4: // r0/cr7 = leftover doublewords, r5/cr0 = leftover bytes
        beq cr7,c64rbyte // no leftover doublewords
        mtctr r0

c64rd5: // loop copying leftover doublewords
        ldu r0,-8(r6)
        stdu r0,-8(r4)
        bdnz c64rd5


// Reverse byte loop.

c64rbyte: // r5/cr0 <- byte count (can be big if unaligned uncached)
        beq bcpydone // done if no leftover bytes
        mtctr r5

c64rbyte1:
        lbzu r0,-1(r6)
        stbu r0,-1(r4)
        bdnz c64rbyte1

        b bcpydone
