]> git.saurik.com Git - apple/xnu.git/blob - bsd/dev/ppc/xsumas.s
xnu-792.13.8.tar.gz
[apple/xnu.git] / bsd / dev / ppc / xsumas.s
1 /*
2 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the
10 * License may not be used to create, or enable the creation or
11 * redistribution of, unlawful or unlicensed copies of an Apple operating
12 * system, or to circumvent, violate, or enable the circumvention or
13 * violation of, any terms of an Apple operating system software license
14 * agreement.
15 *
16 * Please obtain a copy of the License at
17 * http://www.opensource.apple.com/apsl/ and read it before using this
18 * file.
19 *
20 * The Original Code and all software distributed under the License are
21 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
22 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
23 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
24 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
25 * Please see the License for the specific language governing rights and
26 * limitations under the License.
27 *
28 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
29 */
30
// kShort: operands of this many bytes or fewer skip word alignment and are
// summed a halfword at a time in the simple loop at Lshort.
31 #define kShort 11
// cr1_gt: CR bit number (bit 1 of CR field 1, ie the "gt" bit of cr1) used
// throughout to remember the "starting on odd address" flag.
32 #define cr1_gt 5 // bit 1 of cr1
33
34 /*
35 * short xsum_assym( short *p, int len, short xsum, boolean odd);
36 *
37 * r3 - Pointer to data
38 * r4 - Length of data
39 * r5 - Accumulated sum value
40 * r6 -"Starting on odd address" flag (relative to byte 0 of the checksumed data)
41 *
42 * Note: If the "odd" flag is set, the address in r3 will be even. Nonetheless, we
43 * correctly handle the case where the flag is set and the address is odd.
44 *
45 * This is the internet (IP, TCP) checksum algorithm, which is the 1s-complement sum
46 * of the data, treated as an array of 16-bit integers. 1s-complement sums are done
47 * via "add with carry" operations on a 2s-complement machine like PPC. Note that
48 * the adds can be done in parallel on 32-bit (or 64-bit) registers, as long as the
49 * final sum is folded down to 16 bits. On 32-bit machines we use "adde", which is
50 * perfect except that it serializes the adds on the carry bit. On 64-bit machines
51 * we avoid this serialization by adding 32-bit words into 64-bit sums, then folding
52 * all 64-bits into a 16-bit sum at the end. We cannot use "adde" on 64-bit sums,
53 * because the kernel runs in 32-bit mode even on 64-bit machines (so the carry bit
54 * is set on the low 32-bits of the sum.)
55 *
56 * Using Altivec is tempting, but the performance impact of the greatly increased
57 * number of exceptions and register save/restore traffic probably make it impractical
58 * for now.
59 */
60 .globl _xsum_assym
61 .globl _xsum_nop_if_32bit
62 .text
63 .align 5
64 _xsum_assym:
65 cmplwi cr0,r4,kShort ; too short to word align?
66 rlwinm r2,r3,0,0x3 ; get byte offset in word
67 dcbt 0,r3 ; touch in 1st cache line
68 cmpwi cr6,r2,0 ; is address word aligned?
69 ble cr0,Lshort ; skip if too short to bother aligning
70
71 subfic r0,r2,4 ; get #bytes in partial word
72 cmplwi cr1,r6,0 ; set cr1_gt if "starting on odd address" flag is set
73 addic r0,r0,0 ; turn off carry
74 beq cr6,Laligned ; skip if already word aligned (r2==0 if aligned)
75
76 ; Partial word at start: zero filled on left, it becomes initial checksum.
77
78 rlwinm r3,r3,0,0,29 ; word align address
79 mtcrf 0x01,r2 ; move byte offset to cr7
80 lwz r6,0(r3) ; get partial word
81 li r7,-1 ; start of mask for partial fill
82 slwi r8,r2,3 ; multiply byte offset by 8
83 sub r4,r4,r0 ; adjust length for bytes in partial word
; The mtcrf above put the low bit of the byte offset into cr7 bit 31. If the
; data starts on an odd byte within its word, the 16-bit lanes of our
; word-parallel sum are swapped relative to the caller's view, so flip the
; "odd address" flag to compensate.
84 crxor cr1_gt,31,cr1_gt; set flag if byte-lane swap will be necessary
85 srw r7,r7,r8 ; get mask for bytes to keep in partial word
86 addi r3,r3,4 ; point to next word of input
87 and r2,r6,r7 ; zero fill on left
88
89 ; Address is now word aligned. Prepare for inner loop over 32-byte chunks.
90 ; r2 = initial checksum
91 ; r3 = word aligned address
92 ; r4 = length remaining
93 ; r5 = accumulated sum parameter
94 ; carry = off
95 ; cr1_gt = "starting on odd address" flag
96
97 Laligned:
; The two mtcrf's put the low 5 bits of the remaining length into cr6/cr7:
; bit 27 => 16 bytes left, 28 => 8, 29 => 4, 30 => 2, 31 => 1.
; Lleftovers (and the 64-bit path's leftover code) test them with "bf".
98 srwi. r0,r4,5 ; get count of 32-byte chunks
99 mtcrf 0x02,r4 ; move residual length to cr6 and cr7
100 mtcrf 0x01,r4
101 beq cr0,Lleftovers ; no chunks
102
103 mtctr r0 ; set up loop count
104 li r4,32 ; offset to next chunk
105 _xsum_nop_if_32bit:
106 b L64BitPath ; use the 64-bit path (patched to nop on 32-bit machine)
107 dcbt r4,r3 ; touch in 2nd cache line
108 li r0,96 ; get touch offset
109 b LInnerLoop32 ; enter 32-bit loop
110
111 ; Inner loop for 32-bit machines.
; Consumes 32 bytes per iteration; the "adde" chain folds each carry back in,
; which is why the carry bit was cleared (addic above) before entering.
112
113 .align 4
114 LInnerLoop32:
115 lwz r4,0(r3)
116 lwz r6,4(r3)
117 lwz r7,8(r3)
118 lwz r8,12(r3)
119 adde r2,r2,r4
120 lwz r9,16(r3)
121 adde r2,r2,r6
122 lwz r10,20(r3)
123 adde r2,r2,r7
124 lwz r11,24(r3)
125 adde r2,r2,r8
126 lwz r12,28(r3)
127 adde r2,r2,r9
128 dcbt r3,r0
129 adde r2,r2,r10
130 addi r3,r3,32
131 adde r2,r2,r11
132 adde r2,r2,r12
133 bdnz+ LInnerLoop32
134
135 ; Handle leftover bytes.
136 ; r2 = checksum so far
137 ; r3 = word aligned address
138 ; r5 = accumulated sum parameter
139 ; carry = live
140 ; cr1_gt = "starting on odd address" flag
141 ; cr6,cr7 = residual length
142
143 Lleftovers:
144 bf 27,Lleftover8 ; test 0x10 bit of residual length
145 lwz r4,0(r3)
146 lwz r6,4(r3)
147 lwz r7,8(r3)
148 lwz r8,12(r3)
149 addi r3,r3,16
150 adde r2,r2,r4
151 adde r2,r2,r6
152 adde r2,r2,r7
153 adde r2,r2,r8
154 Lleftover8:
155 bf 28,Lleftover4
156 lwz r4,0(r3)
157 lwz r6,4(r3)
158 addi r3,r3,8
159 adde r2,r2,r4
160 adde r2,r2,r6
161 Lleftover4:
162 bf 29,Lleftover2
163 lwz r4,0(r3)
164 addi r3,r3,4
165 adde r2,r2,r4
166 Lleftover2:
167 bf 30,Lleftover1
168 lhz r4,0(r3)
169 addi r3,r3,2
170 adde r2,r2,r4
171 Lleftover1:
172 bf 31,Lwrapup
173 lbz r4,0(r3)
; Big-endian: a lone trailing byte is the HIGH byte of its 16-bit lane,
; hence the shift left by 8.
174 slwi r4,r4,8 ; shift last byte into proper lane
175 adde r2,r2,r4
176
177 ; All data bytes checksummed. Wrap up.
178 ; r2 = checksum so far (word parallel)
179 ; r5 = accumulated sum parameter
180 ; carry = live
181 ; cr1_gt = "starting on odd address" flag
182
183 Lwrapup:
184 addze r2,r2 ; add in last carry
185 addze r2,r2 ; in case the "addze" carries
186 Lwrapupx: ; here from short-operand case, with xer(ca) undefined
; Fold the word-parallel 32-bit sum down to 16 bits. Done twice because the
; first halfword add can itself carry into bit 16.
187 srwi r6,r2,16 ; top half of 32-bit checksum
188 rlwinm r7,r2,0,0xFFFF ; lower half
189 add r2,r6,r7 ; add them together
190 srwi r6,r2,16 ; then do it again, in case first carried
191 rlwinm r7,r2,0,0xFFFF
192 add r2,r6,r7
193 bf cr1_gt,Lswapped ; test "starting on odd address" flag
194
195 ; The checksum began on an odd address, so swap bytes.
196
197 rlwinm r6,r2,24,0x00FF ; move top byte to bottom
198 rlwinm r7,r2,8,0xFF00 ; bottom to top
199 or r2,r6,r7 ; rejoin
200
201 ; Finally, add in checksum passed in as a parameter.
202
203 Lswapped:
204 add r2,r2,r5 ; add passed-in checksum
205 srwi r6,r2,16 ; top half of 32-bit checksum
206 rlwinm r7,r2,0,0xFFFF ; lower half
207 add r2,r6,r7 ; add them together
208 srwi r6,r2,16 ; then do it again, in case first carried
209 rlwinm r7,r2,0,0xFFFF
210 add r3,r6,r7 ; steer result into r3
211 blr
212
213 ; Handle short operands. Do a halfword at a time.
214 ; r3 = address
215 ; r4 = length (<= kShort)
216 ; r5 = accumulated sum parameter
217 ; r6 = "starting on odd byte" flag
218
219 Lshort:
220 cmpwi cr6,r4,2 ; at least two bytes?
221 andi. r0,r4,1 ; odd length? (cr0 set here is tested at Lshort2 below)
222 li r2,0 ; initialize checksum
223 cmplwi cr1,r6,0 ; set cr1_gt if "starting on odd address" flag is set
224 blt cr6,Lshort2 ; fewer than two bytes, so skip
225 Lshort1:
226 cmpwi cr6,r4,4 ; two more bytes (after we decrement)?
227 lhz r7,0(r3)
228 subi r4,r4,2
229 addi r3,r3,2
; With at most kShort (11) bytes we sum at most 6 halfword-sized terms,
; which cannot overflow 32 bits, so plain adds suffice and carry state is
; left undefined (hence the jump to Lwrapupx, not Lwrapup).
230 add r2,r2,r7 ; note no need for "adde"
231 bge cr6,Lshort1 ; loop for 2 more bytes
232 Lshort2:
233 beq Lwrapupx ; no byte at end, proceed to checkout with carry undefined
234 lbz r7,0(r3)
; Big-endian: a lone trailing byte is the HIGH byte of its 16-bit lane.
235 slwi r7,r7,8 ; shift last byte into proper lane
236 add r2,r2,r7
237 b Lwrapupx
238
239 ; Handle 64-bit machine. The major improvement over the 32-bit path is that we use
240 ; four parallel 32-bit accumulators, which carry into the upper half naturally so we
241 ; do not have to use "adde", which serializes on the carry bit. Note that we cannot
242 ; do 64-bit "adde"s, because we run in 32-bit mode so carry would not be set correctly.
243 ; r2 = checksum so far (ie, the zero-filled partial first word)
244 ; r3 = word aligned address
245 ; r5 = accumulated sum parameter
246 ; ctr = number of 32-byte chunks of input
247 ; carry = unused in this code
248 ; cr1_gt = "starting on odd address" flag
249 ; cr6,cr7 = residual length
250
251 L64BitPath:
252 stw r13,-4(r1) ; save a few nonvolatile regs in red zone so we can use them
253 stw r14,-8(r1)
254 stw r15,-12(r1)
255 stw r16,-16(r1)
256 li r0,128 ; to touch next line
257 li r13,0 ; r13-r16 are the accumulators, so initialize them
258 dcbt r3,r0 ; touch in next cache line, and keep loads away from the above stores
259 lwz r4,0(r3) ; start pipeline by loading first 32 bytes into r4, r6-r12
260 lwz r6,4(r3)
261 lwz r7,8(r3)
262 mr r14,r2 ; just copy incoming partial word into one of the accumulators
263 li r15,0
264 lwz r8,12(r3)
265 lwz r9,16(r3)
266 li r16,0
267 li r0,256 ; get touch offset
268 lwz r10,20(r3)
269 lwz r11,24(r3)
270 lwz r12,28(r3) ; load last word of previous chunk
271 addi r3,r3,32 ; skip past the chunk
; ("++"/"--" suffixes are static branch-prediction hints: ++ = likely taken)
272 bdnz++ LInnerLoop64 ; enter loop if another chunk to go
273
274 b LAddLastChunk ; only one chunk
275
276 ; Inner loop for 64-bit processors. This loop is scheduled for the 970.
277 ; It is pipelined (loads are one iteration ahead of adds), and unrolled.
278 ; It should take 9-10 cycles per iteration, which consumes 64 bytes of input.
; NOTE(review): instruction order here is deliberate 970 scheduling -- the
; blank lines mark intended dispatch groups. Do not reorder.
279
280 .align 5
281 LInnerLoop64: ; 64 bytes/iteration
282 add r13,r13,r4 ; cycle 1
283 add r14,r14,r6
284 dcbt r3,r0 ; touch in 2 lines ahead
285 lwz r4,0(r3)
286
287 add r15,r15,r7 ; cycle 2, etc
288 lwz r6,4(r3)
289 lwz r7,8(r3)
290 add r16,r16,r8
291
292 lwz r8,12(r3)
293 add r13,r13,r9
294 add r14,r14,r10
295 lwz r9,16(r3)
296
297 add r15,r15,r11
298 lwz r10,20(r3)
299 lwz r11,24(r3)
300 add r16,r16,r12
301 bdz-- LEarlyExit ; early exit if no more chunks
302
303 lwz r12,28(r3)
304 add r13,r13,r4
305 add r14,r14,r6
306 lwz r4,32(r3)
307
308 add r15,r15,r7
309 lwz r6,36(r3)
310 lwz r7,40(r3)
311 add r16,r16,r8
312
313 lwz r8,44(r3)
314 add r13,r13,r9
315 add r14,r14,r10
316 lwz r9,48(r3)
317
318 add r15,r15,r11
319 lwz r10,52(r3)
320 lwz r11,56(r3)
321 add r16,r16,r12
322
323 nop ; position last load in 2nd dispatch slot
324 lwz r12,60(r3)
325 addi r3,r3,64
326 bdnz++ LInnerLoop64
327
328 b LAddLastChunk
329
330 ; Add in the last 32-byte chunk, and any leftover bytes.
331 ; r3 = word aligned address of next byte of data
332 ; r5 = accumulated sum parameter
333 ; r13-r16 = the four accumulators
334 ; cr1_gt = "starting on odd address" flag
335 ; cr6,cr7 = residual length
336
337 LEarlyExit: ; here from middle of inner loop
338 lwz r12,28(r3) ; load last word of last chunk
339 addi r3,r3,32
340 LAddLastChunk: ; last 32-byte chunk of input is in r4,r6-r12
341 add r13,r13,r4 ; add in last chunk
342 add r14,r14,r6 ; these are 64-bit adds
343 add r15,r15,r7
344 add r16,r16,r8
345 add r13,r13,r9
346 add r14,r14,r10
347 add r15,r15,r11
348 add r16,r16,r12
349
350 ; Handle leftover bytes, if any.
; cr6/cr7 bits 27..31 of the residual length select 16/8/4/2/1 byte(s).
; Leftover pieces are spread across different accumulators (r13-r16) so the
; adds stay independent of each other.
351
352 bf 27,Lleft1 ; test 0x10 bit of residual length
353 lwz r4,0(r3)
354 lwz r6,4(r3)
355 lwz r7,8(r3)
356 lwz r8,12(r3)
357 addi r3,r3,16
358 add r13,r13,r4
359 add r14,r14,r6
360 add r15,r15,r7
361 add r16,r16,r8
362 Lleft1:
363 bf 28,Lleft2
364 lwz r4,0(r3)
365 lwz r6,4(r3)
366 addi r3,r3,8
367 add r13,r13,r4
368 add r14,r14,r6
369 Lleft2:
370 bf 29,Lleft3
371 lwz r4,0(r3)
372 addi r3,r3,4
373 add r14,r14,r4
374 Lleft3:
375 bf 30,Lleft4
376 lhz r4,0(r3)
377 addi r3,r3,2
378 add r15,r15,r4
379 Lleft4:
380 bf 31,Lleft5
381 lbz r4,0(r3)
; Big-endian: a lone trailing byte is the HIGH byte of its 16-bit lane.
382 slwi r4,r4,8 ; shift last byte into proper lane
383 add r16,r16,r4
384
385 ; All data bytes have been checksummed. Now we must add together the four
386 ; accumulators and restore the regs from the red zone.
387 ; r3 = word aligned address of next byte of data
388 ; r5 = accumulated sum parameter
389 ; r13-r16 = the four accumulators
390 ; carry = not used so far
391 ; cr1_gt = "starting on odd address" flag
392
393 Lleft5:
394 add r8,r13,r14 ; add the four accumulators together
395 add r9,r15,r16
396 lwz r13,-4(r1) ; start to restore nonvolatiles from red zone
397 lwz r14,-8(r1)
398 add r8,r8,r9 ; now r8 is 64-bit sum of the four accumulators
399 lwz r15,-12(r1)
400 lwz r16,-16(r1)
; Fold the 64-bit sum to 32 bits with a carrying 32-bit add: addc leaves the
; carry in xer(ca), which Lwrapup's addze pair then folds back in.
401 srdi r7,r8,32 ; get upper half of 64-bit sum
402 addc r2,r7,r8 ; finally, do a 32-bit add of the two halves of r8 (setting carry)
403 b Lwrapup ; merge r2, r5, and carry into a 16-bit checksum