/*
 * bsd/dev/ppc/xsumas.s — internet checksum primitives for 32- and 64-bit PowerPC.
 * (Recovered from a git-blame web capture of apple/xnu, release xnu-1504.7.4.)
 */
/*
 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#define	kShort	11			// too short to bother word-aligning (bytes)
#define	cr1_gt	5			// bit 1 of cr1

/*
 *	short xsum_assym( short *p, int len, short xsum, boolean odd);
 *
 *	r3 - Pointer to data
 *	r4 - Length of data
 *	r5 - Accumulated sum value
 *	r6 - "Starting on odd address" flag (relative to byte 0 of the checksumed data)
 *
 *	Note: If the "odd" flag is set, the address in r3 will be even.  Nonetheless, we
 *	correctly handle the case where the flag is set and the address is odd.
 *
 *	This is the internet (IP, TCP) checksum algorithm, which is the 1s-complement sum
 *	of the data, treated as an array of 16-bit integers.  1s-complement sums are done
 *	via "add with carry" operations on a 2s-complement machine like PPC.  Note that
 *	the adds can be done in parallel on 32-bit (or 64-bit) registers, as long as the
 *	final sum is folded down to 16 bits.  On 32-bit machines we use "adde", which is
 *	perfect except that it serializes the adds on the carry bit.  On 64-bit machines
 *	we avoid this serialization by adding 32-bit words into 64-bit sums, then folding
 *	all 64-bits into a 16-bit sum at the end.  We cannot use "adde" on 64-bit sums,
 *	because the kernel runs in 32-bit mode even on 64-bit machines (so the carry bit
 *	is set on the low 32-bits of the sum.)
 *
 *	Using Altivec is tempting, but the performance impact of the greatly increased
 *	number of exceptions and register save/restore traffic probably make it impractical
 *	for now.
 */
		.globl	_xsum_assym
		.globl	_xsum_nop_if_32bit
		.text
		.align	5
_xsum_assym:
		cmplwi	cr0,r4,kShort		; too short to word align?
		rlwinm	r2,r3,0,0x3		; get byte offset in word
		dcbt	0,r3			; touch in 1st cache line
		cmpwi	cr6,r2,0		; is address word aligned?
		ble	cr0,Lshort		; skip if too short to bother aligning

		subfic	r0,r2,4			; get #bytes in partial word
		cmplwi	cr1,r6,0		; set cr1_gt if "starting on odd address" flag is set
		addic	r0,r0,0			; turn off carry
		beq	cr6,Laligned		; skip if already word aligned (r2==0 if aligned)

; Partial word at start: zero filled on left, it becomes initial checksum.

		rlwinm	r3,r3,0,0,29		; word align address
		mtcrf	0x01,r2			; move byte offset to cr7
		lwz	r6,0(r3)		; get partial word
		li	r7,-1			; start of mask for partial fill
		slwi	r8,r2,3			; multiply byte offset by 8
		sub	r4,r4,r0		; adjust length for bytes in partial word
		crxor	cr1_gt,31,cr1_gt	; set flag if byte-lane swap will be necessary
		srw	r7,r7,r8		; get mask for bytes to keep in partial word
		addi	r3,r3,4			; point to next word of input
		and	r2,r6,r7		; zero fill on left

; Address is now word aligned.  Prepare for inner loop over 32-byte chunks.
;	r2 = initial checksum
;	r3 = word aligned address
;	r4 = length remaining
;	r5 = accumulated sum parameter
;	carry = off
;	cr1_gt = "starting on odd address" flag

Laligned:
		srwi.	r0,r4,5			; get count of 32-byte chunks
		mtcrf	0x02,r4			; move residual length to cr6 and cr7
		mtcrf	0x01,r4
		beq	cr0,Lleftovers		; no chunks

		mtctr	r0			; set up loop count
		li	r4,32			; offset to next chunk
_xsum_nop_if_32bit:
		b	L64BitPath		; use the 64-bit path (patched to nop on 32-bit machine)
		dcbt	r4,r3			; touch in 2nd cache line
		li	r0,96			; get touch offset
		b	LInnerLoop32		; enter 32-bit loop

; Inner loop for 32-bit machines.

		.align	4
LInnerLoop32:
		lwz	r4,0(r3)
		lwz	r6,4(r3)
		lwz	r7,8(r3)
		lwz	r8,12(r3)
		adde	r2,r2,r4
		lwz	r9,16(r3)
		adde	r2,r2,r6
		lwz	r10,20(r3)
		adde	r2,r2,r7
		lwz	r11,24(r3)
		adde	r2,r2,r8
		lwz	r12,28(r3)
		adde	r2,r2,r9
		dcbt	r3,r0
		adde	r2,r2,r10
		addi	r3,r3,32
		adde	r2,r2,r11
		adde	r2,r2,r12
		bdnz+	LInnerLoop32

; Handle leftover bytes.
;	r2 = checksum so far
;	r3 = word aligned address
;	r5 = accumulated sum parameter
;	carry = live
;	cr1_gt = "starting on odd address" flag
;	cr6,cr7 = residual length

Lleftovers:
		bf	27,Lleftover8		; test 0x10 bit of residual length
		lwz	r4,0(r3)
		lwz	r6,4(r3)
		lwz	r7,8(r3)
		lwz	r8,12(r3)
		addi	r3,r3,16
		adde	r2,r2,r4
		adde	r2,r2,r6
		adde	r2,r2,r7
		adde	r2,r2,r8
Lleftover8:
		bf	28,Lleftover4
		lwz	r4,0(r3)
		lwz	r6,4(r3)
		addi	r3,r3,8
		adde	r2,r2,r4
		adde	r2,r2,r6
Lleftover4:
		bf	29,Lleftover2
		lwz	r4,0(r3)
		addi	r3,r3,4
		adde	r2,r2,r4
Lleftover2:
		bf	30,Lleftover1
		lhz	r4,0(r3)
		addi	r3,r3,2
		adde	r2,r2,r4
Lleftover1:
		bf	31,Lwrapup
		lbz	r4,0(r3)
		slwi	r4,r4,8			; shift last byte into proper lane
		adde	r2,r2,r4

; All data bytes checksummed.  Wrap up.
;	r2 = checksum so far (word parallel)
;	r5 = accumulated sum parameter
;	carry = live
;	cr1_gt = "starting on odd address" flag

Lwrapup:
		addze	r2,r2			; add in last carry
		addze	r2,r2			; in case the "addze" carries
Lwrapupx:					; here from short-operand case, with xer(ca) undefined
		srwi	r6,r2,16		; top half of 32-bit checksum
		rlwinm	r7,r2,0,0xFFFF		; lower half
		add	r2,r6,r7		; add them together
		srwi	r6,r2,16		; then do it again, in case first carried
		rlwinm	r7,r2,0,0xFFFF
		add	r2,r6,r7
		bf	cr1_gt,Lswapped		; test "starting on odd address" flag

; The checksum began on an odd address, so swap bytes.

		rlwinm	r6,r2,24,0x00FF		; move top byte to bottom
		rlwinm	r7,r2,8,0xFF00		; bottom to top
		or	r2,r6,r7		; rejoin

; Finally, add in checksum passed in as a parameter.

Lswapped:
		add	r2,r2,r5		; add passed-in checksum
		srwi	r6,r2,16		; top half of 32-bit checksum
		rlwinm	r7,r2,0,0xFFFF		; lower half
		add	r2,r6,r7		; add them together
		srwi	r6,r2,16		; then do it again, in case first carried
		rlwinm	r7,r2,0,0xFFFF
		add	r3,r6,r7		; steer result into r3
		blr

; Handle short operands.  Do a halfword at a time.
;	r3 = address
;	r4 = length (<= kShort)
;	r5 = accumulated sum parameter
;	r6 = "starting on odd byte" flag

Lshort:
		cmpwi	cr6,r4,2		; at least two bytes?
		andi.	r0,r4,1			; odd length?
		li	r2,0			; initialize checksum
		cmplwi	cr1,r6,0		; set cr1_gt if "starting on odd address" flag is set
		blt	cr6,Lshort2		; fewer than two bytes, so skip
Lshort1:
		cmpwi	cr6,r4,4		; two more bytes (after we decrement)?
		lhz	r7,0(r3)
		subi	r4,r4,2
		addi	r3,r3,2
		add	r2,r2,r7		; note no need for "adde"
		bge	cr6,Lshort1		; loop for 2 more bytes
Lshort2:
		beq	Lwrapupx		; no byte at end, proceed to checkout with carry undefined
		lbz	r7,0(r3)
		slwi	r7,r7,8			; shift last byte into proper lane
		add	r2,r2,r7
		b	Lwrapupx

; Handle 64-bit machine.  The major improvement over the 32-bit path is that we use
; four parallel 32-bit accumulators, which carry into the upper half naturally so we
; do not have to use "adde", which serializes on the carry bit.  Note that we cannot
; do 64-bit "adde"s, because we run in 32-bit mode so carry would not be set correctly.
;	r2 = checksum so far (ie, the zero-filled partial first word)
;	r3 = word aligned address
;	r5 = accumulated sum parameter
;	ctr = number of 32-byte chunks of input
;	carry = unused in this code
;	cr1_gt = "starting on odd address" flag
;	cr6,cr7 = residual length

L64BitPath:
		stw	r13,-4(r1)		; save a few nonvolatile regs in red zone so we can use them
		stw	r14,-8(r1)
		stw	r15,-12(r1)
		stw	r16,-16(r1)
		li	r0,128			; to touch next line
		li	r13,0			; r13-r16 are the accumulators, so initialize them
		dcbt	r3,r0			; touch in next cache line, and keep loads away from the above stores
		lwz	r4,0(r3)		; start pipeline by loading first 32 bytes into r4, r6-r12
		lwz	r6,4(r3)
		lwz	r7,8(r3)
		mr	r14,r2			; just copy incoming partial word into one of the accumulators
		li	r15,0
		lwz	r8,12(r3)
		lwz	r9,16(r3)
		li	r16,0
		li	r0,256			; get touch offset
		lwz	r10,20(r3)
		lwz	r11,24(r3)
		lwz	r12,28(r3)		; load last word of previous chunk
		addi	r3,r3,32		; skip past the chunk
		bdnz++	LInnerLoop64		; enter loop if another chunk to go

		b	LAddLastChunk		; only one chunk

; Inner loop for 64-bit processors.  This loop is scheduled for the 970.
; It is pipelined (loads are one iteration ahead of adds), and unrolled.
; It should take 9-10 cycles per iteration, which consumes 64 bytes of input.

		.align	5
LInnerLoop64:					; 64 bytes/iteration
		add	r13,r13,r4		; cycle 1
		add	r14,r14,r6
		dcbt	r3,r0			; touch in 2 lines ahead
		lwz	r4,0(r3)

		add	r15,r15,r7		; cycle 2, etc
		lwz	r6,4(r3)
		lwz	r7,8(r3)
		add	r16,r16,r8

		lwz	r8,12(r3)
		add	r13,r13,r9
		add	r14,r14,r10
		lwz	r9,16(r3)

		add	r15,r15,r11
		lwz	r10,20(r3)
		lwz	r11,24(r3)
		add	r16,r16,r12
		bdz--	LEarlyExit		; early exit if no more chunks

		lwz	r12,28(r3)
		add	r13,r13,r4
		add	r14,r14,r6
		lwz	r4,32(r3)

		add	r15,r15,r7
		lwz	r6,36(r3)
		lwz	r7,40(r3)
		add	r16,r16,r8

		lwz	r8,44(r3)
		add	r13,r13,r9
		add	r14,r14,r10
		lwz	r9,48(r3)

		add	r15,r15,r11
		lwz	r10,52(r3)
		lwz	r11,56(r3)
		add	r16,r16,r12

		nop				; position last load in 2nd dispatch slot
		lwz	r12,60(r3)
		addi	r3,r3,64
		bdnz++	LInnerLoop64

		b	LAddLastChunk

; Add in the last 32-byte chunk, and any leftover bytes.
;	r3 = word aligned address of next byte of data
;	r5 = accumulated sum parameter
;	r13-r16 = the four accumulators
;	cr1_gt = "starting on odd address" flag
;	cr6,cr7 = residual length

LEarlyExit:					; here from middle of inner loop
		lwz	r12,28(r3)		; load last word of last chunk
		addi	r3,r3,32
LAddLastChunk:					; last 32-byte chunk of input is in r4,r6-r12
		add	r13,r13,r4		; add in last chunk
		add	r14,r14,r6		; these are 64-bit adds
		add	r15,r15,r7
		add	r16,r16,r8
		add	r13,r13,r9
		add	r14,r14,r10
		add	r15,r15,r11
		add	r16,r16,r12

; Handle leftover bytes, if any.

		bf	27,Lleft1		; test 0x10 bit of residual length
		lwz	r4,0(r3)
		lwz	r6,4(r3)
		lwz	r7,8(r3)
		lwz	r8,12(r3)
		addi	r3,r3,16
		add	r13,r13,r4
		add	r14,r14,r6
		add	r15,r15,r7
		add	r16,r16,r8
Lleft1:
		bf	28,Lleft2
		lwz	r4,0(r3)
		lwz	r6,4(r3)
		addi	r3,r3,8
		add	r13,r13,r4
		add	r14,r14,r6
Lleft2:
		bf	29,Lleft3
		lwz	r4,0(r3)
		addi	r3,r3,4
		add	r14,r14,r4
Lleft3:
		bf	30,Lleft4
		lhz	r4,0(r3)
		addi	r3,r3,2
		add	r15,r15,r4
Lleft4:
		bf	31,Lleft5
		lbz	r4,0(r3)
		slwi	r4,r4,8			; shift last byte into proper lane
		add	r16,r16,r4

; All data bytes have been checksummed.  Now we must add together the four
; accumulators and restore the regs from the red zone.
;	r3 = word aligned address of next byte of data
;	r5 = accumulated sum parameter
;	r13-r16 = the four accumulators
;	carry = not used so far
;	cr1_gt = "starting on odd address" flag

Lleft5:
		add	r8,r13,r14		; add the four accumulators together
		add	r9,r15,r16
		lwz	r13,-4(r1)		; start to restore nonvolatiles from red zone
		lwz	r14,-8(r1)
		add	r8,r8,r9		; now r8 is 64-bit sum of the four accumulators
		lwz	r15,-12(r1)
		lwz	r16,-16(r1)
		srdi	r7,r8,32		; get upper half of 64-bit sum
		addc	r2,r7,r8		; finally, do a 32-bit add of the two halves of r8 (setting carry)
		b	Lwrapup			; merge r2, r5, and carry into a 16-bit checksum