/*
 * Copyright (c) 2011-2013 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/mcache.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <net/if_types.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/in_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_lro.h>
#include <netinet/lro_ext.h>
#include <kern/locks.h>

unsigned int lrocount = 0;              /* A counter used for debugging only */
unsigned int lro_seq_outoforder = 0;    /* Counter for debugging */
unsigned int lro_seq_mismatch = 0;      /* Counter for debugging */
unsigned int lro_flushes = 0;           /* Counter for tracking number of flushes */
unsigned int lro_single_flushes = 0;
unsigned int lro_double_flushes = 0;
unsigned int lro_good_flushes = 0;

unsigned int coalesc_sz = LRO_MX_COALESCE_PKTS;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro_sz, CTLFLAG_RW | CTLFLAG_LOCKED,
    &coalesc_sz, 0, "Max coalescing size");
unsigned int coalesc_time = LRO_MX_TIME_TO_BUFFER;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro_time, CTLFLAG_RW | CTLFLAG_LOCKED,
    &coalesc_time, 0, "Max coalescing time");

struct lro_flow lro_flow_list[TCP_LRO_NUM_FLOWS];

char lro_flow_map[TCP_LRO_FLOW_MAP];
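
/*
 * lro_flow_map hashes a connection's 4-tuple to a slot in lro_flow_list.
 * Unused buckets hold TCP_LRO_FLOW_UNINIT; tcp_lro_init_flow() records the
 * slot that owns a bucket and tcp_lro_eject_flow() clears it again.
 */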

static lck_attr_t *tcp_lro_mtx_attr = NULL;             /* mutex attributes */
static lck_grp_t *tcp_lro_mtx_grp = NULL;               /* mutex group */
static lck_grp_attr_t *tcp_lro_mtx_grp_attr = NULL;     /* mutex group attrs */
decl_lck_mtx_data(, tcp_lro_lock);      /* Used to synchronize updates */

unsigned int lro_byte_count = 0;

uint64_t lro_deadline = 0;      /* LRO's sense of time - protected by tcp_lro_lock */
uint32_t lro_timer_set = 0;

u_int32_t lro_pkt_count = 0;    /* Number of packets encountered in an LRO period */
thread_call_t tcp_lro_timer;

extern u_int32_t kipf_count;
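
/*
 * tcp_lro_timer is a deferred thread call: tcp_lro_sched_timer() arms it
 * with lro_deadline, and tcp_lro_timer_proc() flushes all pending flows
 * via tcp_lro_flush_flows() when it fires.
 */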

static void tcp_lro_timer_proc(void*, void*);
static void lro_update_stats(struct mbuf *);
static void lro_update_flush_stats(struct mbuf *);
static void tcp_lro_flush_flows(void);
static void tcp_lro_sched_timer(uint64_t);
static void lro_proto_input(struct mbuf *);

static struct mbuf *lro_tcp_xsum_validate(struct mbuf *, struct ip *,
    struct tcphdr *);
static struct mbuf *tcp_lro_process_pkt(struct mbuf *, int);

void
tcp_lro_init(void)
{
	int i;

	bzero(lro_flow_list, sizeof(struct lro_flow) * TCP_LRO_NUM_FLOWS);
	for (i = 0; i < TCP_LRO_FLOW_MAP; i++) {
		lro_flow_map[i] = TCP_LRO_FLOW_UNINIT;
	}

	/*
	 * allocate lock group attribute, group and attribute for tcp_lro_lock
	 */
	tcp_lro_mtx_grp_attr = lck_grp_attr_alloc_init();
	tcp_lro_mtx_grp = lck_grp_alloc_init("tcplro", tcp_lro_mtx_grp_attr);
	tcp_lro_mtx_attr = lck_attr_alloc_init();
	lck_mtx_init(&tcp_lro_lock, tcp_lro_mtx_grp, tcp_lro_mtx_attr);

	tcp_lro_timer = thread_call_allocate(tcp_lro_timer_proc, NULL);
	if (tcp_lro_timer == NULL) {
		panic_plain("%s: unable to allocate lro timer", __func__);
	}
}

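/*
 * Match an incoming segment against the flow table.  Returns
 * TCP_LRO_COALESCE when the segment extends the flow in sequence,
 * TCP_LRO_EJECT_FLOW when the flow has to be flushed first (a pending
 * eject request, an advancing ACK, or out-of-sequence data), and
 * TCP_LRO_COLLISION when the hash bucket is owned by a different 4-tuple.
 */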
static int
tcp_lro_matching_tuple(struct ip *ip_hdr, struct tcphdr *tcp_hdr, int *hash,
    int *flow_id)
{
	struct lro_flow *flow;
	tcp_seq seqnum;
	unsigned int off = 0;
	int payload_len = 0;

	*hash = LRO_HASH(ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr,
	    tcp_hdr->th_sport, tcp_hdr->th_dport, (TCP_LRO_FLOW_MAP - 1));

	*flow_id = lro_flow_map[*hash];
	if (*flow_id == TCP_LRO_FLOW_NOTFOUND) {
		return TCP_LRO_NAN;
	}

	seqnum = tcp_hdr->th_seq;
	off = tcp_hdr->th_off << 2;
	payload_len = ip_hdr->ip_len - off;

	flow = &lro_flow_list[*flow_id];

	if ((flow->lr_faddr.s_addr == ip_hdr->ip_src.s_addr) &&
	    (flow->lr_laddr.s_addr == ip_hdr->ip_dst.s_addr) &&
	    (flow->lr_fport == tcp_hdr->th_sport) &&
	    (flow->lr_lport == tcp_hdr->th_dport)) {
		if (flow->lr_tcphdr == NULL) {
			if (ntohl(seqnum) == flow->lr_seq) {
				return TCP_LRO_COALESCE;
			}
			printf("%s: seqnum = %x, lr_seq = %x\n",
			    __func__, ntohl(seqnum), flow->lr_seq);
			lro_seq_mismatch++;
			if (SEQ_GT(ntohl(seqnum), flow->lr_seq)) {
				lro_seq_outoforder++;
				/*
				 * Whenever we receive out of order packets it
				 * signals loss and recovery and LRO doesn't
				 * let flows recover quickly. So eject.
				 */
				flow->lr_flags |= LRO_EJECT_REQ;
			}
			return TCP_LRO_NAN;
		}

		if (flow->lr_flags & LRO_EJECT_REQ) {
			printf("%s: eject. \n", __func__);
			return TCP_LRO_EJECT_FLOW;
		}
		if (SEQ_GT(tcp_hdr->th_ack, flow->lr_tcphdr->th_ack)) {
			printf("%s: th_ack = %x flow_ack = %x \n",
			    __func__, tcp_hdr->th_ack,
			    flow->lr_tcphdr->th_ack);
			return TCP_LRO_EJECT_FLOW;
		}

		if (ntohl(seqnum) == (ntohl(lro_flow_list[*flow_id].lr_tcphdr->th_seq) +
		    lro_flow_list[*flow_id].lr_len)) {
			return TCP_LRO_COALESCE;
		} else {
			/* LRO does not handle loss recovery well, eject */
			flow->lr_flags |= LRO_EJECT_REQ;
			return TCP_LRO_EJECT_FLOW;
		}
	}

	printf("tcp_lro_matching_tuple: collision \n");
	return TCP_LRO_COLLISION;
}

static void
tcp_lro_init_flow(int flow_id, struct ip *ip_hdr, struct tcphdr *tcp_hdr,
    int hash, u_int32_t timestamp, int payload_len)
{
	struct lro_flow *flow = NULL;

	flow = &lro_flow_list[flow_id];

	flow->lr_hash_map = hash;
	flow->lr_faddr.s_addr = ip_hdr->ip_src.s_addr;
	flow->lr_laddr.s_addr = ip_hdr->ip_dst.s_addr;
	flow->lr_fport = tcp_hdr->th_sport;
	flow->lr_lport = tcp_hdr->th_dport;
	lro_flow_map[hash] = flow_id;
	flow->lr_timestamp = timestamp;
	flow->lr_seq = ntohl(tcp_hdr->th_seq) + payload_len;
}

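/*
 * Append a segment to the flow's coalesced packet.  Only the first segment
 * keeps its IP/TCP header; later segments are trimmed with m_adj() and
 * chained behind lr_mtail, while ip_len and the pkthdr length of the lead
 * mbuf are stretched to cover the added payload.
 */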
static void
tcp_lro_coalesce(int flow_id, struct mbuf *lro_mb, struct tcphdr *tcphdr,
    int payload_len, int drop_hdrlen, struct tcpopt *topt,
    u_int32_t *tsval, u_int32_t *tsecr, int thflags)
{
	struct lro_flow *flow = NULL;
	struct mbuf *last;
	struct ip *ip = NULL;

	flow = &lro_flow_list[flow_id];
	if (flow->lr_mhead) {
		printf("%s: lr_mhead %x %d \n", __func__, flow->lr_seq,
		    payload_len);
		m_adj(lro_mb, drop_hdrlen);

		last = flow->lr_mtail;
		while (last->m_next != NULL) {
			last = last->m_next;
		}
		last->m_next = lro_mb;

		flow->lr_mtail = lro_mb;

		ip = mtod(flow->lr_mhead, struct ip *);
		ip->ip_len += lro_mb->m_pkthdr.len;
		flow->lr_mhead->m_pkthdr.len += lro_mb->m_pkthdr.len;

		if (flow->lr_len == 0) {
			panic_plain("%s: Inconsistent LRO flow state", __func__);
		}
		flow->lr_len += payload_len;
		flow->lr_seq += payload_len;
		/*
		 * This bit is re-OR'd each time a packet is added to the
		 * large coalesced packet.
		 */
		flow->lr_mhead->m_pkthdr.pkt_flags |= PKTF_SW_LRO_PKT;
		flow->lr_mhead->m_pkthdr.lro_npkts++; /* for tcpstat.tcps_rcvpack */
		if (flow->lr_mhead->m_pkthdr.lro_pktlen <
		    lro_mb->m_pkthdr.lro_pktlen) {
			/*
			 * For TCP Inter Arrival Jitter calculation, return max
			 * size encountered while coalescing a stream of pkts.
			 */
			flow->lr_mhead->m_pkthdr.lro_pktlen =
			    lro_mb->m_pkthdr.lro_pktlen;
		}
		/* Update the timestamp value */
		if (topt->to_flags & TOF_TS) {
			if ((flow->lr_tsval) &&
			    (TSTMP_GT(topt->to_tsval, ntohl(*(flow->lr_tsval))))) {
				*(flow->lr_tsval) = htonl(topt->to_tsval);
			}
			if ((flow->lr_tsecr) &&
			    (topt->to_tsecr != 0) &&
			    (TSTMP_GT(topt->to_tsecr, ntohl(*(flow->lr_tsecr))))) {
				printf("%s: instantaneous RTT = %d \n", __func__,
				    topt->to_tsecr - ntohl(*(flow->lr_tsecr)));
				*(flow->lr_tsecr) = htonl(topt->to_tsecr);
			}
		}
		/* Coalesce the flags */
		if (thflags) {
			flow->lr_tcphdr->th_flags |= thflags;
		}
		/* Update receive window */
		flow->lr_tcphdr->th_win = tcphdr->th_win;
	} else {
		if (lro_mb) {
			flow->lr_mhead = flow->lr_mtail = lro_mb;
			flow->lr_mhead->m_pkthdr.pkt_flags |= PKTF_SW_LRO_PKT;
			flow->lr_tcphdr = tcphdr;
			if ((topt) && (topt->to_flags & TOF_TS)) {
				ASSERT(tsval != NULL);
				ASSERT(tsecr != NULL);
				flow->lr_tsval = tsval;
				flow->lr_tsecr = tsecr;
			}
			flow->lr_len = payload_len;
			calculate_tcp_clock();
			flow->lr_timestamp = tcp_now;
			tcp_lro_sched_timer(0);
		}
		flow->lr_seq = ntohl(tcphdr->th_seq) + payload_len;
	}
	if (lro_mb) {
		tcpstat.tcps_coalesced_pack++;
	}
}

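/*
 * Remove a flow from the table: unhook its hash bucket, zero the entry
 * and hand back the coalesced mbuf chain (if any) for delivery.
 */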
static struct mbuf *
tcp_lro_eject_flow(int flow_id)
{
	struct mbuf *mb = NULL;

	mb = lro_flow_list[flow_id].lr_mhead;
	ASSERT(lro_flow_map[lro_flow_list[flow_id].lr_hash_map] == flow_id);
	lro_flow_map[lro_flow_list[flow_id].lr_hash_map] = TCP_LRO_FLOW_UNINIT;
	bzero(&lro_flow_list[flow_id], sizeof(struct lro_flow));

	return mb;
}

static struct mbuf *
tcp_lro_eject_coalesced_pkt(int flow_id)
{
	struct mbuf *mb = NULL;

	mb = lro_flow_list[flow_id].lr_mhead;
	lro_flow_list[flow_id].lr_mhead =
	    lro_flow_list[flow_id].lr_mtail = NULL;
	lro_flow_list[flow_id].lr_tcphdr = NULL;
	return mb;
}

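/*
 * Claim a flow-table slot for a new connection.  A hash collision or a
 * full table kicks out an existing flow first; the victim's coalesced
 * chain, if any, is returned to the caller.
 */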
static struct mbuf *
tcp_lro_insert_flow(struct mbuf *lro_mb, struct ip *ip_hdr,
    struct tcphdr *tcp_hdr, int payload_len,
    int drop_hdrlen, int hash, struct tcpopt *topt,
    u_int32_t *tsval, u_int32_t *tsecr)
{
	int i;
	int slot_available = 0;
	int candidate_flow = 0;
	int collision = 0;
	u_int32_t oldest_timestamp;
	struct mbuf *mb = NULL;

	oldest_timestamp = tcp_now;

	/* handle collision */
	if (lro_flow_map[hash] != TCP_LRO_FLOW_UNINIT) {
		collision = 1;
		candidate_flow = lro_flow_map[hash];
		tcpstat.tcps_flowtbl_collision++;
		goto kick_flow;
	}

	for (i = 0; i < TCP_LRO_NUM_FLOWS; i++) {
		if (lro_flow_list[i].lr_mhead == NULL) {
			candidate_flow = i;
			slot_available = 1;
			break;
		}
		if (oldest_timestamp >= lro_flow_list[i].lr_timestamp) {
			candidate_flow = i;
			oldest_timestamp = lro_flow_list[i].lr_timestamp;
		}
	}

	if (!slot_available) {
		tcpstat.tcps_flowtbl_full++;
kick_flow:
		/* kick the oldest flow */
		mb = tcp_lro_eject_flow(candidate_flow);

		if (!slot_available) {
			printf("%s: slot unavailable.\n", __func__);
		}
		if (collision) {
			printf("%s: collision.\n", __func__);
		}
	} else {
		candidate_flow = i; /* this is now the flow to be used */
	}

	tcp_lro_init_flow(candidate_flow, ip_hdr, tcp_hdr, hash,
	    tcp_now, payload_len);
	tcp_lro_coalesce(candidate_flow, lro_mb, tcp_hdr, payload_len,
	    drop_hdrlen, topt, tsval, tsecr, 0);
	return mb;
}

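/*
 * Per-packet LRO path: pull up and checksum the segment, do the
 * timestamp-only option prediction, collect the conditions that force an
 * eject (flags, unknown options, tiny payload, ECN CE), then either
 * coalesce into the matching flow or hand the packet to lro_proto_input().
 */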
static struct mbuf *
tcp_lro_process_pkt(struct mbuf *lro_mb, int drop_hdrlen)
{
	int flow_id = TCP_LRO_FLOW_UNINIT;
	int hash;
	unsigned int off = 0;
	int eject_flow = 0;
	int optlen;
	int retval = 0;
	struct mbuf *mb = NULL;
	int payload_len = 0;
	u_char *optp = NULL;
	int thflags = 0;
	struct tcpopt to;
	int ret_response = TCP_LRO_CONSUMED;
	int coalesced = 0, tcpflags = 0, unknown_tcpopts = 0;
	u_int8_t ecn;
	struct ip *ip_hdr;
	struct tcphdr *tcp_hdr;

	if (lro_mb->m_len < drop_hdrlen) {
		if ((lro_mb = m_pullup(lro_mb, drop_hdrlen)) == NULL) {
			tcpstat.tcps_rcvshort++;
			printf("tcp_lro_process_pkt: mbuf too short.\n");
			return NULL;
		}
	}

	ip_hdr = mtod(lro_mb, struct ip *);
	tcp_hdr = (struct tcphdr *)((caddr_t)ip_hdr + sizeof(struct ip));

	/* Just in case */
	lro_mb->m_pkthdr.pkt_flags &= ~PKTF_SW_LRO_DID_CSUM;

	if ((lro_mb = lro_tcp_xsum_validate(lro_mb, ip_hdr, tcp_hdr)) == NULL) {
		printf("tcp_lro_process_pkt: TCP xsum failed.\n");
		return NULL;
	}

	/* Update stats */
	lro_pkt_count++;

	/* Avoids checksumming in tcp_input */
	lro_mb->m_pkthdr.pkt_flags |= PKTF_SW_LRO_DID_CSUM;

	off = tcp_hdr->th_off << 2;
	optlen = off - sizeof(struct tcphdr);
	payload_len = ip_hdr->ip_len - off;
	optp = (u_char *)(tcp_hdr + 1);
	/*
	 * Do quick retrieval of timestamp options ("options
	 * prediction?"). If timestamp is the only option and it's
	 * formatted as recommended in RFC 1323 appendix A, we
	 * quickly get the values now and not bother calling
	 * tcp_dooptions(), etc.
	 */
	bzero(&to, sizeof(to));
	if ((optlen == TCPOLEN_TSTAMP_APPA ||
	    (optlen > TCPOLEN_TSTAMP_APPA &&
	    optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
	    *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
	    (tcp_hdr->th_flags & TH_SYN) == 0) {
		to.to_flags |= TOF_TS;
		to.to_tsval = ntohl(*(u_int32_t *)(void *)(optp + 4));
		to.to_tsecr = ntohl(*(u_int32_t *)(void *)(optp + 8));
	} else {
		/*
		 * If TCP timestamps are not in use, or not the first option,
		 * skip LRO path since timestamps are used to avoid LRO
		 * from introducing additional latencies for retransmissions
		 * and other slow-paced transmissions.
		 */
		to.to_flags = to.to_tsecr = 0;
		eject_flow = 1;
	}

	/* list all the conditions that can trigger a flow ejection here */

	thflags = tcp_hdr->th_flags;
	if (thflags & (TH_SYN | TH_URG | TH_ECE | TH_CWR | TH_PUSH | TH_RST | TH_FIN)) {
		eject_flow = tcpflags = 1;
	}

	if (optlen && !((optlen == TCPOLEN_TSTAMP_APPA) &&
	    (to.to_flags & TOF_TS))) {
		eject_flow = unknown_tcpopts = 1;
	}

	if (payload_len <= LRO_MIN_COALESC_SZ) { /* zero payload ACK */
		eject_flow = 1;
	}

	/* Can't coalesce ECN marked packets. */
	ecn = ip_hdr->ip_tos & IPTOS_ECN_MASK;
	if (ecn == IPTOS_ECN_CE) {
		/*
		 * ECN needs quick notification
		 */
		printf("%s: ECE bits set.\n", __func__);
		eject_flow = 1;
	}

	lck_mtx_lock_spin(&tcp_lro_lock);

	retval = tcp_lro_matching_tuple(ip_hdr, tcp_hdr, &hash, &flow_id);

	switch (retval) {
	case TCP_LRO_NAN:
		lck_mtx_unlock(&tcp_lro_lock);
		ret_response = TCP_LRO_FLOW_NOTFOUND;
		break;

	case TCP_LRO_COALESCE:
		if ((payload_len != 0) && (unknown_tcpopts == 0) &&
		    (tcpflags == 0) && (ecn != IPTOS_ECN_CE) && (to.to_flags & TOF_TS)) {
			tcp_lro_coalesce(flow_id, lro_mb, tcp_hdr, payload_len,
			    drop_hdrlen, &to,
			    (to.to_flags & TOF_TS) ? (u_int32_t *)(void *)(optp + 4) : NULL,
			    (to.to_flags & TOF_TS) ? (u_int32_t *)(void *)(optp + 8) : NULL,
			    thflags);
			printf("tcp_lro_process_pkt: coalesce len = %d. flow_id = %d payload_len = %d drop_hdrlen = %d optlen = %d lport = %d seqnum = %x.\n",
			    lro_flow_list[flow_id].lr_len, flow_id,
			    payload_len, drop_hdrlen, optlen,
			    ntohs(lro_flow_list[flow_id].lr_lport),
			    ntohl(tcp_hdr->th_seq));
			if (lro_flow_list[flow_id].lr_mhead->m_pkthdr.lro_npkts >= coalesc_sz) {
				eject_flow = 1;
			}
			coalesced = 1;
		}
		if (eject_flow) {
			mb = tcp_lro_eject_coalesced_pkt(flow_id);
			lro_flow_list[flow_id].lr_seq = ntohl(tcp_hdr->th_seq) +
			    payload_len;
			calculate_tcp_clock();
			u_int8_t timestamp = tcp_now - lro_flow_list[flow_id].lr_timestamp;
			lck_mtx_unlock(&tcp_lro_lock);
			if (mb) {
				mb->m_pkthdr.lro_elapsed = timestamp;
				lro_proto_input(mb);
			}
			if (!coalesced) {
				printf("%s: pkt payload_len = %d \n", __func__, payload_len);
				lro_proto_input(lro_mb);
			}
		} else {
			lck_mtx_unlock(&tcp_lro_lock);
		}
		break;

	case TCP_LRO_EJECT_FLOW:
		mb = tcp_lro_eject_coalesced_pkt(flow_id);
		calculate_tcp_clock();
		u_int8_t timestamp = tcp_now - lro_flow_list[flow_id].lr_timestamp;
		lck_mtx_unlock(&tcp_lro_lock);
		if (mb) {
			printf("tcp_lro_process_pkt eject_flow, len = %d\n",
			    mb->m_pkthdr.len);
			mb->m_pkthdr.lro_elapsed = timestamp;
			lro_proto_input(mb);
		}

		lro_proto_input(lro_mb);
		break;

	case TCP_LRO_COLLISION:
		lck_mtx_unlock(&tcp_lro_lock);
		ret_response = TCP_LRO_FLOW_NOTFOUND;
		break;

	default:
		lck_mtx_unlock(&tcp_lro_lock);
		panic_plain("%s: unrecognized type %d", __func__, retval);
		break;
	}

	if (ret_response == TCP_LRO_FLOW_NOTFOUND) {
		lro_proto_input(lro_mb);
	}
	return NULL;
}

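/*
 * Timer path: tcp_lro_timer_proc() runs from the deferred thread call and
 * drains every flow that still holds a coalesced chain.
 */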
static void
tcp_lro_timer_proc(void *arg1, void *arg2)
{
#pragma unused(arg1, arg2)

	lck_mtx_lock_spin(&tcp_lro_lock);
	lro_timer_set = 0;
	lck_mtx_unlock(&tcp_lro_lock);
	tcp_lro_flush_flows();
}

static void
tcp_lro_flush_flows(void)
{
	int i = 0;
	struct mbuf *mb;
	struct lro_flow *flow;
	int tcpclock_updated = 0;

	lck_mtx_lock(&tcp_lro_lock);

	while (i < TCP_LRO_NUM_FLOWS) {
		flow = &lro_flow_list[i];
		if (flow->lr_mhead != NULL) {
			if (!tcpclock_updated) {
				calculate_tcp_clock();
				tcpclock_updated = 1;
			}

			printf("tcp_lro_flush_flows: len =%d n_pkts = %d %d %d \n",
			    flow->lr_len,
			    flow->lr_mhead->m_pkthdr.lro_npkts,
			    flow->lr_timestamp, tcp_now);

			u_int8_t timestamp = tcp_now - flow->lr_timestamp;

			mb = tcp_lro_eject_flow(i);

			if (mb) {
				mb->m_pkthdr.lro_elapsed = timestamp;
				lck_mtx_unlock(&tcp_lro_lock);
				lro_update_flush_stats(mb);
				lro_proto_input(mb);
				lck_mtx_lock(&tcp_lro_lock);
			}
		}
		i++;
	}
	lck_mtx_unlock(&tcp_lro_lock);
}

/*
 * Must be called with tcp_lro_lock held.
 * A non-zero hint asks for a longer wait.  The wait dictated by
 * coalesc_time takes precedence, so lro_timer_set is not set for the
 * hint case.
 */
static void
tcp_lro_sched_timer(uint64_t hint)
{
	if (lro_timer_set) {
		return;
	}

	if (!hint) {
		lro_timer_set = 1;
		/* the intent is to wake up every coalesc_time msecs */
		clock_interval_to_deadline(coalesc_time,
		    (NSEC_PER_SEC / TCP_RETRANSHZ), &lro_deadline);
	} else {
		clock_interval_to_deadline(hint, NSEC_PER_SEC / TCP_RETRANSHZ,
		    &lro_deadline);
	}
	thread_call_enter_delayed(tcp_lro_timer, lro_deadline);
}

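/*
 * Entry point from the IP input path.  Returns the mbuf unchanged when LRO
 * does not apply (IP filters attached, cellular or loopback input, IP
 * options, non-TCP payload, bad TCP offset) and NULL when the packet has
 * been consumed by the LRO path.
 */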
struct mbuf *
tcp_lro(struct mbuf *m, unsigned int hlen)
{
	struct ip *ip_hdr;
	unsigned int tlen;
	struct tcphdr *tcp_hdr = NULL;
	unsigned int off = 0;

	if (kipf_count != 0) {
		return m;
	}

	/*
	 * Experiments on cellular show that the RTT is much higher
	 * than the coalescing time of 5 msecs, causing lro to flush
	 * 80% of the time on a single packet. Increasing
	 * coalescing time for cellular does not show marked
	 * improvement to throughput either. Loopback perf is hurt
	 * by the 5 msec latency and it already sends large packets.
	 */
	if (IFNET_IS_CELLULAR(m->m_pkthdr.rcvif) ||
	    (m->m_pkthdr.rcvif->if_type == IFT_LOOP)) {
		return m;
	}

	ip_hdr = mtod(m, struct ip *);

	/* don't deal with IP options */
	if (hlen != sizeof(struct ip)) {
		return m;
	}

	/* only TCP is coalesced */
	if (ip_hdr->ip_p != IPPROTO_TCP) {
		return m;
	}

	if (m->m_len < (int32_t) sizeof(struct tcpiphdr)) {
		printf("tcp_lro m_pullup \n");
		if ((m = m_pullup(m, sizeof(struct tcpiphdr))) == NULL) {
			tcpstat.tcps_rcvshort++;
			printf("ip_lro: rcvshort.\n");
			return NULL;
		}
		ip_hdr = mtod(m, struct ip *);
	}

	tcp_hdr = (struct tcphdr *)((caddr_t)ip_hdr + hlen);
	tlen = ip_hdr->ip_len; /* ignore IP header bytes len */
	m->m_pkthdr.lro_pktlen = tlen; /* Used to return max pkt encountered to tcp */
	m->m_pkthdr.lro_npkts = 1; /* Initialize a counter to hold num pkts coalesced */
	m->m_pkthdr.lro_elapsed = 0; /* Initialize the field to carry elapsed time */
	off = tcp_hdr->th_off << 2;
	if (off < sizeof(struct tcphdr) || off > tlen) {
		tcpstat.tcps_rcvbadoff++;
		printf("ip_lro: TCP offset greater than TCP header.\n");
		return m;
	}

	return tcp_lro_process_pkt(m, hlen + off);
}

static void
lro_proto_input(struct mbuf *m)
{
	struct ip *ip_hdr = mtod(m, struct ip *);

	printf("lro_proto_input: ip_len = %d \n",
	    ip_hdr->ip_len);
	lro_update_stats(m);
	ip_proto_dispatch_in_wrapper(m, ip_hdr->ip_hl << 2, ip_hdr->ip_p);
}

static struct mbuf *
lro_tcp_xsum_validate(struct mbuf *m, struct ip *ip, struct tcphdr *th)
{
	/* Expect 32-bit aligned data pointer on strict-align platforms */
	MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);

	/* we shouldn't get here for IP with options; hence sizeof (ip) */
	if (tcp_input_checksum(AF_INET, m, th, sizeof(*ip), ip->ip_len)) {
		printf("%s: bad xsum and drop m = 0x%llx.\n", __func__,
		    (uint64_t)VM_KERNEL_ADDRPERM(m));
		m_freem(m);
		return NULL;
	}

	return m;
}

/*
 * When TCP detects a stable, steady flow without out of ordering,
 * with a sufficiently high cwnd, it invokes LRO.
 */
int
tcp_start_coalescing(struct ip *ip_hdr, struct tcphdr *tcp_hdr, int tlen)
{
	int hash;
	int flow_id;
	struct mbuf *eject_mb;
	struct lro_flow *lf;

	hash = LRO_HASH(ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr,
	    tcp_hdr->th_sport, tcp_hdr->th_dport,
	    (TCP_LRO_FLOW_MAP - 1));

	lck_mtx_lock_spin(&tcp_lro_lock);
	flow_id = lro_flow_map[hash];
	if (flow_id != TCP_LRO_FLOW_NOTFOUND) {
		lf = &lro_flow_list[flow_id];
		if ((lf->lr_faddr.s_addr == ip_hdr->ip_src.s_addr) &&
		    (lf->lr_laddr.s_addr == ip_hdr->ip_dst.s_addr) &&
		    (lf->lr_fport == tcp_hdr->th_sport) &&
		    (lf->lr_lport == tcp_hdr->th_dport)) {
			if ((lf->lr_tcphdr == NULL) &&
			    (lf->lr_seq != (tcp_hdr->th_seq + tlen))) {
				lf->lr_seq = tcp_hdr->th_seq + tlen;
			}
			lf->lr_flags &= ~LRO_EJECT_REQ;
		}
		lck_mtx_unlock(&tcp_lro_lock);
		return 0;
	}

	HTONL(tcp_hdr->th_seq);
	HTONL(tcp_hdr->th_ack);
	eject_mb =
	    tcp_lro_insert_flow(NULL, ip_hdr, tcp_hdr, tlen, 0, hash,
	    NULL, NULL, NULL);

	lck_mtx_unlock(&tcp_lro_lock);

	NTOHL(tcp_hdr->th_seq);
	NTOHL(tcp_hdr->th_ack);
	printf("%s: src = %x dst = %x sport = %d dport = %d seq %x \n",
	    __func__, ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr,
	    tcp_hdr->th_sport, tcp_hdr->th_dport, tcp_hdr->th_seq);
	ASSERT(eject_mb == NULL);
	return 0;
}

/*
 * When TCP detects loss or idle condition, it stops offloading
 * to LRO.
 */
int
tcp_lro_remove_state(struct in_addr saddr, struct in_addr daddr,
    unsigned short sport, unsigned short dport)
{
	int hash, flow_id;
	struct lro_flow *lf;

	hash = LRO_HASH(daddr.s_addr, saddr.s_addr, dport, sport,
	    (TCP_LRO_FLOW_MAP - 1));
	lck_mtx_lock_spin(&tcp_lro_lock);
	flow_id = lro_flow_map[hash];
	if (flow_id == TCP_LRO_FLOW_UNINIT) {
		lck_mtx_unlock(&tcp_lro_lock);
		return 0;
	}
	lf = &lro_flow_list[flow_id];
	if ((lf->lr_faddr.s_addr == daddr.s_addr) &&
	    (lf->lr_laddr.s_addr == saddr.s_addr) &&
	    (lf->lr_fport == dport) &&
	    (lf->lr_lport == sport)) {
		printf("%s: %x %x\n", __func__,
		    lf->lr_flags, lf->lr_seq);
		lf->lr_flags |= LRO_EJECT_REQ;
	}
	lck_mtx_unlock(&tcp_lro_lock);
	return 0;
}

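/*
 * Resynchronize the expected sequence number of a flow that has no
 * coalesced header yet, so LRO can pick up again at rcv_nxt.
 */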
void
tcp_update_lro_seq(__uint32_t rcv_nxt, struct in_addr saddr, struct in_addr daddr,
    unsigned short sport, unsigned short dport)
{
	int hash, flow_id;
	struct lro_flow *lf;

	hash = LRO_HASH(daddr.s_addr, saddr.s_addr, dport, sport,
	    (TCP_LRO_FLOW_MAP - 1));
	lck_mtx_lock_spin(&tcp_lro_lock);
	flow_id = lro_flow_map[hash];
	if (flow_id == TCP_LRO_FLOW_UNINIT) {
		lck_mtx_unlock(&tcp_lro_lock);
		return;
	}
	lf = &lro_flow_list[flow_id];
	if ((lf->lr_faddr.s_addr == daddr.s_addr) &&
	    (lf->lr_laddr.s_addr == saddr.s_addr) &&
	    (lf->lr_fport == dport) &&
	    (lf->lr_lport == sport) &&
	    (lf->lr_tcphdr == NULL)) {
		lf->lr_seq = (tcp_seq)rcv_nxt;
	}
	lck_mtx_unlock(&tcp_lro_lock);
}

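/*
 * Bucket the number of segments carried by a dispatched LRO packet into
 * the two/multi/large-packet tcpstat counters.
 */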
static void
lro_update_stats(struct mbuf *m)
{
	switch (m->m_pkthdr.lro_npkts) {
	case 0: /* fall through */
	case 1:
		break;

	case 2:
		tcpstat.tcps_lro_twopack++;
		break;

	case 3: /* fall through */
	case 4:
		tcpstat.tcps_lro_multpack++;
		break;

	default:
		tcpstat.tcps_lro_largepack++;
		break;
	}
}

static void
lro_update_flush_stats(struct mbuf *m)
{
	lro_flushes++;
	switch (m->m_pkthdr.lro_npkts) {
	case 0: /* fall through */
	case 1: lro_single_flushes++;
		break;
	case 2: lro_double_flushes++;
		break;
	default: lro_good_flushes++;
		break;
	}
}