/*
 * Copyright (c) 2011-2013 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/mbuf.h>
#include <sys/mcache.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <net/if_types.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/in_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_lro.h>
#include <netinet/lro_ext.h>
#include <kern/locks.h>
unsigned int lrocount = 0;		/* A counter used for debugging only */
unsigned int lro_seq_outoforder = 0;	/* Counter for debugging */
unsigned int lro_seq_mismatch = 0;	/* Counter for debugging */
unsigned int lro_flushes = 0;		/* Counter for tracking number of flushes */
unsigned int lro_single_flushes = 0;
unsigned int lro_double_flushes = 0;
unsigned int lro_good_flushes = 0;

unsigned int coalesc_sz = LRO_MX_COALESCE_PKTS;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro_sz, CTLFLAG_RW | CTLFLAG_LOCKED,
	&coalesc_sz, 0, "Max coalescing size");

unsigned int coalesc_time = LRO_MX_TIME_TO_BUFFER;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro_time, CTLFLAG_RW | CTLFLAG_LOCKED,
	&coalesc_time, 0, "Max coalescing time");

struct lro_flow lro_flow_list[TCP_LRO_NUM_FLOWS];

char lro_flow_map[TCP_LRO_FLOW_MAP];
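/*
 * Flow lookup is two-level: LRO_HASH() folds the connection 4-tuple into
 * an index into lro_flow_map[] (TCP_LRO_FLOW_MAP buckets), and each bucket
 * holds either TCP_LRO_FLOW_UNINIT or the index of the owning slot in
 * lro_flow_list[] (TCP_LRO_NUM_FLOWS entries).  Both tables are only
 * touched with tcp_lro_lock held.
 */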
static lck_attr_t *tcp_lro_mtx_attr = NULL;		/* mutex attributes */
static lck_grp_t *tcp_lro_mtx_grp = NULL;		/* mutex group */
static lck_grp_attr_t *tcp_lro_mtx_grp_attr = NULL;	/* mutex group attrs */
decl_lck_mtx_data( ,tcp_lro_lock);	/* Used to synchronize updates */

unsigned int lro_byte_count = 0;

uint64_t lro_deadline = 0;	/* LRO's sense of time - protected by tcp_lro_lock */
uint32_t lro_timer_set = 0;

u_int32_t lro_pkt_count = 0;	/* Number of packets encountered in an LRO period */
thread_call_t tcp_lro_timer;

extern u_int32_t kipf_count;
static void	tcp_lro_timer_proc(void*, void*);
static void	lro_update_stats(struct mbuf *);
static void	lro_update_flush_stats(struct mbuf *);
static void	tcp_lro_flush_flows(void);
static void	tcp_lro_sched_timer(uint64_t);
static void	lro_proto_input(struct mbuf *);

static struct mbuf *lro_tcp_xsum_validate(struct mbuf *, struct ip *,
	struct tcphdr *);
static struct mbuf *tcp_lro_process_pkt(struct mbuf *, int);
void
tcp_lro_init(void)
{
	int i;

	bzero(lro_flow_list, sizeof (struct lro_flow) * TCP_LRO_NUM_FLOWS);
	for (i = 0; i < TCP_LRO_FLOW_MAP; i++) {
		lro_flow_map[i] = TCP_LRO_FLOW_UNINIT;
	}

	/*
	 * allocate lock group attribute, group and attribute for tcp_lro_lock
	 */
	tcp_lro_mtx_grp_attr = lck_grp_attr_alloc_init();
	tcp_lro_mtx_grp = lck_grp_alloc_init("tcplro", tcp_lro_mtx_grp_attr);
	tcp_lro_mtx_attr = lck_attr_alloc_init();
	lck_mtx_init(&tcp_lro_lock, tcp_lro_mtx_grp, tcp_lro_mtx_attr);

	tcp_lro_timer = thread_call_allocate(tcp_lro_timer_proc, NULL);
	if (tcp_lro_timer == NULL) {
		panic_plain("%s: unable to allocate lro timer", __func__);
	}
}
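/*
 * tcp_lro_init() is a one-time setup routine; it is assumed to be invoked
 * from TCP's own initialization path, before any segment reaches
 * tcp_lro() below.
 */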
static int
tcp_lro_matching_tuple(struct ip* ip_hdr, struct tcphdr *tcp_hdr, int *hash,
	int *flow_id)
{
	struct lro_flow *flow;
	tcp_seq seqnum;
	unsigned int off = 0;
	int payload_len = 0;

	*hash = LRO_HASH(ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr,
		tcp_hdr->th_sport, tcp_hdr->th_dport, (TCP_LRO_FLOW_MAP - 1));

	*flow_id = lro_flow_map[*hash];
	if (*flow_id == TCP_LRO_FLOW_NOTFOUND) {
		return TCP_LRO_NAN;
	}

	seqnum = tcp_hdr->th_seq;
	off = tcp_hdr->th_off << 2;
	payload_len = ip_hdr->ip_len - off;

	flow = &lro_flow_list[*flow_id];

	if ((flow->lr_faddr.s_addr == ip_hdr->ip_src.s_addr) &&
	    (flow->lr_laddr.s_addr == ip_hdr->ip_dst.s_addr) &&
	    (flow->lr_fport == tcp_hdr->th_sport) &&
	    (flow->lr_lport == tcp_hdr->th_dport)) {
		if (flow->lr_tcphdr == NULL) {
			if (ntohl(seqnum) == flow->lr_seq) {
				return TCP_LRO_COALESCE;
			}
			if (lrodebug) {
				printf("%s: seqnum = %x, lr_seq = %x\n",
					__func__, ntohl(seqnum), flow->lr_seq);
			}
			lro_seq_mismatch++;
			if (SEQ_GT(ntohl(seqnum), flow->lr_seq)) {
				lro_seq_outoforder++;
				/*
				 * Whenever we receive out of order packets it
				 * signals loss and recovery and LRO doesn't
				 * let flows recover quickly. So eject.
				 */
				flow->lr_flags |= LRO_EJECT_REQ;
			}
			return TCP_LRO_NAN;
		}

		if (flow->lr_flags & LRO_EJECT_REQ) {
			if (lrodebug)
				printf("%s: eject. \n", __func__);
			return TCP_LRO_EJECT_FLOW;
		}
		if (SEQ_GT(tcp_hdr->th_ack, flow->lr_tcphdr->th_ack)) {
			if (lrodebug) {
				printf("%s: th_ack = %x flow_ack = %x \n",
					__func__, tcp_hdr->th_ack,
					flow->lr_tcphdr->th_ack);
			}
			return TCP_LRO_EJECT_FLOW;
		}

		if (ntohl(seqnum) == (ntohl(lro_flow_list[*flow_id].lr_tcphdr->th_seq) + lro_flow_list[*flow_id].lr_len)) {
			return TCP_LRO_COALESCE;
		} else {
			/* LRO does not handle loss recovery well, eject */
			flow->lr_flags |= LRO_EJECT_REQ;
			return TCP_LRO_EJECT_FLOW;
		}
	}

	if (lrodebug) printf("tcp_lro_matching_tuple: collision \n");
	return TCP_LRO_COLLISION;
}
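/*
 * The return codes above drive tcp_lro_process_pkt() below:
 * TCP_LRO_COALESCE appends the segment to the matched flow,
 * TCP_LRO_EJECT_FLOW flushes the coalesced chain before the segment is
 * passed up, and TCP_LRO_COLLISION (hash bucket owned by a different
 * connection) sends the segment down the normal input path.
 */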
static void
tcp_lro_init_flow(int flow_id, struct ip *ip_hdr, struct tcphdr *tcp_hdr,
	int hash, u_int32_t timestamp, int payload_len)
{
	struct lro_flow *flow = NULL;

	flow = &lro_flow_list[flow_id];

	flow->lr_hash_map = hash;
	flow->lr_faddr.s_addr = ip_hdr->ip_src.s_addr;
	flow->lr_laddr.s_addr = ip_hdr->ip_dst.s_addr;
	flow->lr_fport = tcp_hdr->th_sport;
	flow->lr_lport = tcp_hdr->th_dport;
	lro_flow_map[hash] = flow_id;
	flow->lr_timestamp = timestamp;
	flow->lr_seq = ntohl(tcp_hdr->th_seq) + payload_len;
}
static void
tcp_lro_coalesce(int flow_id, struct mbuf *lro_mb, struct tcphdr *tcphdr,
	int payload_len, int drop_hdrlen, struct tcpopt *topt,
	u_int32_t *tsval, u_int32_t *tsecr, int thflags)
{
	struct lro_flow *flow = NULL;
	struct mbuf *last;
	struct ip *ip = NULL;

	flow = &lro_flow_list[flow_id];
	if (flow->lr_mhead) {
		if (lrodebug)
			printf("%s: lr_mhead %x %d \n", __func__, flow->lr_seq,
				payload_len);
		m_adj(lro_mb, drop_hdrlen);

		last = flow->lr_mtail;
		while (last->m_next != NULL) {
			last = last->m_next;
		}
		last->m_next = lro_mb;

		flow->lr_mtail = lro_mb;

		ip = mtod(flow->lr_mhead, struct ip *);
		ip->ip_len += lro_mb->m_pkthdr.len;
		flow->lr_mhead->m_pkthdr.len += lro_mb->m_pkthdr.len;

		if (flow->lr_len == 0) {
			panic_plain("%s: Inconsistent LRO flow state", __func__);
		}
		flow->lr_len += payload_len;
		flow->lr_seq += payload_len;
		/*
		 * This bit is re-OR'd each time a packet is added to the
		 * large coalesced packet.
		 */
		flow->lr_mhead->m_pkthdr.pkt_flags |= PKTF_SW_LRO_PKT;
		flow->lr_mhead->m_pkthdr.lro_npkts++; /* for tcpstat.tcps_rcvpack */
		if (flow->lr_mhead->m_pkthdr.lro_pktlen <
		    lro_mb->m_pkthdr.lro_pktlen) {
			/*
			 * For TCP Inter Arrival Jitter calculation, return max
			 * size encountered while coalescing a stream of pkts.
			 */
			flow->lr_mhead->m_pkthdr.lro_pktlen =
				lro_mb->m_pkthdr.lro_pktlen;
		}
		/* Update the timestamp value */
		if (topt->to_flags & TOF_TS) {
			if ((flow->lr_tsval) &&
			    (TSTMP_GT(topt->to_tsval, ntohl(*(flow->lr_tsval))))) {
				*(flow->lr_tsval) = htonl(topt->to_tsval);
			}
			if ((flow->lr_tsecr) &&
			    (topt->to_tsecr != 0) &&
			    (TSTMP_GT(topt->to_tsecr, ntohl(*(flow->lr_tsecr))))) {
				if (lrodebug) {
					printf("%s: instantaneous RTT = %d \n", __func__,
						topt->to_tsecr - ntohl(*(flow->lr_tsecr)));
				}
				*(flow->lr_tsecr) = htonl(topt->to_tsecr);
			}
		}
		/* Coalesce the flags */
		if (thflags) {
			flow->lr_tcphdr->th_flags |= thflags;
		}
		/* Update receive window */
		flow->lr_tcphdr->th_win = tcphdr->th_win;
	} else {
		if (lro_mb) {
			flow->lr_mhead = flow->lr_mtail = lro_mb;
			flow->lr_mhead->m_pkthdr.pkt_flags |= PKTF_SW_LRO_PKT;
			flow->lr_tcphdr = tcphdr;
			if ((topt) && (topt->to_flags & TOF_TS)) {
				ASSERT(tsval != NULL);
				ASSERT(tsecr != NULL);
				flow->lr_tsval = tsval;
				flow->lr_tsecr = tsecr;
			}
			flow->lr_len = payload_len;
			calculate_tcp_clock();
			flow->lr_timestamp = tcp_now;
			tcp_lro_sched_timer(0);
		}
		flow->lr_seq = ntohl(tcphdr->th_seq) + payload_len;
	}
	if (lro_mb) {
		tcpstat.tcps_coalesced_pack++;
	}
	return;
}
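/*
 * Rough picture of the coalescing above: the first segment of a flow is
 * kept intact and becomes lr_mhead; each later segment has its IP/TCP
 * headers trimmed with m_adj(lro_mb, drop_hdrlen) and is chained behind
 * lr_mtail, while ip_len, m_pkthdr.len, lr_len and lr_seq all grow by the
 * new payload.  lr_tsval/lr_tsecr point into the first segment's
 * timestamp option, so fresher TSval/TSecr values are rewritten in place.
 */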
static struct mbuf *
tcp_lro_eject_flow(int flow_id)
{
	struct mbuf *mb = NULL;

	mb = lro_flow_list[flow_id].lr_mhead;
	ASSERT(lro_flow_map[lro_flow_list[flow_id].lr_hash_map] == flow_id);
	lro_flow_map[lro_flow_list[flow_id].lr_hash_map] = TCP_LRO_FLOW_UNINIT;
	bzero(&lro_flow_list[flow_id], sizeof(struct lro_flow));

	return mb;
}
static struct mbuf *
tcp_lro_eject_coalesced_pkt(int flow_id)
{
	struct mbuf *mb = NULL;
	mb = lro_flow_list[flow_id].lr_mhead;
	lro_flow_list[flow_id].lr_mhead =
		lro_flow_list[flow_id].lr_mtail = NULL;
	lro_flow_list[flow_id].lr_tcphdr = NULL;
	return mb;
}
static struct mbuf *
tcp_lro_insert_flow(struct mbuf *lro_mb, struct ip *ip_hdr,
	struct tcphdr *tcp_hdr, int payload_len,
	int drop_hdrlen, int hash, struct tcpopt *topt,
	u_int32_t *tsval, u_int32_t *tsecr)
{
	int i;
	int slot_available = 0;
	int candidate_flow = 0;
	u_int32_t oldest_timestamp;
	struct mbuf *mb = NULL;
	int collision = 0;

	oldest_timestamp = tcp_now;

	/* handle collision */
	if (lro_flow_map[hash] != TCP_LRO_FLOW_UNINIT) {
		if (lrodebug) {
			collision = 1;
		}
		candidate_flow = lro_flow_map[hash];
		tcpstat.tcps_flowtbl_collision++;
		goto kick_flow;
	}

	for (i = 0; i < TCP_LRO_NUM_FLOWS; i++) {
		if (lro_flow_list[i].lr_mhead == NULL) {
			candidate_flow = i;
			slot_available = 1;
			break;
		}
		if (oldest_timestamp >= lro_flow_list[i].lr_timestamp) {
			candidate_flow = i;
			oldest_timestamp = lro_flow_list[i].lr_timestamp;
		}
	}

	if (!slot_available) {
		tcpstat.tcps_flowtbl_full++;
kick_flow:
		/* kick the oldest flow */
		mb = tcp_lro_eject_flow(candidate_flow);

		if (lrodebug) {
			if (!slot_available) {
				printf("%s: slot unavailable.\n",__func__);
			}
			if (collision) {
				printf("%s: collision.\n",__func__);
			}
		}
	} else {
		candidate_flow = i; /* this is now the flow to be used */
	}

	tcp_lro_init_flow(candidate_flow, ip_hdr, tcp_hdr, hash,
		tcp_now, payload_len);
	tcp_lro_coalesce(candidate_flow, lro_mb, tcp_hdr, payload_len,
		drop_hdrlen, topt, tsval, tsecr, 0);
	return mb;
}
static struct mbuf *
tcp_lro_process_pkt(struct mbuf *lro_mb, int drop_hdrlen)
{
	int flow_id = TCP_LRO_FLOW_UNINIT;
	int hash;
	unsigned int off = 0;
	int eject_flow = 0;
	int optlen;
	int retval = 0;
	struct mbuf *mb = NULL;
	int payload_len = 0;
	u_char *optp = NULL;
	int thflags = 0;
	struct tcpopt to;
	int ret_response = TCP_LRO_CONSUMED;
	int coalesced = 0, tcpflags = 0, unknown_tcpopts = 0;
	u_int8_t ecn;
	struct ip *ip_hdr;
	struct tcphdr *tcp_hdr;

	if (lro_mb->m_len < drop_hdrlen) {
		if ((lro_mb = m_pullup(lro_mb, drop_hdrlen)) == NULL) {
			tcpstat.tcps_rcvshort++;
			if (lrodebug) {
				printf("tcp_lro_process_pkt:mbuf too short.\n");
			}
			return NULL;
		}
	}

	ip_hdr = mtod(lro_mb, struct ip *);
	tcp_hdr = (struct tcphdr *)((caddr_t)ip_hdr + sizeof(struct ip));

	lro_mb->m_pkthdr.pkt_flags &= ~PKTF_SW_LRO_DID_CSUM;

	if ((lro_mb = lro_tcp_xsum_validate(lro_mb, ip_hdr, tcp_hdr)) == NULL) {
		if (lrodebug) {
			printf("tcp_lro_process_pkt: TCP xsum failed.\n");
		}
		return NULL;
	}

	/* Update stats */
	lro_pkt_count++;

	/* Avoids checksumming in tcp_input */
	lro_mb->m_pkthdr.pkt_flags |= PKTF_SW_LRO_DID_CSUM;

	off = tcp_hdr->th_off << 2;
	optlen = off - sizeof (struct tcphdr);
	payload_len = ip_hdr->ip_len - off;
	optp = (u_char *)(tcp_hdr + 1);
	/*
	 * Do quick retrieval of timestamp options ("options
	 * prediction?"). If timestamp is the only option and it's
	 * formatted as recommended in RFC 1323 appendix A, we
	 * quickly get the values now and not bother calling
	 * tcp_dooptions(), etc.
	 */
	if ((optlen == TCPOLEN_TSTAMP_APPA ||
	    (optlen > TCPOLEN_TSTAMP_APPA &&
	    optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
	    *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
	    (tcp_hdr->th_flags & TH_SYN) == 0) {
		to.to_flags |= TOF_TS;
		to.to_tsval = ntohl(*(u_int32_t *)(void *)(optp + 4));
		to.to_tsecr = ntohl(*(u_int32_t *)(void *)(optp + 8));
	} else {
		/*
		 * If TCP timestamps are not in use, or not the first option,
		 * skip LRO path since timestamps are used to avoid LRO
		 * from introducing additional latencies for retransmissions
		 * and other slow-paced transmissions.
		 */
		to.to_flags = to.to_tsecr = 0;
		eject_flow = 1;
	}
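	/*
	 * The "appendix A" layout matched above is the 12-byte option block
	 * recommended by RFC 1323:
	 *
	 *	NOP(1) NOP(1) kind=8(1) len=10(1) TSval(4) TSecr(4)
	 *
	 * so optp + 4 and optp + 8 point at TSval and TSecr respectively,
	 * which is why those offsets are later handed to tcp_lro_coalesce().
	 */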
	/* list all the conditions that can trigger a flow ejection here */

	thflags = tcp_hdr->th_flags;
	if (thflags & (TH_SYN | TH_URG | TH_ECE | TH_CWR | TH_PUSH | TH_RST | TH_FIN)) {
		eject_flow = tcpflags = 1;
	}

	if (optlen && !((optlen == TCPOLEN_TSTAMP_APPA) &&
	    (to.to_flags & TOF_TS))) {
		eject_flow = unknown_tcpopts = 1;
	}

	if (payload_len <= LRO_MIN_COALESC_SZ) { /* zero payload ACK */
		eject_flow = 1;
	}

	/* Can't coalesce ECN marked packets. */
	ecn = ip_hdr->ip_tos & IPTOS_ECN_MASK;
	if (ecn == IPTOS_ECN_CE) {
		/*
		 * ECN needs quick notification
		 */
		if (lrodebug) {
			printf("%s: ECE bits set.\n", __func__);
		}
		eject_flow = 1;
	}

	lck_mtx_lock_spin(&tcp_lro_lock);

	retval = tcp_lro_matching_tuple(ip_hdr, tcp_hdr, &hash, &flow_id);

	switch (retval) {
	case TCP_LRO_NAN:
		lck_mtx_unlock(&tcp_lro_lock);
		ret_response = TCP_LRO_FLOW_NOTFOUND;
		break;

	case TCP_LRO_COALESCE:
		if ((payload_len != 0) && (unknown_tcpopts == 0) &&
		    (tcpflags == 0) && (ecn != IPTOS_ECN_CE) && (to.to_flags & TOF_TS)) {
			tcp_lro_coalesce(flow_id, lro_mb, tcp_hdr, payload_len,
				drop_hdrlen, &to,
				(to.to_flags & TOF_TS) ? (u_int32_t *)(void *)(optp + 4) : NULL,
				(to.to_flags & TOF_TS) ? (u_int32_t *)(void *)(optp + 8) : NULL,
				thflags);
			if (lrodebug) {
				printf("tcp_lro_process_pkt: coalesce len = %d. flow_id = %d payload_len = %d drop_hdrlen = %d optlen = %d lport = %d seqnum = %x.\n",
					lro_flow_list[flow_id].lr_len, flow_id,
					payload_len, drop_hdrlen, optlen,
					ntohs(lro_flow_list[flow_id].lr_lport),
					ntohl(tcp_hdr->th_seq));
			}
			if (lro_flow_list[flow_id].lr_mhead->m_pkthdr.lro_npkts >= coalesc_sz) {
				eject_flow = 1;
			}
			coalesced = 1;
		}
		if (eject_flow) {
			mb = tcp_lro_eject_coalesced_pkt(flow_id);
			lro_flow_list[flow_id].lr_seq = ntohl(tcp_hdr->th_seq) +
				payload_len;
			calculate_tcp_clock();
			u_int8_t timestamp = tcp_now - lro_flow_list[flow_id].lr_timestamp;
			lck_mtx_unlock(&tcp_lro_lock);
			if (mb) {
				mb->m_pkthdr.lro_elapsed = timestamp;
				lro_proto_input(mb);
			}
			if (!coalesced) {
				if (lrodebug) {
					printf("%s: pkt payload_len = %d \n", __func__, payload_len);
				}
				lro_proto_input(lro_mb);
			}
		} else {
			lck_mtx_unlock(&tcp_lro_lock);
		}
		break;

	case TCP_LRO_EJECT_FLOW:
		mb = tcp_lro_eject_coalesced_pkt(flow_id);
		calculate_tcp_clock();
		u_int8_t timestamp = tcp_now - lro_flow_list[flow_id].lr_timestamp;
		lck_mtx_unlock(&tcp_lro_lock);
		if (mb) {
			if (lrodebug)
				printf("tcp_lro_process_pkt eject_flow, len = %d\n", mb->m_pkthdr.len);
			mb->m_pkthdr.lro_elapsed = timestamp;
			lro_proto_input(mb);
		}

		lro_proto_input(lro_mb);
		break;

	case TCP_LRO_COLLISION:
		lck_mtx_unlock(&tcp_lro_lock);
		ret_response = TCP_LRO_FLOW_NOTFOUND;
		break;

	default:
		lck_mtx_unlock(&tcp_lro_lock);
		panic_plain("%s: unrecognized type %d", __func__, retval);
		break;
	}

	if (ret_response == TCP_LRO_FLOW_NOTFOUND) {
		lro_proto_input(lro_mb);
	}
	return NULL;
}
static void
tcp_lro_timer_proc(void *arg1, void *arg2)
{
#pragma unused(arg1, arg2)

	lck_mtx_lock_spin(&tcp_lro_lock);
	lro_timer_set = 0;
	lck_mtx_unlock(&tcp_lro_lock);
	tcp_lro_flush_flows();
}
static void
tcp_lro_flush_flows(void)
{
	int i = 0;
	struct mbuf *mb;
	struct lro_flow *flow;
	int tcpclock_updated = 0;

	lck_mtx_lock(&tcp_lro_lock);

	while (i < TCP_LRO_NUM_FLOWS) {
		flow = &lro_flow_list[i];
		if (flow->lr_mhead != NULL) {

			if (!tcpclock_updated) {
				calculate_tcp_clock();
				tcpclock_updated = 1;
			}

			if (lrodebug)
				printf("tcp_lro_flush_flows: len =%d n_pkts = %d %d %d \n",
					flow->lr_len,
					flow->lr_mhead->m_pkthdr.lro_npkts,
					flow->lr_timestamp, tcp_now);

			u_int8_t timestamp = tcp_now - flow->lr_timestamp;

			mb = tcp_lro_eject_flow(i);

			if (mb) {
				mb->m_pkthdr.lro_elapsed = timestamp;
				lck_mtx_unlock(&tcp_lro_lock);
				lro_update_flush_stats(mb);
				lro_proto_input(mb);
				lck_mtx_lock(&tcp_lro_lock);
			}
		}
		i++;
	}
	lck_mtx_unlock(&tcp_lro_lock);
}
/*
 * Must be called with tcp_lro_lock held.
 * The hint is non-zero for longer waits. The wait time dictated by coalesc_time
 * takes precedence, so lro_timer_set is not set for the hint case
 */
static void
tcp_lro_sched_timer(uint64_t hint)
{
	if (lro_timer_set) {
		return;
	}

	if (!hint) {
		lro_timer_set = 1;
		/* the intent is to wake up every coalesc_time msecs */
		clock_interval_to_deadline(coalesc_time,
			(NSEC_PER_SEC / TCP_RETRANSHZ), &lro_deadline);
	} else {
		clock_interval_to_deadline(hint, NSEC_PER_SEC / TCP_RETRANSHZ,
			&lro_deadline);
	}
	thread_call_enter_delayed(tcp_lro_timer, lro_deadline);
}
struct mbuf *
tcp_lro(struct mbuf *m, unsigned int hlen)
{
	struct ip *ip_hdr;
	unsigned int tlen;
	struct tcphdr *tcp_hdr = NULL;
	unsigned int off = 0;

	if (kipf_count != 0)
		return (m);

	/*
	 * Experiments on cellular show that the RTT is much higher
	 * than the coalescing time of 5 msecs, causing lro to flush
	 * 80% of the time on a single packet. Increasing
	 * coalescing time for cellular does not show marked
	 * improvement to throughput either. Loopback perf is hurt
	 * by the 5 msec latency and it already sends large packets.
	 */
	if (IFNET_IS_CELLULAR(m->m_pkthdr.rcvif) ||
	    (m->m_pkthdr.rcvif->if_type == IFT_LOOP)) {
		return (m);
	}

	ip_hdr = mtod(m, struct ip *);

	/* don't deal with IP options */
	if (hlen != sizeof (struct ip))
		return (m);

	/* only TCP is coalesced */
	if (ip_hdr->ip_p != IPPROTO_TCP) {
		return (m);
	}

	if (m->m_len < (int32_t) sizeof (struct tcpiphdr)) {
		if (lrodebug) printf("tcp_lro m_pullup \n");
		if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == NULL) {
			tcpstat.tcps_rcvshort++;
			if (lrodebug) {
				printf("ip_lro: rcvshort.\n");
			}
			return NULL;
		}
		ip_hdr = mtod(m, struct ip *);
	}

	tcp_hdr = (struct tcphdr *)((caddr_t)ip_hdr + hlen);
	tlen = ip_hdr->ip_len; //ignore IP header bytes len
	m->m_pkthdr.lro_pktlen = tlen; /* Used to return max pkt encountered to tcp */
	m->m_pkthdr.lro_npkts = 1; /* Initialize a counter to hold num pkts coalesced */
	m->m_pkthdr.lro_elapsed = 0; /* Initialize the field to carry elapsed time */
	off = tcp_hdr->th_off << 2;
	if (off < sizeof (struct tcphdr) || off > tlen) {
		tcpstat.tcps_rcvbadoff++;
		if (lrodebug) {
			printf("ip_lro: TCP off greater than TCP header.\n");
		}
		return (m);
	}

	return (tcp_lro_process_pkt(m, hlen + off));
}
static void
lro_proto_input(struct mbuf *m)
{
	struct ip *ip_hdr = mtod(m, struct ip *);

	if (lrodebug) {
		printf("lro_proto_input: ip_len = %d \n",
			ip_hdr->ip_len);
	}
	lro_update_stats(m);
	ip_proto_dispatch_in_wrapper(m, ip_hdr->ip_hl << 2, ip_hdr->ip_p);
}
static struct mbuf *
lro_tcp_xsum_validate(struct mbuf *m, struct ip *ip, struct tcphdr *th)
{
	/* Expect 32-bit aligned data pointer on strict-align platforms */
	MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);

	/* we shouldn't get here for IP with options; hence sizeof (ip) */
	if (tcp_input_checksum(AF_INET, m, th, sizeof (*ip), ip->ip_len)) {
		if (lrodebug)
			printf("%s: bad xsum and drop m = 0x%llx.\n", __func__,
				(uint64_t)VM_KERNEL_ADDRPERM(m));
		m_freem(m);
		return NULL;
	}

	return (m);
}
/*
 * When TCP detects a stable, steady flow without out of ordering,
 * with a sufficiently high cwnd, it invokes LRO.
 */
int
tcp_start_coalescing(struct ip *ip_hdr, struct tcphdr *tcp_hdr, int tlen)
{
	int hash;
	int flow_id;
	struct mbuf *eject_mb;
	struct lro_flow *lf;

	hash = LRO_HASH(ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr,
		tcp_hdr->th_sport, tcp_hdr->th_dport,
		(TCP_LRO_FLOW_MAP - 1));

	lck_mtx_lock_spin(&tcp_lro_lock);
	flow_id = lro_flow_map[hash];
	if (flow_id != TCP_LRO_FLOW_NOTFOUND) {
		lf = &lro_flow_list[flow_id];
		if ((lf->lr_faddr.s_addr == ip_hdr->ip_src.s_addr) &&
		    (lf->lr_laddr.s_addr == ip_hdr->ip_dst.s_addr) &&
		    (lf->lr_fport == tcp_hdr->th_sport) &&
		    (lf->lr_lport == tcp_hdr->th_dport)) {
			if ((lf->lr_tcphdr == NULL) &&
			    (lf->lr_seq != (tcp_hdr->th_seq + tlen))) {
				lf->lr_seq = tcp_hdr->th_seq + tlen;
			}
			lf->lr_flags &= ~LRO_EJECT_REQ;
		}
		lck_mtx_unlock(&tcp_lro_lock);
		return 0;
	}

	HTONL(tcp_hdr->th_seq);
	HTONL(tcp_hdr->th_ack);
	eject_mb =
		tcp_lro_insert_flow(NULL, ip_hdr, tcp_hdr, tlen, 0, hash,
			NULL, NULL, NULL);

	lck_mtx_unlock(&tcp_lro_lock);

	NTOHL(tcp_hdr->th_seq);
	NTOHL(tcp_hdr->th_ack);
	if (lrodebug) {
		printf("%s: src = %x dst = %x sport = %d dport = %d seq %x \n",
			__func__, ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr,
			tcp_hdr->th_sport, tcp_hdr->th_dport, tcp_hdr->th_seq);
	}
	ASSERT(eject_mb == NULL);
	return 0;
}
821 * When TCP detects loss or idle condition, it stops offloading
825 tcp_lro_remove_state(struct in_addr saddr
, struct in_addr daddr
,
826 unsigned short sport
, unsigned short dport
)
831 hash
= LRO_HASH(daddr
.s_addr
, saddr
.s_addr
, dport
, sport
,
832 (TCP_LRO_FLOW_MAP
- 1));
833 lck_mtx_lock_spin(&tcp_lro_lock
);
834 flow_id
= lro_flow_map
[hash
];
835 if (flow_id
== TCP_LRO_FLOW_UNINIT
) {
836 lck_mtx_unlock(&tcp_lro_lock
);
839 lf
= &lro_flow_list
[flow_id
];
840 if ((lf
->lr_faddr
.s_addr
== daddr
.s_addr
) &&
841 (lf
->lr_laddr
.s_addr
== saddr
.s_addr
) &&
842 (lf
->lr_fport
== dport
) &&
843 (lf
->lr_lport
== sport
)) {
845 printf("%s: %x %x\n", __func__
,
846 lf
->lr_flags
, lf
->lr_seq
);
848 lf
->lr_flags
|= LRO_EJECT_REQ
;
850 lck_mtx_unlock(&tcp_lro_lock
);
void
tcp_update_lro_seq(__uint32_t rcv_nxt, struct in_addr saddr, struct in_addr daddr,
	unsigned short sport, unsigned short dport)
{
	int hash, flow_id;
	struct lro_flow *lf;

	hash = LRO_HASH(daddr.s_addr, saddr.s_addr, dport, sport,
		(TCP_LRO_FLOW_MAP - 1));
	lck_mtx_lock_spin(&tcp_lro_lock);
	flow_id = lro_flow_map[hash];
	if (flow_id == TCP_LRO_FLOW_UNINIT) {
		lck_mtx_unlock(&tcp_lro_lock);
		return;
	}
	lf = &lro_flow_list[flow_id];
	if ((lf->lr_faddr.s_addr == daddr.s_addr) &&
	    (lf->lr_laddr.s_addr == saddr.s_addr) &&
	    (lf->lr_fport == dport) &&
	    (lf->lr_lport == sport) &&
	    (lf->lr_tcphdr == NULL)) {
		lf->lr_seq = (tcp_seq)rcv_nxt;
	}
	lck_mtx_unlock(&tcp_lro_lock);
}
static void
lro_update_stats(struct mbuf *m)
{
	switch (m->m_pkthdr.lro_npkts) {
	case 0: /* fall through */
	case 1:
		break;

	case 2:
		tcpstat.tcps_lro_twopack++;
		break;

	case 3: /* fall through */
	case 4:
		tcpstat.tcps_lro_multpack++;
		break;

	default:
		tcpstat.tcps_lro_largepack++;
		break;
	}
}
static void
lro_update_flush_stats(struct mbuf *m)
{
	lro_flushes++;
	switch (m->m_pkthdr.lro_npkts) {
	case 1: lro_single_flushes++;
		break;
	case 2: lro_double_flushes++;
		break;
	default: lro_good_flushes++;
		break;
	}
}