/* bsd/netinet/tcp_lro.c — Apple xnu-2050.48.11 (software TCP LRO) */
1/*
2 * Copyright (c) 2011 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <sys/param.h>
30#include <sys/systm.h>
31#include <sys/sysctl.h>
32#include <sys/mbuf.h>
33#include <sys/mcache.h>
34#include <sys/socket.h>
35#include <sys/socketvar.h>
36#include <net/if_types.h>
37#include <net/route.h>
38#include <netinet/in.h>
39#include <netinet/in_systm.h>
40#include <net/if.h>
41#include <netinet/ip.h>
42#include <netinet/ip_var.h>
43#include <netinet/in_var.h>
44#include <netinet/tcp.h>
45#include <netinet/tcp_seq.h>
46#include <netinet/tcpip.h>
47#include <netinet/tcp_var.h>
48#include <netinet/tcp_lro.h>
49#include <netinet/lro_ext.h>
50#include <kern/locks.h>
51
52unsigned int lrocount = 0; /* A counter used for debugging only */
53unsigned int lro_seq_outoforder = 0; /* Counter for debugging */
54unsigned int lro_seq_mismatch = 0; /* Counter for debugging */
55unsigned int lro_eject_req = 0; /* Counter for tracking flow ejections */
56unsigned int lro_flushes = 0; /* Counter for tracking number of flushes */
57unsigned int lro_single_flushes = 0;
58unsigned int lro_double_flushes = 0;
59unsigned int lro_good_flushes = 0;
60
61unsigned int coalesc_sz = LRO_MX_COALESCE_PKTS;
62SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro_sz, CTLFLAG_RW | CTLFLAG_LOCKED,
63 &coalesc_sz, 0, "Max coalescing size");
64
65unsigned int coalesc_time = LRO_MX_TIME_TO_BUFFER;
66SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro_time, CTLFLAG_RW | CTLFLAG_LOCKED,
67 &coalesc_time, 0, "Max coalescing time");
68
69struct lro_flow lro_flow_list[TCP_LRO_NUM_FLOWS];
70
71char lro_flow_map[TCP_LRO_FLOW_MAP];
72
73static lck_attr_t *tcp_lro_mtx_attr = NULL; /* mutex attributes */
74static lck_grp_t *tcp_lro_mtx_grp = NULL; /* mutex group */
75static lck_grp_attr_t *tcp_lro_mtx_grp_attr = NULL; /* mutex group attrs */
76decl_lck_mtx_data( ,tcp_lro_lock); /* Used to synchronize updates */
77
78unsigned int lro_byte_count = 0;
79
80uint64_t lro_deadline = 0; /* LRO's sense of time - protected by tcp_lro_lock */
81uint32_t lro_timer_set = 0;
82
83/* Some LRO stats */
84u_int32_t lro_pkt_count = 0; /* Number of packets encountered in an LRO period */
85thread_call_t tcp_lro_timer;
86
87extern u_int32_t kipf_count;
88
89static void tcp_lro_timer_proc(void*, void*);
90static void lro_update_stats(struct mbuf*);
91static void lro_update_flush_stats(struct mbuf *);
92static void tcp_lro_flush_flows(void);
93static void tcp_lro_sched_timer(uint64_t);
94static void lro_proto_input(struct mbuf *);
95
96static struct mbuf *lro_tcp_xsum_validate(struct mbuf*, struct ipovly *,
97 struct tcphdr*);
98static struct mbuf *tcp_lro_process_pkt(struct mbuf*, struct ip*, struct tcphdr*,
99 int);
100
101void
102tcp_lro_init(void)
103{
104 int i;
105
106 bzero(lro_flow_list, sizeof (struct lro_flow) * TCP_LRO_NUM_FLOWS);
107 for (i = 0; i < TCP_LRO_FLOW_MAP; i++) {
108 lro_flow_map[i] = TCP_LRO_FLOW_UNINIT;
109 }
110
111 /*
112 * allocate lock group attribute, group and attribute for tcp_lro_lock
113 */
114 tcp_lro_mtx_grp_attr = lck_grp_attr_alloc_init();
115 tcp_lro_mtx_grp = lck_grp_alloc_init("tcplro", tcp_lro_mtx_grp_attr);
116 tcp_lro_mtx_attr = lck_attr_alloc_init();
117 lck_mtx_init(&tcp_lro_lock, tcp_lro_mtx_grp, tcp_lro_mtx_attr);
118
119 tcp_lro_timer = thread_call_allocate(tcp_lro_timer_proc, NULL);
120 if (tcp_lro_timer == NULL) {
121 panic_plain("%s: unable to allocate lro timer", __func__);
122 }
123
124 return;
125}
126
127static int
128tcp_lro_matching_tuple(struct ip* ip_hdr, struct tcphdr *tcp_hdr, int *hash,
129 int *flow_id )
130{
131 struct lro_flow *flow;
132 tcp_seq seqnum;
133 unsigned int off = 0;
134 int payload_len = 0;
135
136 *hash = LRO_HASH(ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr,
137 tcp_hdr->th_sport, tcp_hdr->th_dport, (TCP_LRO_FLOW_MAP - 1));
138
139 *flow_id = lro_flow_map[*hash];
140 if (*flow_id == TCP_LRO_FLOW_NOTFOUND) {
141 return TCP_LRO_NAN;
142 }
143
144 seqnum = tcp_hdr->th_seq;
145 off = tcp_hdr->th_off << 2;
146 payload_len = ip_hdr->ip_len - off;
147
148 flow = &lro_flow_list[*flow_id];
149
150 if ((flow->lr_faddr.s_addr == ip_hdr->ip_src.s_addr) &&
151 (flow->lr_laddr.s_addr == ip_hdr->ip_dst.s_addr) &&
152 (flow->lr_fport == tcp_hdr->th_sport) &&
153 (flow->lr_lport == tcp_hdr->th_dport)) {
154 if (flow->lr_tcphdr == NULL) {
155 if (ntohl(seqnum) == flow->lr_seq) {
156 return TCP_LRO_COALESCE;
157 }
158 if (lrodebug >= 4) {
159 printf("%s: seqnum = %x, lr_seq = %x\n",
160 __func__, ntohl(seqnum), flow->lr_seq);
161 }
162 lro_seq_mismatch++;
163 if (SEQ_GT(ntohl(seqnum), flow->lr_seq)) {
164 lro_seq_outoforder++;
165 /*
166 * Whenever we receive out of order packets it
167 * signals loss and recovery and LRO doesn't
168 * let flows recover quickly. So eject.
169 */
170 flow->lr_flags |= LRO_EJECT_REQ;
171
172 }
173 return TCP_LRO_NAN;
174 }
175
176 if (flow->lr_flags & LRO_EJECT_REQ) {
177 if (lrodebug)
178 printf("%s: eject. \n", __func__);
179 return TCP_LRO_EJECT_FLOW;
180 }
181 if (SEQ_GT(tcp_hdr->th_ack, flow->lr_tcphdr->th_ack)) {
182 if (lrodebug) {
183 printf("%s: th_ack = %x flow_ack = %x \n",
184 __func__, tcp_hdr->th_ack,
185 flow->lr_tcphdr->th_ack);
186 }
187 return TCP_LRO_EJECT_FLOW;
188 }
189
190 if (ntohl(seqnum) == (ntohl(lro_flow_list[*flow_id].lr_tcphdr->th_seq) + lro_flow_list[*flow_id].lr_len)) {
191 return TCP_LRO_COALESCE;
192 } else {
193 /* LRO does not handle loss recovery well, eject */
194 flow->lr_flags |= LRO_EJECT_REQ;
195 return TCP_LRO_EJECT_FLOW;
196 }
197 }
198 if (lrodebug) printf("tcp_lro_matching_tuple: collision \n");
199 return TCP_LRO_COLLISION;
200}
201
202static void
203tcp_lro_init_flow(int flow_id, struct ip* ip_hdr, struct tcphdr *tcp_hdr,
204 int hash, u_int32_t timestamp, int payload_len)
205{
206 struct lro_flow *flow = NULL;
207
208 flow = &lro_flow_list[flow_id];
209
210 flow->lr_hash_map = hash;
211 flow->lr_faddr.s_addr = ip_hdr->ip_src.s_addr;
212 flow->lr_laddr.s_addr = ip_hdr->ip_dst.s_addr;
213 flow->lr_fport = tcp_hdr->th_sport;
214 flow->lr_lport = tcp_hdr->th_dport;
215 lro_flow_map[hash] = flow_id;
216 flow->lr_timestamp = timestamp;
217 flow->lr_seq = ntohl(tcp_hdr->th_seq) + payload_len;
218 flow->lr_flags = 0;
219 return;
220}
221
/*
 * Fold a segment into an LRO flow.
 *
 * If the flow already holds a coalesced chain (lr_mhead != NULL), the
 * segment's headers are stripped and its payload appended to the chain,
 * updating the chain's IP length, packet-header length, TCP timestamp
 * options, flags, and window.  Otherwise the segment (headers intact)
 * becomes the head of a new chain.
 *
 * tsval/tsecr point INTO the head packet's TCP option bytes, so later
 * segments can update the stored timestamps in place.
 * Caller holds tcp_lro_lock.
 */
static void
tcp_lro_coalesce(int flow_id, struct mbuf *lro_mb, struct tcphdr *tcphdr,
	int payload_len, int drop_hdrlen, struct tcpopt *topt,
	u_int32_t* tsval, u_int32_t* tsecr, int thflags)
{
	struct lro_flow *flow = NULL;
	struct mbuf *last;
	struct ip *ip = NULL;

	flow = &lro_flow_list[flow_id];
	if (flow->lr_mhead) {
		if (lrodebug)
			printf("%s: lr_mhead %x %d \n", __func__, flow->lr_seq,
				payload_len);
		/* Strip IP+TCP headers; only payload is chained. */
		m_adj(lro_mb, drop_hdrlen);

		/* Append at the true end of the tail's mbuf chain. */
		last = flow->lr_mtail;
		while (last->m_next != NULL) {
			last = last->m_next;
		}
		last->m_next = lro_mb;

		flow->lr_mtail = lro_mb;

		/* Grow the head packet's IP length and pkthdr length. */
		ip = mtod(flow->lr_mhead, struct ip *);
		ip->ip_len += lro_mb->m_pkthdr.len;
		flow->lr_mhead->m_pkthdr.len += lro_mb->m_pkthdr.len;

		if (flow->lr_len == 0) {
			panic_plain("%s: Inconsistent LRO flow state", __func__);
		}
		flow->lr_len += payload_len;
		flow->lr_seq += payload_len;
		/*
		 * This bit is re-OR'd each time a packet is added to the
		 * large coalesced packet.
		 */
		flow->lr_mhead->m_pkthdr.aux_flags |= MAUXF_SW_LRO_PKT;
		flow->lr_mhead->m_pkthdr.lro_npkts++; /* for tcpstat.tcps_rcvpack */
		if (flow->lr_mhead->m_pkthdr.lro_pktlen <
			lro_mb->m_pkthdr.lro_pktlen) {
			/*
			 * For TCP Inter Arrival Jitter calculation, return max
			 * size encountered while coalescing a stream of pkts.
			 */
			flow->lr_mhead->m_pkthdr.lro_pktlen =
				lro_mb->m_pkthdr.lro_pktlen;
		}
		/* Update the timestamp value (written in place into the
		 * head packet's option bytes, kept in network order). */
		if (topt->to_flags & TOF_TS) {
			if ((flow->lr_tsval) &&
				(TSTMP_GT(topt->to_tsval, ntohl(*(flow->lr_tsval))))) {
				*(flow->lr_tsval) = htonl(topt->to_tsval);
			}
			if ((flow->lr_tsecr) &&
				(topt->to_tsecr != 0) &&
				(TSTMP_GT(topt->to_tsecr, ntohl(*(flow->lr_tsecr))))) {
				if (lrodebug >= 2) {
					printf("%s: instantaneous RTT = %d \n", __func__,
						topt->to_tsecr - ntohl(*(flow->lr_tsecr)));
				}
				*(flow->lr_tsecr) = htonl(topt->to_tsecr);
			}
		}
		/* Coalesce the flags */
		if (thflags) {
			flow->lr_tcphdr->th_flags |= thflags;
		}
		/* Update receive window */
		flow->lr_tcphdr->th_win = tcphdr->th_win;
	} else {
		/* First segment: it becomes the head of the chain with its
		 * headers intact.  lro_mb may be NULL when called from
		 * tcp_lro_insert_flow() just to seed lr_seq. */
		if (lro_mb) {
			flow->lr_mhead = flow->lr_mtail = lro_mb;
			flow->lr_mhead->m_pkthdr.aux_flags |= MAUXF_SW_LRO_PKT;
			flow->lr_tcphdr = tcphdr;
			if ((topt) && (topt->to_flags & TOF_TS)) {
				ASSERT(tsval != NULL);
				ASSERT(tsecr != NULL);
				flow->lr_tsval = tsval;
				flow->lr_tsecr = tsecr;
			}
			flow->lr_len = payload_len;
			flow->lr_timestamp = tcp_now;
			tcp_lro_sched_timer(0);
		}
		flow->lr_seq = ntohl(tcphdr->th_seq) + payload_len;
	}
	if (lro_mb) {
		tcpstat.tcps_coalesced_pack++;
	}
	return;
}
314
315static struct mbuf *
316tcp_lro_eject_flow(int flow_id)
317{
318 struct mbuf *mb = NULL;
319
320 mb = lro_flow_list[flow_id].lr_mhead;
321 ASSERT(lro_flow_map[lro_flow_list[flow_id].lr_hash_map] == flow_id);
322 lro_flow_map[lro_flow_list[flow_id].lr_hash_map] = TCP_LRO_FLOW_UNINIT;
323 bzero(&lro_flow_list[flow_id], sizeof(struct lro_flow));
324
325 return mb;
326}
327
328static struct mbuf*
329tcp_lro_eject_coalesced_pkt(int flow_id)
330{
331 struct mbuf *mb = NULL;
332 mb = lro_flow_list[flow_id].lr_mhead;
333 lro_flow_list[flow_id].lr_mhead =
334 lro_flow_list[flow_id].lr_mtail = NULL;
335 lro_flow_list[flow_id].lr_tcphdr = NULL;
336 return mb;
337}
338
/*
 * Install a new flow in the table.
 *
 * If the hash slot is taken by another connection, or the table is full,
 * an existing flow is ejected to make room; its coalesced chain (if any)
 * is returned so the caller can deliver it to IP input.  Otherwise
 * returns NULL.  lro_mb may be NULL (tcp_start_coalescing() uses this to
 * pre-register a flow without buffering a packet).
 * Caller holds tcp_lro_lock.
 */
static struct mbuf*
tcp_lro_insert_flow(struct mbuf *lro_mb, struct ip *ip_hdr,
	struct tcphdr *tcp_hdr, int payload_len,
	int drop_hdrlen, int hash, struct tcpopt *topt,
	u_int32_t *tsval, u_int32_t *tsecr)
{
	int i;
	int slot_available = 0;
	int candidate_flow = 0;
	u_int32_t oldest_timestamp;
	struct mbuf *mb = NULL;
	int collision = 0;

	oldest_timestamp = tcp_now;

	/* handle collision: slot occupied by a different tuple — evict it.
	 * Note the goto jumps into the !slot_available branch below. */
	if (lro_flow_map[hash] != TCP_LRO_FLOW_UNINIT) {
		if (lrodebug) {
			collision = 1;
		}
		candidate_flow = lro_flow_map[hash];
		tcpstat.tcps_flowtbl_collision++;
		goto kick_flow;
	}

	/* Find a free slot; failing that, remember the oldest flow. */
	for (i = 0; i < TCP_LRO_NUM_FLOWS; i++) {
		if (lro_flow_list[i].lr_mhead == NULL) {
			candidate_flow = i;
			slot_available = 1;
			break;
		}
		if (oldest_timestamp >= lro_flow_list[i].lr_timestamp) {
			candidate_flow = i;
			oldest_timestamp = lro_flow_list[i].lr_timestamp;
		}
	}

	if (!slot_available) {
		tcpstat.tcps_flowtbl_full++;
kick_flow:
		/* kick the oldest flow */
		mb = tcp_lro_eject_flow(candidate_flow);

		if (lrodebug) {
			if (!slot_available) {
				printf("%s: slot unavailable.\n",__func__);
			}
			if (collision) {
				printf("%s: collision.\n",__func__);
			}
		}
	} else {
		candidate_flow = i;	/* this is now the flow to be used */

	}

	/* Register the tuple, then seed/buffer via the coalesce path. */
	tcp_lro_init_flow(candidate_flow, ip_hdr, tcp_hdr, hash,
		tcp_now, payload_len);
	tcp_lro_coalesce(candidate_flow, lro_mb, tcp_hdr, payload_len,
		drop_hdrlen, topt, tsval, tsecr, 0);
	return mb;
}
401
402struct mbuf*
403tcp_lro_process_pkt(struct mbuf *lro_mb, struct ip *ip_hdr,
404 struct tcphdr *tcp_hdr, int drop_hdrlen)
405{
406 int flow_id = TCP_LRO_FLOW_UNINIT;
407 int hash;
408 unsigned int off = 0;
409 int eject_flow = 0;
410 int optlen;
411 int retval = 0;
412 struct mbuf *mb = NULL;
413 int payload_len = 0;
414 u_char *optp = NULL;
415 int thflags = 0;
416 struct tcpopt to;
417 int ret_response = TCP_LRO_CONSUMED;
418 int coalesced = 0, tcpflags = 0, unknown_tcpopts = 0;
419 u_int8_t ecn;
420
421 if (lro_mb->m_len < (int32_t)sizeof (struct tcpiphdr)) {
422 if ((lro_mb = m_pullup(lro_mb, sizeof(struct tcpiphdr))) == 0) {
423 tcpstat.tcps_rcvshort++;
424 m_freem(lro_mb);
425 if (lrodebug) {
426 printf("tcp_lro_process_pkt:mbuf too short.\n");
427 }
428 return NULL;
429 }
430 }
431
432 if ((lro_mb = lro_tcp_xsum_validate(lro_mb,
433 (struct ipovly*)ip_hdr, tcp_hdr)) == NULL) {
434 if (lrodebug) {
435 printf("tcp_lro_process_pkt: TCP xsum failed.\n");
436 }
437 return NULL;
438 }
439
440 /* Update stats */
441 lro_pkt_count++;
442
443 /* Avoids checksumming in tcp_input */
444 lro_mb->m_pkthdr.aux_flags |= MAUXF_SW_LRO_DID_CSUM;
445
446 off = tcp_hdr->th_off << 2;
447 optlen = off - sizeof (struct tcphdr);
448 payload_len = ip_hdr->ip_len - off;
449 optp = (u_char *)(tcp_hdr + 1);
450 /*
451 * Do quick retrieval of timestamp options ("options
452 * prediction?"). If timestamp is the only option and it's
453 * formatted as recommended in RFC 1323 appendix A, we
454 * quickly get the values now and not bother calling
455 * tcp_dooptions(), etc.
456 */
457 if ((optlen == TCPOLEN_TSTAMP_APPA ||
458 (optlen > TCPOLEN_TSTAMP_APPA &&
459 optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
460 *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
461 (tcp_hdr->th_flags & TH_SYN) == 0) {
462 to.to_flags |= TOF_TS;
463 to.to_tsval = ntohl(*(u_int32_t *)(void *)(optp + 4));
464 to.to_tsecr = ntohl(*(u_int32_t *)(void *)(optp + 8));
465 } else {
466 /*
467 * If TCP timestamps are not in use, or not the first option,
468 * skip LRO path since timestamps are used to avoid LRO
469 * from introducing additional latencies for retransmissions
470 * and other slow-paced transmissions.
471 */
472 to.to_flags = to.to_tsecr = 0;
473 eject_flow = 1;
474 }
475
476 /* list all the conditions that can trigger a flow ejection here */
477
478 thflags = tcp_hdr->th_flags;
479 if (thflags & (TH_SYN | TH_URG | TH_ECE | TH_CWR | TH_PUSH | TH_RST | TH_FIN)) {
480 eject_flow = tcpflags = 1;
481 }
482
483 if (optlen && !((optlen == TCPOLEN_TSTAMP_APPA) &&
484 (to.to_flags & TOF_TS))) {
485 eject_flow = unknown_tcpopts = 1;
486 }
487
488 if (payload_len <= LRO_MIN_COALESC_SZ) { /* zero payload ACK */
489 eject_flow = 1;
490 }
491
492 /* Can't coalesce ECN marked packets. */
493 ecn = ip_hdr->ip_tos & IPTOS_ECN_MASK;
494 if (ecn == IPTOS_ECN_CE) {
495 /*
496 * ECN needs quick notification
497 */
498 if (lrodebug) {
499 printf("%s: ECE bits set.\n", __func__);
500 }
501 eject_flow = 1;
502 }
503
504 lck_mtx_lock_spin(&tcp_lro_lock);
505
506 retval = tcp_lro_matching_tuple(ip_hdr, tcp_hdr, &hash, &flow_id);
507
508 switch (retval) {
509 case TCP_LRO_NAN:
510 lck_mtx_unlock(&tcp_lro_lock);
511 ret_response = TCP_LRO_FLOW_NOTFOUND;
512 break;
513
514 case TCP_LRO_COALESCE:
515 if ((payload_len != 0) && (unknown_tcpopts == 0) &&
99c3a104 516 (tcpflags == 0) && (ecn != IPTOS_ECN_CE) && (to.to_flags & TOF_TS)) {
316670eb
A
517 tcp_lro_coalesce(flow_id, lro_mb, tcp_hdr, payload_len,
518 drop_hdrlen, &to,
519 (to.to_flags & TOF_TS) ? (u_int32_t *)(void *)(optp + 4) : NULL,
520 (to.to_flags & TOF_TS) ? (u_int32_t *)(void *)(optp + 8) : NULL,
521 thflags);
522 if (lrodebug >= 2) {
523 printf("tcp_lro_process_pkt: coalesce len = %d. flow_id = %d payload_len = %d drop_hdrlen = %d optlen = %d lport = %d seqnum = %x.\n",
524 lro_flow_list[flow_id].lr_len, flow_id,
525 payload_len, drop_hdrlen, optlen,
526 ntohs(lro_flow_list[flow_id].lr_lport),
527 ntohl(tcp_hdr->th_seq));
528 }
529 if (lro_flow_list[flow_id].lr_mhead->m_pkthdr.lro_npkts >= coalesc_sz) {
530 eject_flow = 1;
531 }
532 coalesced = 1;
533 }
534 if (eject_flow) {
535 mb = tcp_lro_eject_coalesced_pkt(flow_id);
536 lro_flow_list[flow_id].lr_seq = ntohl(tcp_hdr->th_seq) +
537 payload_len;
538 lck_mtx_unlock(&tcp_lro_lock);
539 if (mb) {
540 lro_proto_input(mb);
541 }
542 if (!coalesced) {
543 if (lrodebug >= 2) {
544 printf("%s: pkt payload_len = %d \n", __func__, payload_len);
545 }
546 lro_proto_input(lro_mb);
547 }
548 } else {
549 lck_mtx_unlock(&tcp_lro_lock);
550 }
551 break;
552
553 case TCP_LRO_EJECT_FLOW:
554 mb = tcp_lro_eject_coalesced_pkt(flow_id);
555 lck_mtx_unlock(&tcp_lro_lock);
556 if (mb) {
557 if (lrodebug)
558 printf("tcp_lro_process_pkt eject_flow, len = %d\n", mb->m_pkthdr.len);
559 lro_proto_input(mb);
560 }
561
562 lro_proto_input(lro_mb);
563 break;
564
565 case TCP_LRO_COLLISION:
566 lck_mtx_unlock(&tcp_lro_lock);
567 ret_response = TCP_LRO_FLOW_NOTFOUND;
568 break;
569
570 default:
571 lck_mtx_unlock(&tcp_lro_lock);
572 panic_plain("%s: unrecognized type %d", __func__, retval);
573 break;
574 }
575
576 if (ret_response == TCP_LRO_FLOW_NOTFOUND) {
577 lro_proto_input(lro_mb);
578 }
579 return NULL;
580}
581
582static void
583tcp_lro_timer_proc(void *arg1, void *arg2)
584{
585#pragma unused(arg1, arg2)
586
587 lck_mtx_lock_spin(&tcp_lro_lock);
588 lro_timer_set = 0;
589 lck_mtx_unlock(&tcp_lro_lock);
590 tcp_lro_flush_flows();
591}
592
/*
 * Walk the flow table and push to TCP input every coalesced chain that
 * has aged past coalesc_time or grown past coalesc_sz; re-arm the timer
 * for flows that are not yet due.  Also honors pending LRO_EJECT_REQ
 * flags.  Note the lock is dropped around each lro_proto_input() call
 * and reacquired afterwards.
 */
static void
tcp_lro_flush_flows(void)
{
	int i = 0;
	struct mbuf *mb;
	struct lro_flow *flow;
	int active_flows = 0;
	int outstanding_flows = 0;
	int tcpclock_updated = 0;

	lck_mtx_lock(&tcp_lro_lock);

	while (i < TCP_LRO_NUM_FLOWS) {
		flow = &lro_flow_list[i];
		if (flow->lr_mhead != NULL) {
			active_flows++;
			/* Refresh tcp_now once per flush pass. */
			if (!tcpclock_updated) {
				calculate_tcp_clock();
				tcpclock_updated = 1;
			}
			if (((tcp_now - flow->lr_timestamp) >= coalesc_time) ||
				(flow->lr_mhead->m_pkthdr.lro_npkts >=
				coalesc_sz)) {

				if (lrodebug >= 2)
					printf("tcp_lro_flush_flows: len =%d n_pkts = %d %d %d \n",
						flow->lr_len,
						flow->lr_mhead->m_pkthdr.lro_npkts,
						flow->lr_timestamp, tcp_now);

				mb = tcp_lro_eject_flow(i);

				if (mb) {
					/* Deliver without holding the lock. */
					lck_mtx_unlock(&tcp_lro_lock);
					lro_update_flush_stats(mb);
					lro_proto_input(mb);
					lck_mtx_lock(&tcp_lro_lock);
				}

			} else {
				/* Not due yet: keep the timer armed. */
				tcp_lro_sched_timer(0);
				outstanding_flows++;
				if (lrodebug >= 2) {
					printf("tcp_lro_flush_flows: did not flush flow of len =%d deadline = %x timestamp = %x \n",
						flow->lr_len, tcp_now, flow->lr_timestamp);
				}
			}
		}
		/* Honor eject requests even for flows flushed above
		 * (tcp_lro_eject_flow zeroes lr_flags, so no double work). */
		if (flow->lr_flags & LRO_EJECT_REQ) {
			mb = tcp_lro_eject_flow(i);
			if (mb) {
				lck_mtx_unlock(&tcp_lro_lock);
				lro_proto_input(mb);
				lro_eject_req++;
				lck_mtx_lock(&tcp_lro_lock);
			}
		}
		i++;
	}
	lck_mtx_unlock(&tcp_lro_lock);
#if 0
	if (lrocount == 900) {
		printf("%s: %d %d %d %d oo: %d mismatch: %d ej_req: %d coll: %d \n",
			__func__,
			tcpstat.tcps_coalesced_pack,
			tcpstat.tcps_lro_twopack,
			tcpstat.tcps_lro_multpack,
			tcpstat.tcps_lro_largepack,
			lro_seq_outoforder,
			lro_seq_mismatch,
			lro_eject_req,
			tcpstat.tcps_flowtbl_collision);
		printf("%s: all: %d single: %d double: %d good: %d \n",
			__func__, lro_flushes, lro_single_flushes,
			lro_double_flushes, lro_good_flushes);
		lrocount = 0;
	} else {
		lrocount++;
	}
	if ((lrodebug >= 2) && (active_flows > 1)) {
		printf("lro_flush_flows: active_flows = %d \n", active_flows);
	}
#endif
}
677
678/*
679 * Must be called with tcp_lro_lock held.
680 * The hint is non-zero for longer waits. The wait time dictated by coalesc_time
681 * takes precedence, so lro_timer_set is not set for the hint case
682 */
683static void
684tcp_lro_sched_timer(uint64_t hint)
685{
686 if (lro_timer_set) {
687 return;
688 }
689
690 lro_timer_set = 1;
691 if (!hint) {
692 /* the intent is to wake up every coalesc_time msecs */
693 clock_interval_to_deadline(coalesc_time,
694 (NSEC_PER_SEC / TCP_RETRANSHZ), &lro_deadline);
695 } else {
696 clock_interval_to_deadline(hint, NSEC_PER_SEC / TCP_RETRANSHZ,
697 &lro_deadline);
698 }
699 thread_call_enter_delayed(tcp_lro_timer, lro_deadline);
700}
701
702struct mbuf*
703tcp_lro(struct mbuf *m, unsigned int hlen)
704{
705 struct ip *ip_hdr;
706 unsigned int tlen;
707 struct tcphdr * tcp_hdr = NULL;
708 unsigned int off = 0;
709
710 if (kipf_count != 0)
711 return m;
712
713 /*
714 * Experiments on cellular show that the RTT is much higher
715 * than the coalescing time of 5 msecs, causing lro to flush
716 * 80% of the time on a single packet. Increasing
717 * coalescing time for cellular does not show marked
718 * improvement to throughput either. Loopback perf is hurt
719 * by the 5 msec latency and it already sends large packets.
720 */
721 if ((m->m_pkthdr.rcvif->if_type == IFT_CELLULAR) ||
722 (m->m_pkthdr.rcvif->if_type == IFT_LOOP)) {
723 return m;
724 }
725
726 ip_hdr = mtod(m, struct ip*);
727
728 /* only TCP is coalesced */
729 if (ip_hdr->ip_p != IPPROTO_TCP) {
730 return m;
731 }
732
733 if (m->m_len < (int32_t) sizeof (struct tcpiphdr)) {
734 if (lrodebug) printf("tcp_lro m_pullup \n");
735 if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) {
736 tcpstat.tcps_rcvshort++;
737 if (lrodebug) {
738 printf("ip_lro: rcvshort.\n");
739 }
740 return NULL;
741 }
742 }
743
744 tcp_hdr = (struct tcphdr *)((caddr_t)ip_hdr + hlen);
745 tlen = ip_hdr->ip_len ; //ignore IP header bytes len
746 m->m_pkthdr.lro_pktlen = tlen; /* Used to return max pkt encountered to tcp */
747 m->m_pkthdr.lro_npkts = 1; /* Initialize a counter to hold num pkts coalesced */
748 off = tcp_hdr->th_off << 2;
749 if (off < sizeof (struct tcphdr) || off > tlen) {
750 tcpstat.tcps_rcvbadoff++;
751 if (lrodebug) {
752 printf("ip_lro: TCP off greater than TCP header.\n");
753 }
754 return m;
755 }
756
757 return (tcp_lro_process_pkt(m, ip_hdr, tcp_hdr, hlen + off));
758}
759
760static void
761lro_proto_input(struct mbuf *m)
762{
763 struct ip* ip_hdr = mtod(m, struct ip*);
764
765 if (lrodebug >= 3) {
766 printf("lro_proto_input: ip_len = %d \n",
767 ip_hdr->ip_len);
768 }
769 lro_update_stats(m);
770 ip_proto_dispatch_in_wrapper(m, ip_hdr->ip_hl << 2, ip_hdr->ip_p);
771}
772
/*
 * Validate the TCP checksum of a segment, using hardware-provided
 * partial sums when available (CSUM_DATA_VALID / CSUM_TCP_SUM16 /
 * CSUM_PSEUDO_HDR) and falling back to a full software in_cksum().
 *
 * The ih_x1 pad bytes of the IP overlay are saved, zeroed for the
 * pseudo-header computation, and restored; ih_len is byte-swapped for
 * the computation and swapped back at the end so IP can re-inspect it.
 *
 * Returns the mbuf on success; frees it and returns NULL on a bad sum.
 */
static struct mbuf *
lro_tcp_xsum_validate(struct mbuf *m, struct ipovly *ipov, struct tcphdr * th)
{

	struct ip* ip = (struct ip*)ipov;
	int tlen = ip->ip_len;
	int len;
	struct ifnet *ifp = ((m->m_flags & M_PKTHDR) && m->m_pkthdr.rcvif != NULL) ?
		m->m_pkthdr.rcvif: NULL;

	/* Expect 32-bit aligned data pointer on strict-align platforms */
	MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);

	if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
		if (m->m_pkthdr.csum_flags & CSUM_TCP_SUM16) {
			/* Hardware summed the payload only; compute the
			 * pseudo-header sum and add it in. */
			u_short pseudo;
			char b[9];

			bcopy(ipov->ih_x1, b, sizeof (ipov->ih_x1));
			bzero(ipov->ih_x1, sizeof (ipov->ih_x1));
			ipov->ih_len = (u_short)tlen;
#if BYTE_ORDER != BIG_ENDIAN
			HTONS(ipov->ih_len);
#endif
			pseudo = in_cksum(m, sizeof (struct ip));
			bcopy(b, ipov->ih_x1, sizeof (ipov->ih_x1));

			th->th_sum = in_addword(pseudo, (m->m_pkthdr.csum_data & 0xFFFF));
		} else {
			/* Full or pseudo-header-inclusive hardware sum. */
			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
				th->th_sum = m->m_pkthdr.csum_data;
			else
				th->th_sum = in_pseudo(ip->ip_src.s_addr,
					ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data +
					ip->ip_len + IPPROTO_TCP));
		}
		/* A valid checksum yields 0xffff; flip so 0 means good. */
		th->th_sum ^= 0xffff;
	} else {
		char b[9];
		/*
		 * Checksum extended TCP header and data.
		 */
		bcopy(ipov->ih_x1, b, sizeof (ipov->ih_x1));
		bzero(ipov->ih_x1, sizeof (ipov->ih_x1));
		ipov->ih_len = (u_short)tlen;
#if BYTE_ORDER != BIG_ENDIAN
		HTONS(ipov->ih_len);
#endif
		len = sizeof (struct ip) + tlen;
		th->th_sum = in_cksum(m, len);
		bcopy(b, ipov->ih_x1, sizeof (ipov->ih_x1));

		tcp_in_cksum_stats(len);
	}
	if (th->th_sum) {
		/* Non-zero residue: checksum failed — count, log, drop. */
		tcpstat.tcps_rcvbadsum++;
		if (ifp != NULL && ifp->if_tcp_stat != NULL) {
			atomic_add_64(&ifp->if_tcp_stat->badformat, 1);
		}
		if (lrodebug)
			printf("lro_tcp_xsum_validate: bad xsum and drop m = %p.\n",m);
		m_freem(m);
		return NULL;
	}
	/* revert back the order as IP will look into this again. */
#if BYTE_ORDER != BIG_ENDIAN
	NTOHS(ipov->ih_len);
#endif
	return m;
}
843
844/*
845 * When TCP detects a stable, steady flow without out of ordering,
846 * with a sufficiently high cwnd, it invokes LRO.
847 */
/*
 * Called by TCP when it detects a stable, steady flow without out of
 * ordering and a sufficiently high cwnd: register the connection for
 * LRO.  If the flow already exists, refresh its expected sequence
 * number and clear any pending eject request; otherwise insert it.
 *
 * Note: th_seq/th_ack arrive in host byte order here (unlike the
 * receive path), hence the HTONL before insertion and the NTOHL
 * restore afterwards — tcp_lro_insert_flow mutates nothing else.
 * Always returns 0.
 */
int
tcp_start_coalescing(struct ip *ip_hdr, struct tcphdr *tcp_hdr, int tlen)
{
	int hash;
	int flow_id;
	struct mbuf *eject_mb;
	struct lro_flow *lf;

	hash = LRO_HASH(ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr,
		tcp_hdr->th_sport, tcp_hdr->th_dport,
		(TCP_LRO_FLOW_MAP - 1));


	lck_mtx_lock_spin(&tcp_lro_lock);
	flow_id = lro_flow_map[hash];
	if (flow_id != TCP_LRO_FLOW_NOTFOUND) {
		/* Flow already registered: resync lr_seq if idle and
		 * re-enable coalescing. */
		lf = &lro_flow_list[flow_id];
		if ((lf->lr_faddr.s_addr == ip_hdr->ip_src.s_addr) &&
		    (lf->lr_laddr.s_addr == ip_hdr->ip_dst.s_addr) &&
		    (lf->lr_fport == tcp_hdr->th_sport) &&
		    (lf->lr_lport == tcp_hdr->th_dport)) {
			if ((lf->lr_tcphdr == NULL) &&
			    (lf->lr_seq != (tcp_hdr->th_seq + tlen))) {
				lf->lr_seq = tcp_hdr->th_seq + tlen;
			}
			lf->lr_flags &= ~LRO_EJECT_REQ;
		}
		lck_mtx_unlock(&tcp_lro_lock);
		return 0;
	}

	HTONL(tcp_hdr->th_seq);
	HTONL(tcp_hdr->th_ack);
	eject_mb =
		tcp_lro_insert_flow(NULL, ip_hdr, tcp_hdr, tlen, 0, hash,
			NULL, NULL, NULL);

	lck_mtx_unlock(&tcp_lro_lock);

	NTOHL(tcp_hdr->th_seq);
	NTOHL(tcp_hdr->th_ack);
	if (lrodebug >= 3) {
		printf("%s: src = %x dst = %x sport = %d dport = %d seq %x \n",
			__func__, ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr,
			tcp_hdr->th_sport, tcp_hdr->th_dport, tcp_hdr->th_seq);
	}
	/* Inserting with a NULL mbuf can never force an ejection chain. */
	ASSERT(eject_mb == NULL);
	return 0;
}
897
898/*
899 * When TCP detects loss or idle condition, it stops offloading
900 * to LRO.
901 */
902int
903tcp_lro_remove_state(struct in_addr saddr, struct in_addr daddr,
904 unsigned short sport, unsigned short dport)
905{
906 int hash, flow_id;
907 struct lro_flow *lf;
908
909 hash = LRO_HASH(daddr.s_addr, saddr.s_addr, dport, sport,
910 (TCP_LRO_FLOW_MAP - 1));
911 lck_mtx_lock_spin(&tcp_lro_lock);
912 flow_id = lro_flow_map[hash];
913 if (flow_id == TCP_LRO_FLOW_UNINIT) {
914 lck_mtx_unlock(&tcp_lro_lock);
915 return 0;
916 }
917 lf = &lro_flow_list[flow_id];
918 if ((lf->lr_faddr.s_addr == daddr.s_addr) &&
919 (lf->lr_laddr.s_addr == saddr.s_addr) &&
920 (lf->lr_fport == dport) &&
921 (lf->lr_lport == sport)) {
922 if (lrodebug) {
923 printf("%s: %x %x\n", __func__,
924 lf->lr_flags, lf->lr_seq);
925 }
926 lf->lr_flags |= LRO_EJECT_REQ;
927 }
928 lck_mtx_unlock(&tcp_lro_lock);
929 return 0;
930}
931
932void
933tcp_update_lro_seq(__uint32_t rcv_nxt, struct in_addr saddr, struct in_addr daddr,
934 unsigned short sport, unsigned short dport)
935{
936 int hash, flow_id;
937 struct lro_flow *lf;
938
939 hash = LRO_HASH(daddr.s_addr, saddr.s_addr, dport, sport,
940 (TCP_LRO_FLOW_MAP - 1));
941 lck_mtx_lock_spin(&tcp_lro_lock);
942 flow_id = lro_flow_map[hash];
943 if (flow_id == TCP_LRO_FLOW_UNINIT) {
944 lck_mtx_unlock(&tcp_lro_lock);
945 return;
946 }
947 lf = &lro_flow_list[flow_id];
948 if ((lf->lr_faddr.s_addr == daddr.s_addr) &&
949 (lf->lr_laddr.s_addr == saddr.s_addr) &&
950 (lf->lr_fport == dport) &&
951 (lf->lr_lport == sport) &&
952 (lf->lr_tcphdr == NULL)) {
953 lf->lr_seq = (tcp_seq)rcv_nxt;
954 }
955 lck_mtx_unlock(&tcp_lro_lock);
956 return;
957}
958
959static void
960lro_update_stats(struct mbuf *m)
961{
962 switch(m->m_pkthdr.lro_npkts) {
963 case 0: /* fall through */
964 case 1:
965 break;
966
967 case 2:
968 tcpstat.tcps_lro_twopack++;
969 break;
970
971 case 3: /* fall through */
972 case 4:
973 tcpstat.tcps_lro_multpack++;
974 break;
975
976 default:
977 tcpstat.tcps_lro_largepack++;
978 break;
979 }
980 return;
981}
982
983static void
984lro_update_flush_stats(struct mbuf *m)
985{
986 lro_flushes++;
987 switch(m->m_pkthdr.lro_npkts) {
988 case 0: ASSERT(0);
989 case 1: lro_single_flushes++;
990 break;
991 case 2: lro_double_flushes++;
992 break;
993 default: lro_good_flushes++;
994 break;
995 }
996 return;
997}