]> git.saurik.com Git - apple/xnu.git/blame - bsd/netinet/tcp_lro.c
xnu-4903.241.1.tar.gz
[apple/xnu.git] / bsd / netinet / tcp_lro.c
CommitLineData
316670eb 1/*
39236c6e 2 * Copyright (c) 2011-2013 Apple Inc. All rights reserved.
316670eb
A
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <sys/param.h>
30#include <sys/systm.h>
31#include <sys/sysctl.h>
32#include <sys/mbuf.h>
33#include <sys/mcache.h>
34#include <sys/socket.h>
35#include <sys/socketvar.h>
36#include <net/if_types.h>
37#include <net/route.h>
38#include <netinet/in.h>
39#include <netinet/in_systm.h>
40#include <net/if.h>
39236c6e 41#include <net/dlil.h>
316670eb
A
42#include <netinet/ip.h>
43#include <netinet/ip_var.h>
44#include <netinet/in_var.h>
45#include <netinet/tcp.h>
46#include <netinet/tcp_seq.h>
47#include <netinet/tcpip.h>
48#include <netinet/tcp_var.h>
49#include <netinet/tcp_lro.h>
50#include <netinet/lro_ext.h>
51#include <kern/locks.h>
52
53unsigned int lrocount = 0; /* A counter used for debugging only */
54unsigned int lro_seq_outoforder = 0; /* Counter for debugging */
55unsigned int lro_seq_mismatch = 0; /* Counter for debugging */
316670eb
A
56unsigned int lro_flushes = 0; /* Counter for tracking number of flushes */
57unsigned int lro_single_flushes = 0;
58unsigned int lro_double_flushes = 0;
59unsigned int lro_good_flushes = 0;
60
61unsigned int coalesc_sz = LRO_MX_COALESCE_PKTS;
62SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro_sz, CTLFLAG_RW | CTLFLAG_LOCKED,
63 &coalesc_sz, 0, "Max coalescing size");
64
65unsigned int coalesc_time = LRO_MX_TIME_TO_BUFFER;
66SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro_time, CTLFLAG_RW | CTLFLAG_LOCKED,
67 &coalesc_time, 0, "Max coalescing time");
68
69struct lro_flow lro_flow_list[TCP_LRO_NUM_FLOWS];
70
71char lro_flow_map[TCP_LRO_FLOW_MAP];
72
73static lck_attr_t *tcp_lro_mtx_attr = NULL; /* mutex attributes */
74static lck_grp_t *tcp_lro_mtx_grp = NULL; /* mutex group */
75static lck_grp_attr_t *tcp_lro_mtx_grp_attr = NULL; /* mutex group attrs */
76decl_lck_mtx_data( ,tcp_lro_lock); /* Used to synchronize updates */
77
78unsigned int lro_byte_count = 0;
79
80uint64_t lro_deadline = 0; /* LRO's sense of time - protected by tcp_lro_lock */
81uint32_t lro_timer_set = 0;
82
83/* Some LRO stats */
84u_int32_t lro_pkt_count = 0; /* Number of packets encountered in an LRO period */
85thread_call_t tcp_lro_timer;
86
87extern u_int32_t kipf_count;
88
89static void tcp_lro_timer_proc(void*, void*);
90static void lro_update_stats(struct mbuf*);
91static void lro_update_flush_stats(struct mbuf *);
92static void tcp_lro_flush_flows(void);
93static void tcp_lro_sched_timer(uint64_t);
94static void lro_proto_input(struct mbuf *);
95
39236c6e 96static struct mbuf *lro_tcp_xsum_validate(struct mbuf*, struct ip *,
316670eb 97 struct tcphdr*);
39037602 98static struct mbuf *tcp_lro_process_pkt(struct mbuf*, int);
316670eb
A
99
100void
101tcp_lro_init(void)
102{
103 int i;
104
105 bzero(lro_flow_list, sizeof (struct lro_flow) * TCP_LRO_NUM_FLOWS);
106 for (i = 0; i < TCP_LRO_FLOW_MAP; i++) {
107 lro_flow_map[i] = TCP_LRO_FLOW_UNINIT;
108 }
109
110 /*
111 * allocate lock group attribute, group and attribute for tcp_lro_lock
112 */
113 tcp_lro_mtx_grp_attr = lck_grp_attr_alloc_init();
114 tcp_lro_mtx_grp = lck_grp_alloc_init("tcplro", tcp_lro_mtx_grp_attr);
115 tcp_lro_mtx_attr = lck_attr_alloc_init();
116 lck_mtx_init(&tcp_lro_lock, tcp_lro_mtx_grp, tcp_lro_mtx_attr);
117
118 tcp_lro_timer = thread_call_allocate(tcp_lro_timer_proc, NULL);
119 if (tcp_lro_timer == NULL) {
120 panic_plain("%s: unable to allocate lro timer", __func__);
121 }
122
123 return;
124}
125
126static int
127tcp_lro_matching_tuple(struct ip* ip_hdr, struct tcphdr *tcp_hdr, int *hash,
128 int *flow_id )
129{
130 struct lro_flow *flow;
131 tcp_seq seqnum;
132 unsigned int off = 0;
133 int payload_len = 0;
134
135 *hash = LRO_HASH(ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr,
136 tcp_hdr->th_sport, tcp_hdr->th_dport, (TCP_LRO_FLOW_MAP - 1));
137
138 *flow_id = lro_flow_map[*hash];
139 if (*flow_id == TCP_LRO_FLOW_NOTFOUND) {
140 return TCP_LRO_NAN;
141 }
142
143 seqnum = tcp_hdr->th_seq;
144 off = tcp_hdr->th_off << 2;
145 payload_len = ip_hdr->ip_len - off;
146
147 flow = &lro_flow_list[*flow_id];
148
149 if ((flow->lr_faddr.s_addr == ip_hdr->ip_src.s_addr) &&
150 (flow->lr_laddr.s_addr == ip_hdr->ip_dst.s_addr) &&
151 (flow->lr_fport == tcp_hdr->th_sport) &&
152 (flow->lr_lport == tcp_hdr->th_dport)) {
153 if (flow->lr_tcphdr == NULL) {
154 if (ntohl(seqnum) == flow->lr_seq) {
155 return TCP_LRO_COALESCE;
156 }
157 if (lrodebug >= 4) {
158 printf("%s: seqnum = %x, lr_seq = %x\n",
159 __func__, ntohl(seqnum), flow->lr_seq);
160 }
161 lro_seq_mismatch++;
162 if (SEQ_GT(ntohl(seqnum), flow->lr_seq)) {
163 lro_seq_outoforder++;
164 /*
165 * Whenever we receive out of order packets it
166 * signals loss and recovery and LRO doesn't
167 * let flows recover quickly. So eject.
168 */
169 flow->lr_flags |= LRO_EJECT_REQ;
170
171 }
172 return TCP_LRO_NAN;
173 }
174
175 if (flow->lr_flags & LRO_EJECT_REQ) {
176 if (lrodebug)
177 printf("%s: eject. \n", __func__);
178 return TCP_LRO_EJECT_FLOW;
179 }
180 if (SEQ_GT(tcp_hdr->th_ack, flow->lr_tcphdr->th_ack)) {
181 if (lrodebug) {
182 printf("%s: th_ack = %x flow_ack = %x \n",
183 __func__, tcp_hdr->th_ack,
184 flow->lr_tcphdr->th_ack);
185 }
186 return TCP_LRO_EJECT_FLOW;
187 }
188
189 if (ntohl(seqnum) == (ntohl(lro_flow_list[*flow_id].lr_tcphdr->th_seq) + lro_flow_list[*flow_id].lr_len)) {
190 return TCP_LRO_COALESCE;
191 } else {
192 /* LRO does not handle loss recovery well, eject */
193 flow->lr_flags |= LRO_EJECT_REQ;
194 return TCP_LRO_EJECT_FLOW;
195 }
196 }
197 if (lrodebug) printf("tcp_lro_matching_tuple: collision \n");
198 return TCP_LRO_COLLISION;
199}
200
201static void
202tcp_lro_init_flow(int flow_id, struct ip* ip_hdr, struct tcphdr *tcp_hdr,
203 int hash, u_int32_t timestamp, int payload_len)
204{
205 struct lro_flow *flow = NULL;
206
207 flow = &lro_flow_list[flow_id];
208
209 flow->lr_hash_map = hash;
210 flow->lr_faddr.s_addr = ip_hdr->ip_src.s_addr;
211 flow->lr_laddr.s_addr = ip_hdr->ip_dst.s_addr;
212 flow->lr_fport = tcp_hdr->th_sport;
213 flow->lr_lport = tcp_hdr->th_dport;
214 lro_flow_map[hash] = flow_id;
215 flow->lr_timestamp = timestamp;
216 flow->lr_seq = ntohl(tcp_hdr->th_seq) + payload_len;
217 flow->lr_flags = 0;
218 return;
219}
220
221static void
222tcp_lro_coalesce(int flow_id, struct mbuf *lro_mb, struct tcphdr *tcphdr,
223 int payload_len, int drop_hdrlen, struct tcpopt *topt,
224 u_int32_t* tsval, u_int32_t* tsecr, int thflags)
225{
226 struct lro_flow *flow = NULL;
227 struct mbuf *last;
228 struct ip *ip = NULL;
229
230 flow = &lro_flow_list[flow_id];
231 if (flow->lr_mhead) {
232 if (lrodebug)
233 printf("%s: lr_mhead %x %d \n", __func__, flow->lr_seq,
234 payload_len);
235 m_adj(lro_mb, drop_hdrlen);
236
237 last = flow->lr_mtail;
238 while (last->m_next != NULL) {
239 last = last->m_next;
240 }
241 last->m_next = lro_mb;
242
243 flow->lr_mtail = lro_mb;
244
245 ip = mtod(flow->lr_mhead, struct ip *);
246 ip->ip_len += lro_mb->m_pkthdr.len;
247 flow->lr_mhead->m_pkthdr.len += lro_mb->m_pkthdr.len;
248
249 if (flow->lr_len == 0) {
250 panic_plain("%s: Inconsistent LRO flow state", __func__);
251 }
252 flow->lr_len += payload_len;
253 flow->lr_seq += payload_len;
254 /*
255 * This bit is re-OR'd each time a packet is added to the
256 * large coalesced packet.
257 */
39236c6e 258 flow->lr_mhead->m_pkthdr.pkt_flags |= PKTF_SW_LRO_PKT;
316670eb
A
259 flow->lr_mhead->m_pkthdr.lro_npkts++; /* for tcpstat.tcps_rcvpack */
260 if (flow->lr_mhead->m_pkthdr.lro_pktlen <
261 lro_mb->m_pkthdr.lro_pktlen) {
262 /*
263 * For TCP Inter Arrival Jitter calculation, return max
264 * size encountered while coalescing a stream of pkts.
265 */
266 flow->lr_mhead->m_pkthdr.lro_pktlen =
267 lro_mb->m_pkthdr.lro_pktlen;
268 }
269 /* Update the timestamp value */
270 if (topt->to_flags & TOF_TS) {
271 if ((flow->lr_tsval) &&
272 (TSTMP_GT(topt->to_tsval, ntohl(*(flow->lr_tsval))))) {
273 *(flow->lr_tsval) = htonl(topt->to_tsval);
274 }
275 if ((flow->lr_tsecr) &&
276 (topt->to_tsecr != 0) &&
277 (TSTMP_GT(topt->to_tsecr, ntohl(*(flow->lr_tsecr))))) {
278 if (lrodebug >= 2) {
279 printf("%s: instantaneous RTT = %d \n", __func__,
280 topt->to_tsecr - ntohl(*(flow->lr_tsecr)));
281 }
282 *(flow->lr_tsecr) = htonl(topt->to_tsecr);
283 }
284 }
285 /* Coalesce the flags */
286 if (thflags) {
287 flow->lr_tcphdr->th_flags |= thflags;
288 }
289 /* Update receive window */
290 flow->lr_tcphdr->th_win = tcphdr->th_win;
291 } else {
292 if (lro_mb) {
293 flow->lr_mhead = flow->lr_mtail = lro_mb;
39236c6e 294 flow->lr_mhead->m_pkthdr.pkt_flags |= PKTF_SW_LRO_PKT;
316670eb
A
295 flow->lr_tcphdr = tcphdr;
296 if ((topt) && (topt->to_flags & TOF_TS)) {
297 ASSERT(tsval != NULL);
298 ASSERT(tsecr != NULL);
299 flow->lr_tsval = tsval;
300 flow->lr_tsecr = tsecr;
301 }
302 flow->lr_len = payload_len;
39236c6e 303 calculate_tcp_clock();
316670eb
A
304 flow->lr_timestamp = tcp_now;
305 tcp_lro_sched_timer(0);
306 }
307 flow->lr_seq = ntohl(tcphdr->th_seq) + payload_len;
308 }
309 if (lro_mb) {
310 tcpstat.tcps_coalesced_pack++;
311 }
312 return;
313}
314
315static struct mbuf *
316tcp_lro_eject_flow(int flow_id)
317{
318 struct mbuf *mb = NULL;
319
320 mb = lro_flow_list[flow_id].lr_mhead;
321 ASSERT(lro_flow_map[lro_flow_list[flow_id].lr_hash_map] == flow_id);
322 lro_flow_map[lro_flow_list[flow_id].lr_hash_map] = TCP_LRO_FLOW_UNINIT;
323 bzero(&lro_flow_list[flow_id], sizeof(struct lro_flow));
324
325 return mb;
326}
327
328static struct mbuf*
329tcp_lro_eject_coalesced_pkt(int flow_id)
330{
331 struct mbuf *mb = NULL;
332 mb = lro_flow_list[flow_id].lr_mhead;
333 lro_flow_list[flow_id].lr_mhead =
334 lro_flow_list[flow_id].lr_mtail = NULL;
335 lro_flow_list[flow_id].lr_tcphdr = NULL;
336 return mb;
337}
338
339static struct mbuf*
340tcp_lro_insert_flow(struct mbuf *lro_mb, struct ip *ip_hdr,
341 struct tcphdr *tcp_hdr, int payload_len,
342 int drop_hdrlen, int hash, struct tcpopt *topt,
343 u_int32_t *tsval, u_int32_t *tsecr)
344{
345 int i;
346 int slot_available = 0;
347 int candidate_flow = 0;
348 u_int32_t oldest_timestamp;
349 struct mbuf *mb = NULL;
350 int collision = 0;
351
352 oldest_timestamp = tcp_now;
353
354 /* handle collision */
355 if (lro_flow_map[hash] != TCP_LRO_FLOW_UNINIT) {
356 if (lrodebug) {
357 collision = 1;
358 }
359 candidate_flow = lro_flow_map[hash];
360 tcpstat.tcps_flowtbl_collision++;
361 goto kick_flow;
362 }
363
364 for (i = 0; i < TCP_LRO_NUM_FLOWS; i++) {
365 if (lro_flow_list[i].lr_mhead == NULL) {
366 candidate_flow = i;
367 slot_available = 1;
368 break;
369 }
370 if (oldest_timestamp >= lro_flow_list[i].lr_timestamp) {
371 candidate_flow = i;
372 oldest_timestamp = lro_flow_list[i].lr_timestamp;
373 }
374 }
375
376 if (!slot_available) {
377 tcpstat.tcps_flowtbl_full++;
378kick_flow:
379 /* kick the oldest flow */
380 mb = tcp_lro_eject_flow(candidate_flow);
381
382 if (lrodebug) {
383 if (!slot_available) {
384 printf("%s: slot unavailable.\n",__func__);
385 }
386 if (collision) {
387 printf("%s: collision.\n",__func__);
388 }
389 }
390 } else {
391 candidate_flow = i; /* this is now the flow to be used */
392
393 }
394
395 tcp_lro_init_flow(candidate_flow, ip_hdr, tcp_hdr, hash,
396 tcp_now, payload_len);
397 tcp_lro_coalesce(candidate_flow, lro_mb, tcp_hdr, payload_len,
398 drop_hdrlen, topt, tsval, tsecr, 0);
399 return mb;
400}
401
402struct mbuf*
39037602 403tcp_lro_process_pkt(struct mbuf *lro_mb, int drop_hdrlen)
316670eb
A
404{
405 int flow_id = TCP_LRO_FLOW_UNINIT;
406 int hash;
407 unsigned int off = 0;
408 int eject_flow = 0;
409 int optlen;
410 int retval = 0;
411 struct mbuf *mb = NULL;
412 int payload_len = 0;
413 u_char *optp = NULL;
414 int thflags = 0;
415 struct tcpopt to;
416 int ret_response = TCP_LRO_CONSUMED;
417 int coalesced = 0, tcpflags = 0, unknown_tcpopts = 0;
418 u_int8_t ecn;
39037602
A
419 struct ip *ip_hdr;
420 struct tcphdr *tcp_hdr;
316670eb 421
39037602
A
422 if (lro_mb->m_len < drop_hdrlen) {
423 if ((lro_mb = m_pullup(lro_mb, drop_hdrlen)) == NULL) {
316670eb
A
424 tcpstat.tcps_rcvshort++;
425 m_freem(lro_mb);
426 if (lrodebug) {
427 printf("tcp_lro_process_pkt:mbuf too short.\n");
428 }
39037602 429 return (NULL);
316670eb
A
430 }
431 }
39037602
A
432
433 ip_hdr = mtod(lro_mb, struct ip*);
434 tcp_hdr = (struct tcphdr *)((caddr_t)ip_hdr + sizeof(struct ip));
435
39236c6e
A
436 /* Just in case */
437 lro_mb->m_pkthdr.pkt_flags &= ~PKTF_SW_LRO_DID_CSUM;
438
439 if ((lro_mb = lro_tcp_xsum_validate(lro_mb, ip_hdr, tcp_hdr)) == NULL) {
316670eb
A
440 if (lrodebug) {
441 printf("tcp_lro_process_pkt: TCP xsum failed.\n");
442 }
39037602 443 return (NULL);
316670eb
A
444 }
445
446 /* Update stats */
447 lro_pkt_count++;
448
449 /* Avoids checksumming in tcp_input */
39236c6e
A
450 lro_mb->m_pkthdr.pkt_flags |= PKTF_SW_LRO_DID_CSUM;
451
316670eb
A
452 off = tcp_hdr->th_off << 2;
453 optlen = off - sizeof (struct tcphdr);
454 payload_len = ip_hdr->ip_len - off;
455 optp = (u_char *)(tcp_hdr + 1);
456 /*
457 * Do quick retrieval of timestamp options ("options
458 * prediction?"). If timestamp is the only option and it's
459 * formatted as recommended in RFC 1323 appendix A, we
460 * quickly get the values now and not bother calling
461 * tcp_dooptions(), etc.
462 */
5ba3f43e 463 bzero(&to, sizeof(to));
316670eb
A
464 if ((optlen == TCPOLEN_TSTAMP_APPA ||
465 (optlen > TCPOLEN_TSTAMP_APPA &&
466 optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
467 *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
468 (tcp_hdr->th_flags & TH_SYN) == 0) {
469 to.to_flags |= TOF_TS;
470 to.to_tsval = ntohl(*(u_int32_t *)(void *)(optp + 4));
471 to.to_tsecr = ntohl(*(u_int32_t *)(void *)(optp + 8));
472 } else {
473 /*
474 * If TCP timestamps are not in use, or not the first option,
475 * skip LRO path since timestamps are used to avoid LRO
476 * from introducing additional latencies for retransmissions
477 * and other slow-paced transmissions.
478 */
479 to.to_flags = to.to_tsecr = 0;
480 eject_flow = 1;
481 }
482
483 /* list all the conditions that can trigger a flow ejection here */
484
485 thflags = tcp_hdr->th_flags;
486 if (thflags & (TH_SYN | TH_URG | TH_ECE | TH_CWR | TH_PUSH | TH_RST | TH_FIN)) {
487 eject_flow = tcpflags = 1;
488 }
489
490 if (optlen && !((optlen == TCPOLEN_TSTAMP_APPA) &&
491 (to.to_flags & TOF_TS))) {
492 eject_flow = unknown_tcpopts = 1;
493 }
494
495 if (payload_len <= LRO_MIN_COALESC_SZ) { /* zero payload ACK */
496 eject_flow = 1;
497 }
498
499 /* Can't coalesce ECN marked packets. */
500 ecn = ip_hdr->ip_tos & IPTOS_ECN_MASK;
501 if (ecn == IPTOS_ECN_CE) {
502 /*
503 * ECN needs quick notification
504 */
505 if (lrodebug) {
506 printf("%s: ECE bits set.\n", __func__);
507 }
508 eject_flow = 1;
509 }
510
511 lck_mtx_lock_spin(&tcp_lro_lock);
512
513 retval = tcp_lro_matching_tuple(ip_hdr, tcp_hdr, &hash, &flow_id);
514
515 switch (retval) {
516 case TCP_LRO_NAN:
517 lck_mtx_unlock(&tcp_lro_lock);
518 ret_response = TCP_LRO_FLOW_NOTFOUND;
519 break;
520
521 case TCP_LRO_COALESCE:
522 if ((payload_len != 0) && (unknown_tcpopts == 0) &&
99c3a104 523 (tcpflags == 0) && (ecn != IPTOS_ECN_CE) && (to.to_flags & TOF_TS)) {
316670eb
A
524 tcp_lro_coalesce(flow_id, lro_mb, tcp_hdr, payload_len,
525 drop_hdrlen, &to,
526 (to.to_flags & TOF_TS) ? (u_int32_t *)(void *)(optp + 4) : NULL,
527 (to.to_flags & TOF_TS) ? (u_int32_t *)(void *)(optp + 8) : NULL,
528 thflags);
529 if (lrodebug >= 2) {
530 printf("tcp_lro_process_pkt: coalesce len = %d. flow_id = %d payload_len = %d drop_hdrlen = %d optlen = %d lport = %d seqnum = %x.\n",
531 lro_flow_list[flow_id].lr_len, flow_id,
532 payload_len, drop_hdrlen, optlen,
533 ntohs(lro_flow_list[flow_id].lr_lport),
534 ntohl(tcp_hdr->th_seq));
535 }
536 if (lro_flow_list[flow_id].lr_mhead->m_pkthdr.lro_npkts >= coalesc_sz) {
537 eject_flow = 1;
538 }
539 coalesced = 1;
540 }
541 if (eject_flow) {
542 mb = tcp_lro_eject_coalesced_pkt(flow_id);
543 lro_flow_list[flow_id].lr_seq = ntohl(tcp_hdr->th_seq) +
544 payload_len;
39236c6e
A
545 calculate_tcp_clock();
546 u_int8_t timestamp = tcp_now - lro_flow_list[flow_id].lr_timestamp;
316670eb
A
547 lck_mtx_unlock(&tcp_lro_lock);
548 if (mb) {
39236c6e 549 mb->m_pkthdr.lro_elapsed = timestamp;
316670eb
A
550 lro_proto_input(mb);
551 }
552 if (!coalesced) {
553 if (lrodebug >= 2) {
554 printf("%s: pkt payload_len = %d \n", __func__, payload_len);
555 }
556 lro_proto_input(lro_mb);
557 }
558 } else {
559 lck_mtx_unlock(&tcp_lro_lock);
560 }
561 break;
562
563 case TCP_LRO_EJECT_FLOW:
564 mb = tcp_lro_eject_coalesced_pkt(flow_id);
39236c6e
A
565 calculate_tcp_clock();
566 u_int8_t timestamp = tcp_now - lro_flow_list[flow_id].lr_timestamp;
316670eb
A
567 lck_mtx_unlock(&tcp_lro_lock);
568 if (mb) {
569 if (lrodebug)
570 printf("tcp_lro_process_pkt eject_flow, len = %d\n", mb->m_pkthdr.len);
39236c6e 571 mb->m_pkthdr.lro_elapsed = timestamp;
316670eb
A
572 lro_proto_input(mb);
573 }
574
575 lro_proto_input(lro_mb);
576 break;
577
578 case TCP_LRO_COLLISION:
579 lck_mtx_unlock(&tcp_lro_lock);
580 ret_response = TCP_LRO_FLOW_NOTFOUND;
581 break;
582
583 default:
584 lck_mtx_unlock(&tcp_lro_lock);
585 panic_plain("%s: unrecognized type %d", __func__, retval);
586 break;
587 }
588
589 if (ret_response == TCP_LRO_FLOW_NOTFOUND) {
590 lro_proto_input(lro_mb);
591 }
39037602 592 return (NULL);
316670eb
A
593}
594
595static void
596tcp_lro_timer_proc(void *arg1, void *arg2)
597{
598#pragma unused(arg1, arg2)
599
600 lck_mtx_lock_spin(&tcp_lro_lock);
601 lro_timer_set = 0;
602 lck_mtx_unlock(&tcp_lro_lock);
603 tcp_lro_flush_flows();
604}
605
606static void
607tcp_lro_flush_flows(void)
608{
609 int i = 0;
610 struct mbuf *mb;
611 struct lro_flow *flow;
316670eb
A
612 int tcpclock_updated = 0;
613
614 lck_mtx_lock(&tcp_lro_lock);
615
616 while (i < TCP_LRO_NUM_FLOWS) {
617 flow = &lro_flow_list[i];
618 if (flow->lr_mhead != NULL) {
39236c6e 619
316670eb
A
620 if (!tcpclock_updated) {
621 calculate_tcp_clock();
622 tcpclock_updated = 1;
623 }
316670eb 624
39236c6e
A
625 if (lrodebug >= 2)
626 printf("tcp_lro_flush_flows: len =%d n_pkts = %d %d %d \n",
316670eb
A
627 flow->lr_len,
628 flow->lr_mhead->m_pkthdr.lro_npkts,
629 flow->lr_timestamp, tcp_now);
630
39236c6e 631 u_int8_t timestamp = tcp_now - flow->lr_timestamp;
316670eb 632
316670eb 633 mb = tcp_lro_eject_flow(i);
39236c6e 634
316670eb 635 if (mb) {
39236c6e 636 mb->m_pkthdr.lro_elapsed = timestamp;
316670eb 637 lck_mtx_unlock(&tcp_lro_lock);
39236c6e 638 lro_update_flush_stats(mb);
316670eb 639 lro_proto_input(mb);
316670eb
A
640 lck_mtx_lock(&tcp_lro_lock);
641 }
642 }
643 i++;
644 }
645 lck_mtx_unlock(&tcp_lro_lock);
316670eb
A
646}
647
648/*
649 * Must be called with tcp_lro_lock held.
650 * The hint is non-zero for longer waits. The wait time dictated by coalesc_time
651 * takes precedence, so lro_timer_set is not set for the hint case
652 */
653static void
654tcp_lro_sched_timer(uint64_t hint)
655{
656 if (lro_timer_set) {
657 return;
658 }
659
660 lro_timer_set = 1;
661 if (!hint) {
662 /* the intent is to wake up every coalesc_time msecs */
663 clock_interval_to_deadline(coalesc_time,
664 (NSEC_PER_SEC / TCP_RETRANSHZ), &lro_deadline);
665 } else {
666 clock_interval_to_deadline(hint, NSEC_PER_SEC / TCP_RETRANSHZ,
667 &lro_deadline);
668 }
669 thread_call_enter_delayed(tcp_lro_timer, lro_deadline);
670}
671
672struct mbuf*
673tcp_lro(struct mbuf *m, unsigned int hlen)
674{
675 struct ip *ip_hdr;
676 unsigned int tlen;
677 struct tcphdr * tcp_hdr = NULL;
678 unsigned int off = 0;
679
680 if (kipf_count != 0)
39037602 681 return (m);
316670eb
A
682
683 /*
684 * Experiments on cellular show that the RTT is much higher
685 * than the coalescing time of 5 msecs, causing lro to flush
686 * 80% of the time on a single packet. Increasing
687 * coalescing time for cellular does not show marked
688 * improvement to throughput either. Loopback perf is hurt
689 * by the 5 msec latency and it already sends large packets.
690 */
39236c6e 691 if (IFNET_IS_CELLULAR(m->m_pkthdr.rcvif) ||
316670eb 692 (m->m_pkthdr.rcvif->if_type == IFT_LOOP)) {
39037602 693 return (m);
316670eb
A
694 }
695
696 ip_hdr = mtod(m, struct ip*);
697
39236c6e 698 /* don't deal with IP options */
39037602 699 if (hlen != sizeof (struct ip))
39236c6e
A
700 return (m);
701
316670eb
A
702 /* only TCP is coalesced */
703 if (ip_hdr->ip_p != IPPROTO_TCP) {
39037602 704 return (m);
316670eb
A
705 }
706
707 if (m->m_len < (int32_t) sizeof (struct tcpiphdr)) {
708 if (lrodebug) printf("tcp_lro m_pullup \n");
39037602 709 if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == NULL) {
316670eb
A
710 tcpstat.tcps_rcvshort++;
711 if (lrodebug) {
712 printf("ip_lro: rcvshort.\n");
713 }
39037602 714 return (NULL);
316670eb 715 }
39037602 716 ip_hdr = mtod(m, struct ip*);
316670eb
A
717 }
718
719 tcp_hdr = (struct tcphdr *)((caddr_t)ip_hdr + hlen);
720 tlen = ip_hdr->ip_len ; //ignore IP header bytes len
721 m->m_pkthdr.lro_pktlen = tlen; /* Used to return max pkt encountered to tcp */
722 m->m_pkthdr.lro_npkts = 1; /* Initialize a counter to hold num pkts coalesced */
39236c6e 723 m->m_pkthdr.lro_elapsed = 0; /* Initialize the field to carry elapsed time */
316670eb
A
724 off = tcp_hdr->th_off << 2;
725 if (off < sizeof (struct tcphdr) || off > tlen) {
726 tcpstat.tcps_rcvbadoff++;
727 if (lrodebug) {
728 printf("ip_lro: TCP off greater than TCP header.\n");
729 }
39037602 730 return (m);
316670eb
A
731 }
732
39037602 733 return (tcp_lro_process_pkt(m, hlen + off));
316670eb
A
734}
735
736static void
737lro_proto_input(struct mbuf *m)
738{
739 struct ip* ip_hdr = mtod(m, struct ip*);
740
741 if (lrodebug >= 3) {
742 printf("lro_proto_input: ip_len = %d \n",
743 ip_hdr->ip_len);
744 }
745 lro_update_stats(m);
746 ip_proto_dispatch_in_wrapper(m, ip_hdr->ip_hl << 2, ip_hdr->ip_p);
747}
748
749static struct mbuf *
39236c6e 750lro_tcp_xsum_validate(struct mbuf *m, struct ip *ip, struct tcphdr * th)
316670eb 751{
316670eb
A
752 /* Expect 32-bit aligned data pointer on strict-align platforms */
753 MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
754
39236c6e
A
755 /* we shouldn't get here for IP with options; hence sizeof (ip) */
756 if (tcp_input_checksum(AF_INET, m, th, sizeof (*ip), ip->ip_len)) {
757 if (lrodebug)
758 printf("%s: bad xsum and drop m = 0x%llx.\n", __func__,
759 (uint64_t)VM_KERNEL_ADDRPERM(m));
316670eb 760 m_freem(m);
39236c6e 761 return (NULL);
316670eb 762 }
39236c6e
A
763
764 return (m);
316670eb
A
765}
766
767/*
768 * When TCP detects a stable, steady flow without out of ordering,
769 * with a sufficiently high cwnd, it invokes LRO.
770 */
771int
772tcp_start_coalescing(struct ip *ip_hdr, struct tcphdr *tcp_hdr, int tlen)
773{
774 int hash;
775 int flow_id;
776 struct mbuf *eject_mb;
777 struct lro_flow *lf;
778
779 hash = LRO_HASH(ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr,
780 tcp_hdr->th_sport, tcp_hdr->th_dport,
781 (TCP_LRO_FLOW_MAP - 1));
782
783
784 lck_mtx_lock_spin(&tcp_lro_lock);
785 flow_id = lro_flow_map[hash];
786 if (flow_id != TCP_LRO_FLOW_NOTFOUND) {
787 lf = &lro_flow_list[flow_id];
788 if ((lf->lr_faddr.s_addr == ip_hdr->ip_src.s_addr) &&
789 (lf->lr_laddr.s_addr == ip_hdr->ip_dst.s_addr) &&
790 (lf->lr_fport == tcp_hdr->th_sport) &&
791 (lf->lr_lport == tcp_hdr->th_dport)) {
792 if ((lf->lr_tcphdr == NULL) &&
793 (lf->lr_seq != (tcp_hdr->th_seq + tlen))) {
794 lf->lr_seq = tcp_hdr->th_seq + tlen;
795 }
796 lf->lr_flags &= ~LRO_EJECT_REQ;
797 }
798 lck_mtx_unlock(&tcp_lro_lock);
799 return 0;
800 }
801
802 HTONL(tcp_hdr->th_seq);
803 HTONL(tcp_hdr->th_ack);
804 eject_mb =
805 tcp_lro_insert_flow(NULL, ip_hdr, tcp_hdr, tlen, 0, hash,
806 NULL, NULL, NULL);
807
808 lck_mtx_unlock(&tcp_lro_lock);
809
810 NTOHL(tcp_hdr->th_seq);
811 NTOHL(tcp_hdr->th_ack);
812 if (lrodebug >= 3) {
813 printf("%s: src = %x dst = %x sport = %d dport = %d seq %x \n",
814 __func__, ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr,
815 tcp_hdr->th_sport, tcp_hdr->th_dport, tcp_hdr->th_seq);
816 }
817 ASSERT(eject_mb == NULL);
818 return 0;
819}
820
821/*
822 * When TCP detects loss or idle condition, it stops offloading
823 * to LRO.
824 */
825int
826tcp_lro_remove_state(struct in_addr saddr, struct in_addr daddr,
827 unsigned short sport, unsigned short dport)
828{
829 int hash, flow_id;
830 struct lro_flow *lf;
831
832 hash = LRO_HASH(daddr.s_addr, saddr.s_addr, dport, sport,
833 (TCP_LRO_FLOW_MAP - 1));
834 lck_mtx_lock_spin(&tcp_lro_lock);
835 flow_id = lro_flow_map[hash];
836 if (flow_id == TCP_LRO_FLOW_UNINIT) {
837 lck_mtx_unlock(&tcp_lro_lock);
838 return 0;
839 }
840 lf = &lro_flow_list[flow_id];
841 if ((lf->lr_faddr.s_addr == daddr.s_addr) &&
842 (lf->lr_laddr.s_addr == saddr.s_addr) &&
843 (lf->lr_fport == dport) &&
844 (lf->lr_lport == sport)) {
845 if (lrodebug) {
846 printf("%s: %x %x\n", __func__,
847 lf->lr_flags, lf->lr_seq);
848 }
849 lf->lr_flags |= LRO_EJECT_REQ;
850 }
851 lck_mtx_unlock(&tcp_lro_lock);
852 return 0;
853}
854
855void
856tcp_update_lro_seq(__uint32_t rcv_nxt, struct in_addr saddr, struct in_addr daddr,
857 unsigned short sport, unsigned short dport)
858{
859 int hash, flow_id;
860 struct lro_flow *lf;
861
862 hash = LRO_HASH(daddr.s_addr, saddr.s_addr, dport, sport,
863 (TCP_LRO_FLOW_MAP - 1));
864 lck_mtx_lock_spin(&tcp_lro_lock);
865 flow_id = lro_flow_map[hash];
866 if (flow_id == TCP_LRO_FLOW_UNINIT) {
867 lck_mtx_unlock(&tcp_lro_lock);
868 return;
869 }
870 lf = &lro_flow_list[flow_id];
871 if ((lf->lr_faddr.s_addr == daddr.s_addr) &&
872 (lf->lr_laddr.s_addr == saddr.s_addr) &&
873 (lf->lr_fport == dport) &&
874 (lf->lr_lport == sport) &&
875 (lf->lr_tcphdr == NULL)) {
876 lf->lr_seq = (tcp_seq)rcv_nxt;
877 }
878 lck_mtx_unlock(&tcp_lro_lock);
879 return;
880}
881
882static void
883lro_update_stats(struct mbuf *m)
884{
885 switch(m->m_pkthdr.lro_npkts) {
886 case 0: /* fall through */
887 case 1:
888 break;
889
890 case 2:
891 tcpstat.tcps_lro_twopack++;
892 break;
893
894 case 3: /* fall through */
895 case 4:
896 tcpstat.tcps_lro_multpack++;
897 break;
898
899 default:
900 tcpstat.tcps_lro_largepack++;
901 break;
902 }
903 return;
904}
905
906static void
907lro_update_flush_stats(struct mbuf *m)
908{
909 lro_flushes++;
910 switch(m->m_pkthdr.lro_npkts) {
911 case 0: ASSERT(0);
912 case 1: lro_single_flushes++;
913 break;
914 case 2: lro_double_flushes++;
915 break;
916 default: lro_good_flushes++;
917 break;
918 }
919 return;
920}