]> git.saurik.com Git - apple/xnu.git/blob - bsd/netinet/tcp_output.c
b693e0512eccdda6f47f5e0711c2d228892b4685
[apple/xnu.git] / bsd / netinet / tcp_output.c
1 /*
2 * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
30 * The Regents of the University of California. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95
61 * $FreeBSD: src/sys/netinet/tcp_output.c,v 1.39.2.10 2001/07/07 04:30:38 silby Exp $
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #define _IP_VHL
71
72
73 #include <sys/param.h>
74 #include <sys/systm.h>
75 #include <sys/kernel.h>
76 #include <sys/sysctl.h>
77 #include <sys/mbuf.h>
78 #include <sys/domain.h>
79 #include <sys/protosw.h>
80 #include <sys/socket.h>
81 #include <sys/socketvar.h>
82
83 #include <net/route.h>
84 #include <net/ntstat.h>
85 #include <net/if_var.h>
86 #include <net/if.h>
87 #include <net/if_types.h>
88 #include <net/dlil.h>
89
90 #include <netinet/in.h>
91 #include <netinet/in_systm.h>
92 #include <netinet/in_var.h>
93 #include <netinet/ip.h>
94 #include <netinet/in_pcb.h>
95 #include <netinet/ip_var.h>
96 #include <mach/sdt.h>
97 #if INET6
98 #include <netinet6/in6_pcb.h>
99 #include <netinet/ip6.h>
100 #include <netinet6/ip6_var.h>
101 #endif
102 #include <netinet/tcp.h>
103 #define TCPOUTFLAGS
104 #include <netinet/tcp_fsm.h>
105 #include <netinet/tcp_seq.h>
106 #include <netinet/tcp_timer.h>
107 #include <netinet/tcp_var.h>
108 #include <netinet/tcpip.h>
109 #include <netinet/tcp_cc.h>
110 #if TCPDEBUG
111 #include <netinet/tcp_debug.h>
112 #endif
113 #include <sys/kdebug.h>
114 #include <mach/sdt.h>
115
116 #if IPSEC
117 #include <netinet6/ipsec.h>
118 #endif /*IPSEC*/
119
120 #if CONFIG_MACF_NET
121 #include <security/mac_framework.h>
122 #endif /* CONFIG_MACF_NET */
123
124 #include <netinet/lro_ext.h>
125 #if MPTCP
126 #include <netinet/mptcp_var.h>
127 #include <netinet/mptcp.h>
128 #include <netinet/mptcp_opt.h>
129 #endif
130
131 #define DBG_LAYER_BEG NETDBG_CODE(DBG_NETTCP, 1)
132 #define DBG_LAYER_END NETDBG_CODE(DBG_NETTCP, 3)
133 #define DBG_FNC_TCP_OUTPUT NETDBG_CODE(DBG_NETTCP, (4 << 8) | 1)
134
/*
 * TCP output tunables.
 *
 * Each variable below is registered with SYSCTL_INT under the _net_inet_tcp
 * parent, using OID_AUTO and CTLFLAG_RW | CTLFLAG_LOCKED.  The C initializer
 * is the value the variable starts with.
 *
 * NOTE(review): the integer argument that follows the variable pointer does
 * not always match the initializer (e.g. tcp_do_tso starts at 1 but passes
 * 0, tcp_packet_chaining starts at 50 but passes 0).  Presumably that
 * argument is unused when a pointer is supplied -- confirm against the
 * SYSCTL_INT definition.
 *
 * NOTE(review): several variables are declared uint32_t/int32_t yet are
 * registered with SYSCTL_INT; confirm the handler matches the declared type.
 *
 * NOTE(review): two description strings contain typos ("intial",
 * "bufffer").  They are runtime text and are intentionally left unchanged
 * by this documentation-only edit.
 */

/*
 * Read by tcp_output() inside its ROUTE_UNUSABLE handling: when zero,
 * TF_PMTUD is cleared on the connection.
 */
135 int path_mtu_discovery = 1;
136 SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery,
137 CTLFLAG_RW | CTLFLAG_LOCKED, &path_mtu_discovery, 1,
138 "Enable Path MTU Discovery");
139
140 int ss_fltsz = 1;
141 SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize,
142 CTLFLAG_RW | CTLFLAG_LOCKED,&ss_fltsz, 1,
143 "Slow start flight size");
144
145 int ss_fltsz_local = 8; /* starts with eight segments max */
146 SYSCTL_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize,
147 CTLFLAG_RW | CTLFLAG_LOCKED, &ss_fltsz_local, 1,
148 "Slow start flight size for local networks");
149
/*
 * One of several conditions tcp_output() requires before it sets tso = 1
 * for a send larger than t_maxseg.
 */
150 int tcp_do_tso = 1;
151 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW | CTLFLAG_LOCKED,
152 &tcp_do_tso, 0, "Enable TCP Segmentation Offload");
153
154 int tcp_ecn_outbound = 0;
155 SYSCTL_INT(_net_inet_tcp, OID_AUTO, ecn_initiate_out,
156 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_ecn_outbound, 0,
157 "Initiate ECN for outbound connections");
158
159 int tcp_ecn_inbound = 0;
160 SYSCTL_INT(_net_inet_tcp, OID_AUTO, ecn_negotiate_in,
161 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_ecn_inbound, 0,
162 "Allow ECN negotiation for inbound connections");
163
/*
 * NOTE(review): described as an enable flag but initialized to 50;
 * presumably it doubles as a chain-length threshold -- confirm at its
 * use site.
 */
164 int tcp_packet_chaining = 50;
165 SYSCTL_INT(_net_inet_tcp, OID_AUTO, packetchain,
166 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_packet_chaining, 0,
167 "Enable TCP output packet chaining");
168
169 int tcp_output_unlocked = 1;
170 SYSCTL_INT(_net_inet_tcp, OID_AUTO, socket_unlocked_on_output,
171 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_output_unlocked, 0,
172 "Unlock TCP when sending packets down to IP");
173
174 int tcp_do_rfc3390 = 1;
175 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390,
176 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_rfc3390, 1,
177 "Calculate intial slowstart cwnd depending on MSS");
178
/* Initial value comes from the MIN_IAJ_WIN macro, defined elsewhere. */
179 int tcp_min_iaj_win = MIN_IAJ_WIN;
180 SYSCTL_INT(_net_inet_tcp, OID_AUTO, min_iaj_win,
181 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_min_iaj_win, 1,
182 "Minimum recv win based on inter-packet arrival jitter");
183
/* Initial value comes from the ACC_IAJ_REACT_LIMIT macro, defined elsewhere. */
184 int tcp_acc_iaj_react_limit = ACC_IAJ_REACT_LIMIT;
185 SYSCTL_INT(_net_inet_tcp, OID_AUTO, acc_iaj_react_limit,
186 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_acc_iaj_react_limit, 1,
187 "Accumulated IAJ when receiver starts to react");
188
/*
 * tcp_output() attempts send-buffer auto-sizing only when this compares
 * equal to 1 (not merely non-zero).
 */
189 uint32_t tcp_do_autosendbuf = 1;
190 SYSCTL_INT(_net_inet_tcp, OID_AUTO, doautosndbuf,
191 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_autosendbuf, 1,
192 "Enable send socket buffer auto-tuning");
193
/*
 * Added to so_snd.sb_hiwat when tcp_output() grows the send buffer; twice
 * this value is subtracted from sb_hiwat when it computes the trim target.
 */
194 uint32_t tcp_autosndbuf_inc = 8 * 1024;
195 SYSCTL_INT(_net_inet_tcp, OID_AUTO, autosndbufinc,
196 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_autosndbuf_inc, 1,
197 "Increment in send socket bufffer size");
198
/* Upper bound on the size tcp_output() passes to sbreserve() when growing. */
199 uint32_t tcp_autosndbuf_max = 512 * 1024;
200 SYSCTL_INT(_net_inet_tcp, OID_AUTO, autosndbufmax,
201 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_autosndbuf_max, 1,
202 "Maximum send socket buffer size");
203
204 uint32_t tcp_prioritize_acks = 1;
205 SYSCTL_INT(_net_inet_tcp, OID_AUTO, ack_prioritize,
206 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_prioritize_acks, 1,
207 "Prioritize pure acks");
208
209 uint32_t tcp_use_rtt_recvbg = 1;
210 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rtt_recvbg,
211 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_use_rtt_recvbg, 1,
212 "Use RTT for bg recv algorithm");
213
214 uint32_t tcp_recv_throttle_minwin = 16 * 1024;
215 SYSCTL_INT(_net_inet_tcp, OID_AUTO, recv_throttle_minwin,
216 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_recv_throttle_minwin, 1,
217 "Minimum recv win for throttling");
218
219 int32_t tcp_enable_tlp = 1;
220 SYSCTL_INT(_net_inet_tcp, OID_AUTO, enable_tlp,
221 CTLFLAG_RW | CTLFLAG_LOCKED,
222 &tcp_enable_tlp, 1, "Enable Tail loss probe");
223
/*
 * File-local packet-chaining counters, all starting at zero.
 * packchain_sent is incremented in tcp_output() when a pending packet list
 * is detached from the tcpcb and handed to tcp_ip_output().  The other two
 * counters are not touched in the portion of the file reviewed here.
 */
224 static int32_t packchain_newlist = 0;
225 static int32_t packchain_looped = 0;
226 static int32_t packchain_sent = 0;
227
/*
 * Globals defined in other files and consulted by tcp_output().
 */
228 /* temporary: for testing */
/* When zero, tcp_output() calls ipsec_hdrsiz_tcp() to pre-compute ipsec_optlen. */
229 #if IPSEC
230 extern int ipsec_bypass;
231 #endif
232
233 extern int slowlink_wsize; /* window correction for slow links: caps sendwin when > 0 and TF_SLOWLINK is set */
234 #if IPFIREWALL
235 extern int fw_enable; /* firewall check for packet chaining; TSO requires fw_enable == 0 or fw_bypass set */
236 extern int fw_bypass; /* firewall check: disable packet chaining if there are rules */
237 #endif /* IPFIREWALL */
238
/* tcp_output() enables TSO only while both of the following counts are zero. */
239 extern u_int32_t dlil_filter_disable_tso_count;
240 extern u_int32_t kipf_count;
/* Not referenced in the portion of the file reviewed here. */
241 extern int tcp_recv_bg;
242
/*
 * Forward declarations of file-local helpers; their definitions are outside
 * the portion of the file reviewed here.
 *
 * tcp_ip_output()'s parameters are unnamed.  The call visible in
 * tcp_output() passes, in order: the socket, the tcpcb, the detached packet
 * list, the chain count (tp->t_lastchain), the inpcb's IPv4 options mbuf,
 * (so_options & SO_DONTROUTE), a SACK-retransmit-in-progress indicator,
 * 0, and the IPv6 flag (0 when INET6 is not configured).
 */
243 static int tcp_ip_output(struct socket *, struct tcpcb *, struct mbuf *, int,
244 struct mbuf *, int, int, int32_t, boolean_t);
245 static struct mbuf* tcp_send_lroacks(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th);
246 static int tcp_recv_throttle(struct tcpcb *tp);
247
248 /*
249 * Tcp output routine: figure out what should be sent and send it.
250 *
251 * Returns: 0 Success
252 * EADDRNOTAVAIL
253 * ENOBUFS
254 * EMSGSIZE
255 * EHOSTUNREACH
256 * ENETDOWN
257 * ip_output_list:ENOMEM
258 * ip_output_list:EADDRNOTAVAIL
259 * ip_output_list:ENETUNREACH
260 * ip_output_list:EHOSTUNREACH
261 * ip_output_list:EACCES
262 * ip_output_list:EMSGSIZE
263 * ip_output_list:ENOBUFS
264 * ip_output_list:??? [ignorable: mostly IPSEC/firewall/DLIL]
265 * ip6_output_list:EINVAL
266 * ip6_output_list:EOPNOTSUPP
267 * ip6_output_list:EHOSTUNREACH
268 * ip6_output_list:EADDRNOTAVAIL
269 * ip6_output_list:ENETUNREACH
270 * ip6_output_list:EMSGSIZE
271 * ip6_output_list:ENOBUFS
272 * ip6_output_list:??? [ignorable: mostly IPSEC/firewall/DLIL]
273 */
274 int
275 tcp_output(struct tcpcb *tp)
276 {
277 struct inpcb *inp = tp->t_inpcb;
278 struct socket *so = inp->inp_socket;
279 int32_t len, recwin, sendwin, off;
280 int flags, error;
281 struct mbuf *m;
282 struct ip *ip = NULL;
283 struct ipovly *ipov = NULL;
284 #if INET6
285 struct ip6_hdr *ip6 = NULL;
286 #endif /* INET6 */
287 struct tcphdr *th;
288 u_char opt[TCP_MAXOLEN];
289 unsigned ipoptlen, optlen, hdrlen;
290 int idle, sendalot, lost = 0;
291 int i, sack_rxmit;
292 int tso = 0;
293 int sack_bytes_rxmt;
294 struct sackhole *p;
295 #if IPSEC
296 unsigned ipsec_optlen = 0;
297 #endif /* IPSEC */
298 int idle_time = 0;
299 struct mbuf *packetlist = NULL;
300 struct mbuf *tp_inp_options = inp->inp_depend4.inp4_options;
301 #if INET6
302 int isipv6 = inp->inp_vflag & INP_IPV6 ;
303 #endif
304 short packchain_listadd = 0;
305 int so_options = so->so_options;
306 struct rtentry *rt;
307 u_int32_t basertt, svc_flags = 0, allocated_len;
308 u_int32_t lro_ackmore = (tp->t_lropktlen != 0) ? 1 : 0;
309 struct mbuf *mnext = NULL;
310 int sackoptlen = 0;
311 #if MPTCP
312 unsigned int *dlenp = NULL;
313 u_int8_t *finp = NULL;
314 u_int32_t *sseqp = NULL;
315 u_int64_t dss_val = 0;
316 boolean_t mptcp_acknow = FALSE;
317 boolean_t early_data_sent = FALSE;
318 #endif /* MPTCP */
319 boolean_t cell = FALSE;
320 boolean_t wifi = FALSE;
321 boolean_t wired = FALSE;
322
323 /*
324 * Determine length of data that should be transmitted,
325 * and flags that will be used.
326 * If there is some data or critical controls (SYN, RST)
327 * to send, then transmit; otherwise, investigate further.
328 */
329 idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
330
331 /* Since idle_time is a signed integer, the following integer subtraction
332 * will take care of wrap around of tcp_now
333 */
334 idle_time = tcp_now - tp->t_rcvtime;
335 if (idle && idle_time >= TCP_IDLETIMEOUT(tp)) {
336 if (CC_ALGO(tp)->after_idle != NULL)
337 CC_ALGO(tp)->after_idle(tp);
338 tcp_ccdbg_trace(tp, NULL, TCP_CC_IDLE_TIMEOUT);
339 }
340 tp->t_flags &= ~TF_LASTIDLE;
341 if (idle) {
342 if (tp->t_flags & TF_MORETOCOME) {
343 tp->t_flags |= TF_LASTIDLE;
344 idle = 0;
345 }
346 }
347 #if MPTCP
348 if (tp->t_mpflags & TMPF_RESET) {
349 tcp_check_timer_state(tp);
350 /*
351 * Once a RST has been sent for an MPTCP subflow,
352 * the subflow socket stays around until deleted.
353 * No packets such as FINs must be sent after RST.
354 */
355 return (0);
356 }
357 #endif /* MPTCP */
358
359 again:
360 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0);
361
362 #if INET6
363 if (isipv6) {
364 KERNEL_DEBUG(DBG_LAYER_BEG,
365 ((inp->inp_fport << 16) | inp->inp_lport),
366 (((inp->in6p_laddr.s6_addr16[0] & 0xffff) << 16) |
367 (inp->in6p_faddr.s6_addr16[0] & 0xffff)),
368 sendalot,0,0);
369 } else
370 #endif
371
372 {
373 KERNEL_DEBUG(DBG_LAYER_BEG,
374 ((inp->inp_fport << 16) | inp->inp_lport),
375 (((inp->inp_laddr.s_addr & 0xffff) << 16) |
376 (inp->inp_faddr.s_addr & 0xffff)),
377 sendalot,0,0);
378 }
379 /*
380 * If the route generation id changed, we need to check that our
381 * local (source) IP address is still valid. If it isn't either
382 * return error or silently do nothing (assuming the address will
383 * come back before the TCP connection times out).
384 */
385 rt = inp->inp_route.ro_rt;
386 if (rt != NULL && ROUTE_UNUSABLE(&tp->t_inpcb->inp_route)) {
387 struct ifnet *ifp;
388 struct in_ifaddr *ia = NULL;
389 struct in6_ifaddr *ia6 = NULL;
390 int found_srcaddr = 0;
391
392 /* disable multipages at the socket */
393 somultipages(so, FALSE);
394
395 /* Disable TSO for the socket until we know more */
396 tp->t_flags &= ~TF_TSO;
397
398 soif2kcl(so, FALSE);
399
400 if (isipv6) {
401 ia6 = ifa_foraddr6(&inp->in6p_laddr);
402 if (ia6 != NULL)
403 found_srcaddr = 1;
404 } else {
405 ia = ifa_foraddr(inp->inp_laddr.s_addr);
406 if (ia != NULL)
407 found_srcaddr = 1;
408 }
409
410 /* check that the source address is still valid */
411 if (found_srcaddr == 0) {
412 soevent(so,
413 (SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR));
414
415 if (tp->t_state >= TCPS_CLOSE_WAIT) {
416 tcp_drop(tp, EADDRNOTAVAIL);
417 return(EADDRNOTAVAIL);
418 }
419
420 /* Set retransmit timer if it wasn't set,
421 * reset Persist timer and shift register as the
422 * advertised peer window may not be valid anymore
423 */
424
425 if (!tp->t_timer[TCPT_REXMT]) {
426 tp->t_timer[TCPT_REXMT] =
427 OFFSET_FROM_START(tp, tp->t_rxtcur);
428 if (tp->t_timer[TCPT_PERSIST]) {
429 tp->t_timer[TCPT_PERSIST] = 0;
430 tp->t_rxtshift = 0;
431 tp->t_persist_stop = 0;
432 tp->t_rxtstart = 0;
433 }
434 }
435
436 if (tp->t_pktlist_head != NULL)
437 m_freem_list(tp->t_pktlist_head);
438 TCP_PKTLIST_CLEAR(tp);
439
440 /* drop connection if source address isn't available */
441 if (so->so_flags & SOF_NOADDRAVAIL) {
442 tcp_drop(tp, EADDRNOTAVAIL);
443 return(EADDRNOTAVAIL);
444 } else {
445 tcp_check_timer_state(tp);
446 return(0); /* silently ignore, keep data in socket: address may be back */
447 }
448 }
449 if (ia != NULL)
450 IFA_REMREF(&ia->ia_ifa);
451
452 if (ia6 != NULL)
453 IFA_REMREF(&ia6->ia_ifa);
454
455 /*
456 * Address is still valid; check for multipages capability
457 * again in case the outgoing interface has changed.
458 */
459 RT_LOCK(rt);
460 if ((ifp = rt->rt_ifp) != NULL) {
461 somultipages(so, (ifp->if_hwassist & IFNET_MULTIPAGES));
462 tcp_set_tso(tp, ifp);
463 soif2kcl(so,
464 (ifp->if_eflags & IFEF_2KCL));
465 }
466 if (rt->rt_flags & RTF_UP)
467 RT_GENID_SYNC(rt);
468 /*
469 * See if we should do MTU discovery. Don't do it if:
470 * 1) it is disabled via the sysctl
471 * 2) the route isn't up
472 * 3) the MTU is locked (if it is, then discovery
473 * has been disabled)
474 */
475
476 if (!path_mtu_discovery || ((rt != NULL) &&
477 (!(rt->rt_flags & RTF_UP) ||
478 (rt->rt_rmx.rmx_locks & RTV_MTU))))
479 tp->t_flags &= ~TF_PMTUD;
480 else
481 tp->t_flags |= TF_PMTUD;
482
483 RT_UNLOCK(rt);
484 }
485
486 if (rt != NULL) {
487 cell = IFNET_IS_CELLULAR(rt->rt_ifp);
488 wifi = (!cell && IFNET_IS_WIFI(rt->rt_ifp));
489 wired = (!wifi && IFNET_IS_WIRED(rt->rt_ifp));
490 }
491
492 /*
493 * If we've recently taken a timeout, snd_max will be greater than
494 * snd_nxt. There may be SACK information that allows us to avoid
495 * resending already delivered data. Adjust snd_nxt accordingly.
496 */
497 if (SACK_ENABLED(tp) && SEQ_LT(tp->snd_nxt, tp->snd_max))
498 tcp_sack_adjust(tp);
499 sendalot = 0;
500 off = tp->snd_nxt - tp->snd_una;
501 sendwin = min(tp->snd_wnd, tp->snd_cwnd);
502
503 if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0)
504 sendwin = min(sendwin, slowlink_wsize);
505
506 flags = tcp_outflags[tp->t_state];
507 /*
508 * Send any SACK-generated retransmissions. If we're explicitly
509 * trying to send out new data (when sendalot is 1), bypass this
510 * function. If we retransmit in fast recovery mode, decrement
511 * snd_cwnd, since we're replacing a (future) new transmission
512 * with a retransmission now, and we previously incremented
513 * snd_cwnd in tcp_input().
514 */
515 /*
516 * Still in sack recovery , reset rxmit flag to zero.
517 */
518 sack_rxmit = 0;
519 sack_bytes_rxmt = 0;
520 len = 0;
521 p = NULL;
522 if (SACK_ENABLED(tp) && IN_FASTRECOVERY(tp) &&
523 (p = tcp_sack_output(tp, &sack_bytes_rxmt))) {
524 int32_t cwin;
525
526 cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
527 if (cwin < 0)
528 cwin = 0;
529 /* Do not retransmit SACK segments beyond snd_recover */
530 if (SEQ_GT(p->end, tp->snd_recover)) {
531 /*
532 * (At least) part of sack hole extends beyond
533 * snd_recover. Check to see if we can rexmit data
534 * for this hole.
535 */
536 if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
537 /*
538 * Can't rexmit any more data for this hole.
539 * That data will be rexmitted in the next
540 * sack recovery episode, when snd_recover
541 * moves past p->rxmit.
542 */
543 p = NULL;
544 goto after_sack_rexmit;
545 } else
546 /* Can rexmit part of the current hole */
547 len = ((int32_t)min(cwin,
548 tp->snd_recover - p->rxmit));
549 } else {
550 len = ((int32_t)min(cwin, p->end - p->rxmit));
551 }
552 if (len > 0) {
553 off = p->rxmit - tp->snd_una;
554 sack_rxmit = 1;
555 sendalot = 1;
556 tcpstat.tcps_sack_rexmits++;
557 tcpstat.tcps_sack_rexmit_bytes +=
558 min(len, tp->t_maxseg);
559 if (nstat_collect) {
560 nstat_route_tx(inp->inp_route.ro_rt, 1,
561 min(len, tp->t_maxseg),
562 NSTAT_TX_FLAG_RETRANSMIT);
563 INP_ADD_STAT(inp, cell, wifi, wired,
564 txpackets, 1);
565 INP_ADD_STAT(inp, cell, wifi, wired,
566 txbytes, min(len, tp->t_maxseg));
567 tp->t_stat.txretransmitbytes += min(len, tp->t_maxseg);
568 }
569 } else {
570 len = 0;
571 }
572 }
573 after_sack_rexmit:
574 /*
575 * Get standard flags, and add SYN or FIN if requested by 'hidden'
576 * state flags.
577 */
578 if (tp->t_flags & TF_NEEDFIN)
579 flags |= TH_FIN;
580 if (tp->t_flags & TF_NEEDSYN)
581 flags |= TH_SYN;
582
583 /*
584 * If in persist timeout with window of 0, send 1 byte.
585 * Otherwise, if window is small but nonzero
586 * and timer expired, we will send what we can
587 * and go to transmit state.
588 */
589 if (tp->t_flagsext & TF_FORCE) {
590 if (sendwin == 0) {
591 /*
592 * If we still have some data to send, then
593 * clear the FIN bit. Usually this would
594 * happen below when it realizes that we
595 * aren't sending all the data. However,
596 * if we have exactly 1 byte of unsent data,
597 * then it won't clear the FIN bit below,
598 * and if we are in persist state, we wind
599 * up sending the packet without recording
600 * that we sent the FIN bit.
601 *
602 * We can't just blindly clear the FIN bit,
603 * because if we don't have any more data
604 * to send then the probe will be the FIN
605 * itself.
606 */
607 if (off < so->so_snd.sb_cc)
608 flags &= ~TH_FIN;
609 sendwin = 1;
610 } else {
611 tp->t_timer[TCPT_PERSIST] = 0;
612 tp->t_rxtshift = 0;
613 tp->t_rxtstart = 0;
614 tp->t_persist_stop = 0;
615 }
616 }
617
618 /*
619 * If snd_nxt == snd_max and we have transmitted a FIN, the
620 * offset will be > 0 even if so_snd.sb_cc is 0, resulting in
621 * a negative length. This can also occur when TCP opens up
622 * its congestion window while receiving additional duplicate
623 * acks after fast-retransmit because TCP will reset snd_nxt
624 * to snd_max after the fast-retransmit.
625 *
626 * In the normal retransmit-FIN-only case, however, snd_nxt will
627 * be set to snd_una, the offset will be 0, and the length may
628 * wind up 0.
629 *
630 * If sack_rxmit is true we are retransmitting from the scoreboard
631 * in which case len is already set.
632 */
633 if (sack_rxmit == 0) {
634 if (sack_bytes_rxmt == 0)
635 len = min(so->so_snd.sb_cc, sendwin) - off;
636 else {
637 int32_t cwin;
638
639 /*
640 * We are inside of a SACK recovery episode and are
641 * sending new data, having retransmitted all the
642 * data possible in the scoreboard.
643 */
644 len = min(so->so_snd.sb_cc, tp->snd_wnd)
645 - off;
646 /*
647 * Don't remove this (len > 0) check !
648 * We explicitly check for len > 0 here (although it
649 * isn't really necessary), to work around a gcc
650 * optimization issue - to force gcc to compute
651 * len above. Without this check, the computation
652 * of len is bungled by the optimizer.
653 */
654 if (len > 0) {
655 cwin = tp->snd_cwnd -
656 (tp->snd_nxt - tp->sack_newdata) -
657 sack_bytes_rxmt;
658 if (cwin < 0)
659 cwin = 0;
660 len = imin(len, cwin);
661 }
662 else
663 len = 0;
664 }
665 }
666
667 #if MPTCP
668 if ((tp->t_mpflags & TMPF_FASTJOIN_SEND) &&
669 (tp->t_state == TCPS_SYN_SENT) &&
670 (!(tp->t_flags & TF_CLOSING)) &&
671 (so->so_snd.sb_cc != 0) &&
672 (tp->t_rxtshift == 0)) {
673 flags &= ~TH_SYN;
674 flags |= TH_ACK;
675 off = 0;
676 len = min(so->so_snd.sb_cc, tp->t_maxseg);
677 early_data_sent = TRUE;
678 } else if (early_data_sent) {
679 /* for now, we allow only one data segment to be sent */
680 return (0);
681 }
682 #endif /* MPTCP */
683 /*
684 * Lop off SYN bit if it has already been sent. However, if this
685 * is SYN-SENT state and if segment contains data and if we don't
686 * know that foreign host supports TAO, suppress sending segment.
687 */
688 if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
689 if (tp->t_state != TCPS_SYN_RECEIVED)
690 flags &= ~TH_SYN;
691 off--, len++;
692 if (len > 0 && tp->t_state == TCPS_SYN_SENT) {
693 while (inp->inp_sndinprog_cnt == 0 &&
694 tp->t_pktlist_head != NULL) {
695 packetlist = tp->t_pktlist_head;
696 packchain_listadd = tp->t_lastchain;
697 packchain_sent++;
698 TCP_PKTLIST_CLEAR(tp);
699
700 error = tcp_ip_output(so, tp, packetlist,
701 packchain_listadd, tp_inp_options,
702 (so_options & SO_DONTROUTE),
703 (sack_rxmit | (sack_bytes_rxmt != 0)), 0,
704 #if INET6
705 isipv6);
706 #else /* INET6 */
707 0);
708 #endif /* !INET6 */
709
710
711 }
712
713 /*
714 * tcp was closed while we were in ip,
715 * resume close
716 */
717 if (inp->inp_sndinprog_cnt == 0 &&
718 (tp->t_flags & TF_CLOSING)) {
719 tp->t_flags &= ~TF_CLOSING;
720 (void) tcp_close(tp);
721 } else {
722 tcp_check_timer_state(tp);
723 }
724 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END,
725 0,0,0,0,0);
726 return(0);
727 }
728 }
729
730 /*
731 * Be careful not to send data and/or FIN on SYN segments.
732 * This measure is needed to prevent interoperability problems
733 * with not fully conformant TCP implementations.
734 */
735 if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
736 len = 0;
737 flags &= ~TH_FIN;
738 }
739
740 /*
741 * The check here used to be (len < 0). Sometimes len is zero
742 * when the congestion window is closed and we need to check
743 * if persist timer has to be set in that case. But don't set
744 * persist until connection is established.
745 */
746 if (len <= 0 && !(flags & TH_SYN)) {
747 /*
748 * If FIN has been sent but not acked,
749 * but we haven't been called to retransmit,
750 * len will be < 0. Otherwise, window shrank
751 * after we sent into it. If window shrank to 0,
752 * cancel pending retransmit, pull snd_nxt back
753 * to (closed) window, and set the persist timer
754 * if it isn't already going. If the window didn't
755 * close completely, just wait for an ACK.
756 */
757 len = 0;
758 if (sendwin == 0) {
759 tp->t_timer[TCPT_REXMT] = 0;
760 tp->t_timer[TCPT_PTO] = 0;
761 tp->t_rxtshift = 0;
762 tp->t_rxtstart = 0;
763 tp->snd_nxt = tp->snd_una;
764 off = 0;
765 if (tp->t_timer[TCPT_PERSIST] == 0)
766 tcp_setpersist(tp);
767 }
768 }
769
770 /*
771 * Automatic sizing of send socket buffer. Increase the send
772 * socket buffer size if all of the following criteria are met
773 * 1. the receiver has enough buffer space for this data
774 * 2. send buffer is filled to 7/8th with data (so we actually
775 * have data to make use of it);
776 * 3. our send window (slow start and congestion controlled) is
777 * larger than sent but unacknowledged data in send buffer.
778 */
779 basertt = get_base_rtt(tp);
780 if (tcp_do_autosendbuf == 1 &&
781 !INP_WAIT_FOR_IF_FEEDBACK(inp) && !IN_FASTRECOVERY(tp) &&
782 (so->so_snd.sb_flags & (SB_AUTOSIZE | SB_TRIM)) == SB_AUTOSIZE &&
783 tcp_cansbgrow(&so->so_snd)) {
784 if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
785 so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) &&
786 sendwin >= (so->so_snd.sb_cc -
787 (tp->snd_nxt - tp->snd_una))) {
788 /* Also increase the send buffer only if the
789 * round-trip time is not increasing because we do
790 * not want to contribute to latency by filling
791 * buffers.
792 * We also do not want to hold onto application's
793 * old data for too long. Interactive applications
794 * would rather discard old data.
795 */
796 if (tp->t_rttcur <= (basertt + 25)) {
797 if (sbreserve(&so->so_snd,
798 min(so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
799 tcp_autosndbuf_max)) == 1) {
800 so->so_snd.sb_idealsize = so->so_snd.sb_hiwat;
801 }
802 } else {
803 so->so_snd.sb_idealsize =
804 max(tcp_sendspace, so->so_snd.sb_hiwat -
805 (2 * tcp_autosndbuf_inc));
806 so->so_snd.sb_flags |= SB_TRIM;
807 }
808 }
809 }
810
811 /*
812 * Truncate to the maximum segment length or enable TCP Segmentation
813 * Offloading (if supported by hardware) and ensure that FIN is removed
814 * if the length no longer contains the last data byte.
815 *
816 * TSO may only be used if we are in a pure bulk sending state.
817 * The presence of TCP-MD5, SACK retransmits, SACK advertizements,
818 * ipfw rules and IP options, as well as disabling hardware checksum
819 * offload prevent using TSO. With TSO the TCP header is the same
820 * (except for the sequence number) for all generated packets. This
821 * makes it impossible to transmit any options which vary per generated
822 * segment or packet.
823 *
824 * The length of TSO bursts is limited to TCP_MAXWIN. That limit and
825 * removal of FIN (if not already caught here) are handled later after
826 * the exact length of the TCP options are known.
827 */
828 #if IPSEC
829 /*
830 * Pre-calculate here as we save another lookup into the darknesses
831 * of IPsec that way and can actually decide if TSO is ok.
832 */
833 if (ipsec_bypass == 0)
834 ipsec_optlen = ipsec_hdrsiz_tcp(tp);
835 #endif
836 if (len > tp->t_maxseg) {
837 if ((tp->t_flags & TF_TSO) && tcp_do_tso && hwcksum_tx &&
838 ip_use_randomid && kipf_count == 0 &&
839 dlil_filter_disable_tso_count == 0 &&
840 tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
841 sack_bytes_rxmt == 0 &&
842 inp->inp_options == NULL &&
843 inp->in6p_options == NULL
844 #if IPSEC
845 && ipsec_optlen == 0
846 #endif
847 #if IPFIREWALL
848 && (fw_enable == 0 || fw_bypass)
849 #endif
850 ) {
851 tso = 1;
852 sendalot = 0;
853 } else {
854 len = tp->t_maxseg;
855 sendalot = 1;
856 tso = 0;
857 }
858 }
859
860 /* Send one segment or less as a tail loss probe */
861 if (tp->t_flagsext & TF_SENT_TLPROBE) {
862 len = min(len, tp->t_maxseg);
863 sendalot = 0;
864 tso = 0;
865 }
866
867 #if MPTCP
868 if ((so->so_flags & SOF_MP_SUBFLOW) &&
869 !(tp->t_mpflags & TMPF_TCP_FALLBACK)) {
870 int newlen = len;
871 if (!(tp->t_mpflags & TMPF_PREESTABLISHED) &&
872 (tp->t_state > TCPS_CLOSED) &&
873 ((tp->t_mpflags & TMPF_SND_MPPRIO) ||
874 (tp->t_mpflags & TMPF_SND_REM_ADDR) ||
875 (tp->t_mpflags & TMPF_SND_MPFAIL))) {
876 if (len > 0) {
877 len = 0;
878 }
879 sendalot = 1;
880 mptcp_acknow = TRUE;
881 } else {
882 mptcp_acknow = FALSE;
883 }
884 /*
885 * The contiguous bytes in the subflow socket buffer can be
886 * discontiguous at the MPTCP level. Since only one DSS
887 * option can be sent in one packet, reduce length to match
888 * the contiguous MPTCP level. Set sendalot to send remainder.
889 */
890 if (len > 0)
891 newlen = mptcp_adj_sendlen(so, off, len);
892 if (newlen < len) {
893 len = newlen;
894 sendalot = 1;
895 }
896 }
897 #endif /* MPTCP */
898
899 /*
900 * If the socket is capable of doing unordered send,
901 * pull the amount of data that can be sent from the
902 * unordered priority queues to the serial queue in
903 * the socket buffer. If bytes are not yet available
904 * in the highest priority message, we may not be able
905 * to send any new data.
906 */
907 if (so->so_flags & SOF_ENABLE_MSGS) {
908 if ((off + len) >
909 so->so_msg_state->msg_serial_bytes) {
910 sbpull_unordered_data(so, off, len);
911
912 /* check if len needs to be modified */
913 if ((off + len) >
914 so->so_msg_state->msg_serial_bytes) {
915 len = so->so_msg_state->msg_serial_bytes - off;
916 if (len <= 0) {
917 len = 0;
918 tcpstat.tcps_msg_sndwaithipri++;
919 }
920 }
921 }
922 }
923
924 if (sack_rxmit) {
925 if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc))
926 flags &= ~TH_FIN;
927 } else {
928 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
929 flags &= ~TH_FIN;
930 }
931
932 recwin = tcp_sbspace(tp);
933
934 /*
935 * Sender silly window avoidance. We transmit under the following
936 * conditions when len is non-zero:
937 *
938 * - we've timed out (e.g. persist timer)
939 * - we need to retransmit
940 * - We have a full segment (or more with TSO)
941 * - This is the last buffer in a write()/send() and we are
942 * either idle or running NODELAY
943 * - we have more than 1/2 the maximum send window's worth of
944 * data (receiver may be limiting the window size)
945 */
946 if (len) {
947 if (tp->t_flagsext & TF_FORCE)
948 goto send;
949 if (SEQ_LT(tp->snd_nxt, tp->snd_max))
950 goto send;
951 if (sack_rxmit)
952 goto send;
953
954 /*
955 * Send new data on the connection only if it is
956 * not flow controlled
957 */
958 if (!INP_WAIT_FOR_IF_FEEDBACK(inp) ||
959 tp->t_state != TCPS_ESTABLISHED) {
960 if (len >= tp->t_maxseg)
961 goto send;
962 if (!(tp->t_flags & TF_MORETOCOME) &&
963 (idle || tp->t_flags & TF_NODELAY ||
964 tp->t_flags & TF_MAXSEGSNT ||
965 ALLOW_LIMITED_TRANSMIT(tp)) &&
966 (tp->t_flags & TF_NOPUSH) == 0 &&
967 len + off >= so->so_snd.sb_cc)
968 goto send;
969 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
970 goto send;
971 } else {
972 tcpstat.tcps_fcholdpacket++;
973 }
974 }
975
976 /*
977 * Compare available window to amount of window
978 * known to peer (as advertised window less
979 * next expected input). If the difference is at least two
980 * max size segments, or at least 25% of the maximum possible
981 * window, then we want to send a window update to the peer.
982 * Skip this if the connection is in T/TCP half-open state.
983 */
984 if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN)) {
985 /*
986 * "adv" is the amount we can increase the window,
987 * taking into account that we are limited by
988 * TCP_MAXWIN << tp->rcv_scale.
989 */
990 int32_t adv, oldwin = 0;
991 adv = imin(recwin, (int)TCP_MAXWIN << tp->rcv_scale) -
992 (tp->rcv_adv - tp->rcv_nxt);
993
994 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt))
995 oldwin = tp->rcv_adv - tp->rcv_nxt;
996
997 if (adv >= (int32_t) (2 * tp->t_maxseg)) {
998 /*
999 * Update only if the resulting scaled value of
1000 * the window changed, or if there is a change in
1001 * the sequence since the last ack. This avoids
1002 * what appears as dupe ACKS (see rdar://5640997)
1003 *
1004 * If streaming is detected avoid sending too many
1005 * window updates. We will depend on the delack
1006 * timer to send a window update when needed.
1007 */
1008 if (!(tp->t_flags & TF_STRETCHACK) &&
1009 (tp->last_ack_sent != tp->rcv_nxt ||
1010 ((oldwin + adv) >> tp->rcv_scale) >
1011 (oldwin >> tp->rcv_scale))) {
1012 goto send;
1013 }
1014
1015 /*
1016 * Make sure that the delayed ack timer is set if
1017 * we delayed sending a window update because of
1018 * streaming detection.
1019 */
1020 if ((tp->t_flags & TF_STRETCHACK) &&
1021 !(tp->t_flags & TF_DELACK)) {
1022 tp->t_flags |= TF_DELACK;
1023 tp->t_timer[TCPT_DELACK] =
1024 OFFSET_FROM_START(tp, tcp_delack);
1025 }
1026 }
1027 if (4 * adv >= (int32_t) so->so_rcv.sb_hiwat)
1028 goto send;
1029 }
1030
1031 /*
1032 * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW
1033 * is also a catch-all for the retransmit timer timeout case.
1034 */
1035 if (tp->t_flags & TF_ACKNOW)
1036 goto send;
1037 if ((flags & TH_RST) ||
1038 ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))
1039 goto send;
1040 if (SEQ_GT(tp->snd_up, tp->snd_una))
1041 goto send;
1042 #if MPTCP
1043 if (mptcp_acknow)
1044 goto send;
1045 #endif /* MPTCP */
1046 /*
1047 * If our state indicates that FIN should be sent
1048 * and we have not yet done so, then we need to send.
1049 */
1050 if ((flags & TH_FIN) &&
1051 (!(tp->t_flags & TF_SENTFIN) || tp->snd_nxt == tp->snd_una))
1052 goto send;
1053 /*
1054 * In SACK, it is possible for tcp_output to fail to send a segment
1055 * after the retransmission timer has been turned off. Make sure
1056 * that the retransmission timer is set.
1057 */
1058 if (SACK_ENABLED(tp) && (tp->t_state >= TCPS_ESTABLISHED) &&
1059 SEQ_GT(tp->snd_max, tp->snd_una) &&
1060 tp->t_timer[TCPT_REXMT] == 0 &&
1061 tp->t_timer[TCPT_PERSIST] == 0) {
1062 tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp,
1063 tp->t_rxtcur);
1064 goto just_return;
1065 }
1066 /*
1067 * TCP window updates are not reliable, rather a polling protocol
1068 * using ``persist'' packets is used to insure receipt of window
1069 * updates. The three ``states'' for the output side are:
1070 * idle not doing retransmits or persists
1071 * persisting to move a small or zero window
1072 * (re)transmitting and thereby not persisting
1073 *
1074 * tp->t_timer[TCPT_PERSIST]
1075 * is set when we are in persist state.
1076 * TF_FORCE (in tp->t_flagsext)
1077 * is set when we are called to send a persist packet.
1078 * tp->t_timer[TCPT_REXMT]
1079 * is set when we are retransmitting
1080 * The output side is idle when both timers are zero.
1081 *
1082 * If send window is too small, there is data to transmit, and no
1083 * retransmit or persist is pending, then go to persist state.
1084 * If nothing happens soon, send when timer expires:
1085 * if window is nonzero, transmit what we can,
1086 * otherwise force out a byte.
1087 */
1088 if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 &&
1089 tp->t_timer[TCPT_PERSIST] == 0) {
1090 tp->t_rxtshift = 0;
1091 tp->t_rxtstart = 0;
1092 tcp_setpersist(tp);
1093 }
1094 just_return:
1095 /*
1096 * If there is no reason to send a segment, just return.
1097 * but if there is some packets left in the packet list, send them now.
1098 */
1099 while (inp->inp_sndinprog_cnt == 0 &&
1100 tp->t_pktlist_head != NULL) {
1101 packetlist = tp->t_pktlist_head;
1102 packchain_listadd = tp->t_lastchain;
1103 packchain_sent++;
1104 TCP_PKTLIST_CLEAR(tp);
1105
1106 error = tcp_ip_output(so, tp, packetlist,
1107 packchain_listadd,
1108 tp_inp_options, (so_options & SO_DONTROUTE),
1109 (sack_rxmit | (sack_bytes_rxmt != 0)), recwin,
1110 #if INET6
1111 isipv6);
1112 #else /* INET6 */
1113 0);
1114 #endif /* !INET6 */
1115 }
1116 /* tcp was closed while we were in ip; resume close */
1117 if (inp->inp_sndinprog_cnt == 0 &&
1118 (tp->t_flags & TF_CLOSING)) {
1119 tp->t_flags &= ~TF_CLOSING;
1120 (void) tcp_close(tp);
1121 } else {
1122 tcp_check_timer_state(tp);
1123 }
1124 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
1125 return (0);
1126
1127 send:
1128 /*
1129 * Set TF_MAXSEGSNT flag if the segment size is greater than
1130 * or equal to the max segment size; otherwise clear it.
1131 */
1132 if (len > 0) {
1133 if (len >= tp->t_maxseg)
1134 tp->t_flags |= TF_MAXSEGSNT;
1135 else
1136 tp->t_flags &= ~TF_MAXSEGSNT;
1137 }
1138 /*
1139 * Before ESTABLISHED, force sending of initial options
1140 * unless TCP set not to do any options.
1141 * NOTE: we assume that the IP/TCP header plus TCP options
1142 * always fit in a single mbuf, leaving room for a maximum
1143 * link header, i.e.
1144 * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
1145 */
1146 optlen = 0;
1147 #if INET6
1148 if (isipv6)
1149 hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
1150 else
1151 #endif
1152 hdrlen = sizeof (struct tcpiphdr);
1153 if (flags & TH_SYN) {
1154 tp->snd_nxt = tp->iss;
1155 if ((tp->t_flags & TF_NOOPT) == 0) {
1156 u_short mss;
1157
1158 opt[0] = TCPOPT_MAXSEG;
1159 opt[1] = TCPOLEN_MAXSEG;
1160 mss = htons((u_short) tcp_mssopt(tp));
1161 (void)memcpy(opt + 2, &mss, sizeof(mss));
1162 optlen = TCPOLEN_MAXSEG;
1163
1164 if ((tp->t_flags & TF_REQ_SCALE) &&
1165 ((flags & TH_ACK) == 0 ||
1166 (tp->t_flags & TF_RCVD_SCALE))) {
1167 *((u_int32_t *)(void *)(opt + optlen)) = htonl(
1168 TCPOPT_NOP << 24 |
1169 TCPOPT_WINDOW << 16 |
1170 TCPOLEN_WINDOW << 8 |
1171 tp->request_r_scale);
1172 optlen += 4;
1173 }
1174 #if MPTCP
1175 if (mptcp_enable) {
1176 optlen = mptcp_setup_syn_opts(so, flags, opt,
1177 optlen);
1178 }
1179 #endif /* MPTCP */
1180 }
1181 }
1182
1183 /*
1184 * RFC 3168 states that:
1185 * - If you ever sent an ECN-setup SYN/SYN-ACK you must be prepared
1186 * to handle the TCP ECE flag, even if you also later send a
1187 * non-ECN-setup SYN/SYN-ACK.
1188 * - If you ever send a non-ECN-setup SYN/SYN-ACK, you must not set
1189 * the ip ECT flag.
1190 *
1191 * It is not clear how the ECE flag would ever be set if you never
1192 * set the IP ECT flag on outbound packets. All the same, we use
1193 * the TE_SETUPSENT to indicate that we have committed to handling
1194 * the TCP ECE flag correctly. We use the TE_SENDIPECT to indicate
1195 * whether or not we should set the IP ECT flag on outbound packet
1196 *
1197 * For a SYN-ACK, send an ECN setup SYN-ACK
1198 */
1199 if ((tcp_ecn_inbound || (tp->t_flags & TF_ENABLE_ECN))
1200 && (flags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
1201 if ((tp->ecn_flags & TE_SETUPRECEIVED) != 0) {
1202 if ((tp->ecn_flags & TE_SETUPSENT) == 0) {
1203 /* Setting TH_ECE makes this an ECN-setup SYN-ACK */
1204 flags |= TH_ECE;
1205
1206 /*
1207 * Record that we sent the ECN-setup and
1208 * default to setting IP ECT.
1209 */
1210 tp->ecn_flags |= (TE_SETUPSENT|TE_SENDIPECT);
1211 tcpstat.tcps_ecn_setup++;
1212 } else {
1213 /*
1214 * We sent an ECN-setup SYN-ACK but it was
1215 * dropped. Fallback to non-ECN-setup
1216 * SYN-ACK and clear flag to indicate that
1217 * we should not send data with IP ECT set
1218 *
1219 * Pretend we didn't receive an
1220 * ECN-setup SYN.
1221 */
1222 tp->ecn_flags &= ~TE_SETUPRECEIVED;
1223 /*
1224 * We already incremented the counter
1225 * assuming that the ECN setup will
1226 * succeed. Decrementing here to
1227 * correct it.
1228 */
1229 tcpstat.tcps_ecn_setup--;
1230 }
1231 }
1232 } else if ((tcp_ecn_outbound || (tp->t_flags & TF_ENABLE_ECN))
1233 && (flags & (TH_SYN | TH_ACK)) == TH_SYN) {
1234 if ((tp->ecn_flags & TE_SETUPSENT) == 0) {
1235 /* Setting TH_ECE and TH_CWR makes this an ECN-setup SYN */
1236 flags |= (TH_ECE | TH_CWR);
1237
1238 /*
1239 * Record that we sent the ECN-setup and default to
1240 * setting IP ECT.
1241 */
1242 tp->ecn_flags |= (TE_SETUPSENT | TE_SENDIPECT);
1243 } else {
1244 /*
1245 * We sent an ECN-setup SYN but it was dropped.
1246 * Fall back to no ECN and clear flag indicating
1247 * we should send data with IP ECT set.
1248 */
1249 tp->ecn_flags &= ~TE_SENDIPECT;
1250 }
1251 }
1252
1253 /*
1254 * Check if we should set the TCP CWR flag.
1255 * CWR flag is sent when we reduced the congestion window because
1256 * we received a TCP ECE or we performed a fast retransmit. We
1257 * never set the CWR flag on retransmitted packets. We only set
1258 * the CWR flag on data packets. Pure acks don't have this set.
1259 */
1260 if ((tp->ecn_flags & TE_SENDCWR) != 0 && len != 0 &&
1261 !SEQ_LT(tp->snd_nxt, tp->snd_max) && !sack_rxmit) {
1262 flags |= TH_CWR;
1263 tp->ecn_flags &= ~TE_SENDCWR;
1264 tcpstat.tcps_sent_cwr++;
1265 }
1266
1267 /*
1268 * Check if we should set the TCP ECE flag.
1269 */
1270 if ((tp->ecn_flags & TE_SENDECE) != 0 && len == 0) {
1271 flags |= TH_ECE;
1272 tcpstat.tcps_sent_ece++;
1273 }
1274
1275 /*
1276 * Send a timestamp and echo-reply if this is a SYN and our side
1277 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
1278 * and our peer have sent timestamps in our SYN's.
1279 */
1280 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
1281 (flags & TH_RST) == 0 &&
1282 ((flags & TH_ACK) == 0 ||
1283 (tp->t_flags & TF_RCVD_TSTMP))) {
1284 u_int32_t *lp = (u_int32_t *)(void *)(opt + optlen);
1285
1286 /* Form timestamp option as shown in appendix A of RFC 1323. */
1287 *lp++ = htonl(TCPOPT_TSTAMP_HDR);
1288 *lp++ = htonl(tcp_now);
1289 *lp = htonl(tp->ts_recent);
1290 optlen += TCPOLEN_TSTAMP_APPA;
1291 }
1292
1293 /* Note the timestamp for receive buffer autosizing */
1294 if (tp->rfbuf_ts == 0 && (so->so_rcv.sb_flags & SB_AUTOSIZE))
1295 tp->rfbuf_ts = tcp_now;
1296
1297 if (SACK_ENABLED(tp) && ((tp->t_flags & TF_NOOPT) == 0)) {
1298 /*
1299 * Tack on the SACK permitted option *last*.
1300 * And do padding of options after tacking this on.
1301 * This is because if MSS, TS, WinScale and Signatures are
1302 * all present, we have just 2 bytes left for the SACK
1303 * permitted option, which is just enough.
1304 */
1305 /*
1306 * If this is the first SYN of connection (not a SYN
1307 * ACK), include SACK permitted option. If this is a
1308 * SYN ACK, include SACK permitted option if peer has
1309 * already done so. This is only for active connect,
1310 * since the syncache takes care of the passive connect.
1311 */
1312 if ((flags & TH_SYN) &&
1313 (!(flags & TH_ACK) || (tp->t_flags & TF_SACK_PERMIT))) {
1314 u_char *bp;
1315 bp = (u_char *)opt + optlen;
1316
1317 *bp++ = TCPOPT_SACK_PERMITTED;
1318 *bp++ = TCPOLEN_SACK_PERMITTED;
1319 optlen += TCPOLEN_SACK_PERMITTED;
1320 }
1321 }
1322 #if MPTCP
1323 if (so->so_flags & SOF_MP_SUBFLOW) {
1324 /*
1325 * Its important to piggyback acks with data as ack only packets
1326 * may get lost and data packets that don't send Data ACKs
1327 * still advance the subflow level ACK and therefore make it
1328 * hard for the remote end to recover in low cwnd situations.
1329 */
1330 if (len != 0) {
1331 tp->t_mpflags |= (TMPF_SEND_DSN |
1332 TMPF_MPTCP_ACKNOW);
1333 } else {
1334 tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
1335 }
1336 optlen = mptcp_setup_opts(tp, off, &opt[0], optlen, flags,
1337 len, &dlenp, &finp, &dss_val, &sseqp, &mptcp_acknow);
1338 tp->t_mpflags &= ~TMPF_SEND_DSN;
1339 }
1340 #endif /* MPTCP */
1341
1342 if (SACK_ENABLED(tp) && ((tp->t_flags & TF_NOOPT) == 0)) {
1343 /*
1344 * Send SACKs if necessary. This should be the last
1345 * option processed. Only as many SACKs are sent as
1346 * are permitted by the maximum options size.
1347 *
1348 * In general, SACK blocks consume 8*n+2 bytes.
1349 * So a full size SACK blocks option is 34 bytes
1350 * (to generate 4 SACK blocks). At a minimum,
1351 * we need 10 bytes (to generate 1 SACK block).
1352 * If TCP Timestamps (12 bytes) and TCP Signatures
1353 * (18 bytes) are both present, we'll just have
1354 * 10 bytes for SACK options 40 - (12 + 18).
1355 */
1356 if (TCPS_HAVEESTABLISHED(tp->t_state) &&
1357 (tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0 &&
1358 MAX_TCPOPTLEN - optlen - 2 >= TCPOLEN_SACK) {
1359 int nsack, padlen;
1360 u_char *bp = (u_char *)opt + optlen;
1361 u_int32_t *lp;
1362
1363 nsack = (MAX_TCPOPTLEN - optlen - 2) / TCPOLEN_SACK;
1364 nsack = min(nsack, tp->rcv_numsacks);
1365 sackoptlen = (2 + nsack * TCPOLEN_SACK);
1366
1367 /*
1368 * First we need to pad options so that the
1369 * SACK blocks can start at a 4-byte boundary
1370 * (sack option and length are at a 2 byte offset).
1371 */
1372 padlen = (MAX_TCPOPTLEN - optlen - sackoptlen) % 4;
1373 optlen += padlen;
1374 while (padlen-- > 0)
1375 *bp++ = TCPOPT_NOP;
1376
1377 tcpstat.tcps_sack_send_blocks++;
1378 *bp++ = TCPOPT_SACK;
1379 *bp++ = sackoptlen;
1380 lp = (u_int32_t *)(void *)bp;
1381 for (i = 0; i < nsack; i++) {
1382 struct sackblk sack = tp->sackblks[i];
1383 *lp++ = htonl(sack.start);
1384 *lp++ = htonl(sack.end);
1385 }
1386 optlen += sackoptlen;
1387 }
1388 }
1389
1390 /* Pad TCP options to a 4 byte boundary */
1391 if (optlen < MAX_TCPOPTLEN && (optlen % sizeof(u_int32_t))) {
1392 int pad = sizeof(u_int32_t) - (optlen % sizeof(u_int32_t));
1393 u_char *bp = (u_char *)opt + optlen;
1394
1395 optlen += pad;
1396 while (pad) {
1397 *bp++ = TCPOPT_EOL;
1398 pad--;
1399 }
1400 }
1401
1402 hdrlen += optlen;
1403
1404 #if INET6
1405 if (isipv6)
1406 ipoptlen = ip6_optlen(inp);
1407 else
1408 #endif
1409 {
1410 if (tp_inp_options) {
1411 ipoptlen = tp_inp_options->m_len -
1412 offsetof(struct ipoption, ipopt_list);
1413 } else {
1414 ipoptlen = 0;
1415 }
1416 }
1417 #if IPSEC
1418 ipoptlen += ipsec_optlen;
1419 #endif
1420
1421 /*
1422 * Adjust data length if insertion of options will
1423 * bump the packet length beyond the t_maxopd length.
1424 * Clear the FIN bit because we cut off the tail of
1425 * the segment.
1426 *
1427 * When doing TSO limit a burst to TCP_MAXWIN minus the
1428 * IP, TCP and Options length to keep ip->ip_len from
1429 * overflowing. Prevent the last segment from being
1430 * fractional thus making them all equal sized and set
1431 * the flag to continue sending. TSO is disabled when
1432 * IP options or IPSEC are present.
1433 */
1434 if (len + optlen + ipoptlen > tp->t_maxopd) {
1435 /*
1436 * If there is still more to send,
1437 * don't close the connection.
1438 */
1439 flags &= ~TH_FIN;
1440 if (tso) {
1441 int32_t tso_maxlen;
1442
1443 tso_maxlen = tp->tso_max_segment_size ?
1444 tp->tso_max_segment_size : TCP_MAXWIN;
1445
1446 if (len > tso_maxlen - hdrlen - optlen) {
1447 len = tso_maxlen - hdrlen - optlen;
1448 len = len - (len % (tp->t_maxopd - optlen));
1449 sendalot = 1;
1450 } else if (tp->t_flags & TF_NEEDFIN) {
1451 sendalot = 1;
1452 }
1453 } else {
1454 len = tp->t_maxopd - optlen - ipoptlen;
1455 sendalot = 1;
1456 }
1457 }
1458 #if MPTCP
1459 /* Reduce the length in the DSS option if it is greater than len */
1460 if (dlenp) {
1461 /*
1462 * To test this path without SACK, artificially
1463 * decrement len with something like
1464 * if (len > 10)
1465 len -= 10;
1466 */
1467 if (ntohs(*dlenp) > len) {
1468 *dlenp = htons(len);
1469 /* Unset the FIN flag, if len was adjusted */
1470 if (finp) {
1471 *finp &= ~MDSS_F;
1472 }
1473 sendalot = 1;
1474 }
1475 }
1476 #endif /* MPTCP */
1477
1478 if (max_linkhdr + hdrlen > MCLBYTES)
1479 panic("tcphdr too big");
1480
1481 /* Check if there is enough data in the send socket
1482 * buffer to start measuring bw
1483 */
1484 if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 &&
1485 (tp->t_bwmeas != NULL) &&
1486 (tp->t_flagsext & TF_BWMEAS_INPROGRESS) == 0 &&
1487 (so->so_snd.sb_cc - (tp->snd_max - tp->snd_una)) >=
1488 tp->t_bwmeas->bw_minsize) {
1489 tp->t_bwmeas->bw_size = min(
1490 (so->so_snd.sb_cc - (tp->snd_max - tp->snd_una)),
1491 tp->t_bwmeas->bw_maxsize);
1492 tp->t_flagsext |= TF_BWMEAS_INPROGRESS;
1493 tp->t_bwmeas->bw_start = tp->snd_max;
1494 tp->t_bwmeas->bw_ts = tcp_now;
1495 }
1496
1497 VERIFY(inp->inp_flowhash != 0);
1498 /*
1499 * Grab a header mbuf, attaching a copy of data to
1500 * be transmitted, and initialize the header from
1501 * the template for sends on this connection.
1502 */
1503 if (len) {
1504 if ((tp->t_flagsext & TF_FORCE) && len == 1)
1505 tcpstat.tcps_sndprobe++;
1506 else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
1507 tcpstat.tcps_sndrexmitpack++;
1508 tcpstat.tcps_sndrexmitbyte += len;
1509 if (nstat_collect) {
1510 nstat_route_tx(inp->inp_route.ro_rt, 1,
1511 len, NSTAT_TX_FLAG_RETRANSMIT);
1512 INP_ADD_STAT(inp, cell, wifi, wired,
1513 txpackets, 1);
1514 INP_ADD_STAT(inp, cell, wifi, wired,
1515 txbytes, len);
1516 tp->t_stat.txretransmitbytes += len;
1517 }
1518 } else {
1519 tcpstat.tcps_sndpack++;
1520 tcpstat.tcps_sndbyte += len;
1521
1522 if (nstat_collect) {
1523 INP_ADD_STAT(inp, cell, wifi, wired,
1524 txpackets, 1);
1525 INP_ADD_STAT(inp, cell, wifi, wired,
1526 txbytes, len);
1527 }
1528 }
1529 #if MPTCP
1530 if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
1531 tcpstat.tcps_mp_sndpacks++;
1532 tcpstat.tcps_mp_sndbytes += len;
1533 }
1534 #endif /* MPTCP */
1535 /*
1536 * try to use the new interface that allocates all
1537 * the necessary mbuf hdrs under 1 mbuf lock and
1538 * avoids rescanning the socket mbuf list if
1539 * certain conditions are met. This routine can't
1540 * be used in the following cases...
1541 * 1) the protocol headers exceed the capacity of
1542 * of a single mbuf header's data area (no cluster attached)
1543 * 2) the length of the data being transmitted plus
1544 * the protocol headers fits into a single mbuf header's
1545 * data area (no cluster attached)
1546 */
1547 m = NULL;
1548
1549 /* minimum length we are going to allocate */
1550 allocated_len = MHLEN;
1551 if (MHLEN < hdrlen + max_linkhdr) {
1552 MGETHDR(m, M_DONTWAIT, MT_HEADER);
1553 if (m == NULL) {
1554 error = ENOBUFS;
1555 goto out;
1556 }
1557 MCLGET(m, M_DONTWAIT);
1558 if ((m->m_flags & M_EXT) == 0) {
1559 m_freem(m);
1560 error = ENOBUFS;
1561 goto out;
1562 }
1563 m->m_data += max_linkhdr;
1564 m->m_len = hdrlen;
1565 allocated_len = MCLBYTES;
1566 }
1567 if (len <= allocated_len - hdrlen - max_linkhdr) {
1568 if (m == NULL) {
1569 VERIFY(allocated_len <= MHLEN);
1570 MGETHDR(m, M_DONTWAIT, MT_HEADER);
1571 if (m == NULL) {
1572 error = ENOBUFS;
1573 goto out;
1574 }
1575 m->m_data += max_linkhdr;
1576 m->m_len = hdrlen;
1577 }
1578 /* makes sure we still have data left to be sent at this point */
1579 if (so->so_snd.sb_mb == NULL || off < 0) {
1580 if (m != NULL) m_freem(m);
1581 error = 0; /* should we return an error? */
1582 goto out;
1583 }
1584 m_copydata(so->so_snd.sb_mb, off, (int) len,
1585 mtod(m, caddr_t) + hdrlen);
1586 m->m_len += len;
1587 } else {
1588 uint32_t copymode;
1589 /*
1590 * Retain packet header metadata at the socket
1591 * buffer if this is an MPTCP subflow,
1592 * otherwise move it.
1593 */
1594 copymode = M_COPYM_MOVE_HDR;
1595 #if MPTCP
1596 if (so->so_flags & SOF_MP_SUBFLOW) {
1597 copymode = M_COPYM_NOOP_HDR;
1598 }
1599 #endif /* MPTCP */
1600 if (m != NULL) {
1601 m->m_next = m_copym_mode(so->so_snd.sb_mb,
1602 off, (int)len, M_DONTWAIT, copymode);
1603 if (m->m_next == NULL) {
1604 (void) m_free(m);
1605 error = ENOBUFS;
1606 goto out;
1607 }
1608 } else {
1609 /*
1610 * make sure we still have data left
1611 * to be sent at this point
1612 */
1613 if (so->so_snd.sb_mb == NULL) {
1614 error = 0; /* should we return an error? */
1615 goto out;
1616 }
1617
1618 /*
1619 * m_copym_with_hdrs will always return the
1620 * last mbuf pointer and the offset into it that
1621 * it acted on to fulfill the current request,
1622 * whether a valid 'hint' was passed in or not.
1623 */
1624 if ((m = m_copym_with_hdrs(so->so_snd.sb_mb,
1625 off, len, M_DONTWAIT, NULL, NULL,
1626 copymode)) == NULL) {
1627 error = ENOBUFS;
1628 goto out;
1629 }
1630 m->m_data += max_linkhdr;
1631 m->m_len = hdrlen;
1632 }
1633 }
1634 /*
1635 * If we're sending everything we've got, set PUSH.
1636 * (This will keep happy those implementations which only
1637 * give data to the user when a buffer fills or
1638 * a PUSH comes in.)
1639 */
1640 if (off + len == so->so_snd.sb_cc)
1641 flags |= TH_PUSH;
1642 } else {
1643 if (tp->t_flags & TF_ACKNOW)
1644 tcpstat.tcps_sndacks++;
1645 else if (flags & (TH_SYN|TH_FIN|TH_RST))
1646 tcpstat.tcps_sndctrl++;
1647 else if (SEQ_GT(tp->snd_up, tp->snd_una))
1648 tcpstat.tcps_sndurg++;
1649 else
1650 tcpstat.tcps_sndwinup++;
1651
1652 MGETHDR(m, M_DONTWAIT, MT_HEADER); /* MAC-OK */
1653 if (m == NULL) {
1654 error = ENOBUFS;
1655 goto out;
1656 }
1657 if (MHLEN < (hdrlen + max_linkhdr)) {
1658 MCLGET(m, M_DONTWAIT);
1659 if ((m->m_flags & M_EXT) == 0) {
1660 m_freem(m);
1661 error = ENOBUFS;
1662 goto out;
1663 }
1664 }
1665 m->m_data += max_linkhdr;
1666 m->m_len = hdrlen;
1667 }
1668 m->m_pkthdr.rcvif = 0;
1669 #if MPTCP
1670 /* Before opt is copied to the mbuf, set the csum field */
1671 mptcp_output_csum(tp, m, len, hdrlen, dss_val, sseqp);
1672 #endif /* MPTCP */
1673 #if CONFIG_MACF_NET
1674 mac_mbuf_label_associate_inpcb(inp, m);
1675 #endif
1676 #if INET6
1677 if (isipv6) {
1678 ip6 = mtod(m, struct ip6_hdr *);
1679 th = (struct tcphdr *)(void *)(ip6 + 1);
1680 tcp_fillheaders(tp, ip6, th);
1681 if ((tp->ecn_flags & TE_SENDIPECT) != 0 && len &&
1682 !SEQ_LT(tp->snd_nxt, tp->snd_max) && !sack_rxmit) {
1683 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
1684 }
1685 svc_flags |= PKT_SCF_IPV6;
1686 #if PF_ECN
1687 m->m_pkthdr.pf_mtag.pftag_hdr = (void *)ip6;
1688 m->m_pkthdr.pf_mtag.pftag_flags |= PF_TAG_HDR_INET6;
1689 #endif /* PF_ECN */
1690 } else
1691 #endif /* INET6 */
1692 {
1693 ip = mtod(m, struct ip *);
1694 ipov = (struct ipovly *)ip;
1695 th = (struct tcphdr *)(void *)(ip + 1);
1696 /* this picks up the pseudo header (w/o the length) */
1697 tcp_fillheaders(tp, ip, th);
1698 if ((tp->ecn_flags & TE_SENDIPECT) != 0 && len &&
1699 !SEQ_LT(tp->snd_nxt, tp->snd_max) && !sack_rxmit) {
1700 ip->ip_tos = IPTOS_ECN_ECT0;
1701 }
1702 #if PF_ECN
1703 m->m_pkthdr.pf_mtag.pftag_hdr = (void *)ip;
1704 m->m_pkthdr.pf_mtag.pftag_flags |= PF_TAG_HDR_INET;
1705 #endif /* PF_ECN */
1706 }
1707
1708 /*
1709 * Fill in fields, remembering maximum advertised
1710 * window for use in delaying messages about window sizes.
1711 * If resending a FIN, be sure not to use a new sequence number.
1712 */
1713 if (flags & TH_FIN && (tp->t_flags & TF_SENTFIN) &&
1714 tp->snd_nxt == tp->snd_max)
1715 tp->snd_nxt--;
1716 /*
1717 * If we are doing retransmissions, then snd_nxt will
1718 * not reflect the first unsent octet. For ACK only
1719 * packets, we do not want the sequence number of the
1720 * retransmitted packet, we want the sequence number
1721 * of the next unsent octet. So, if there is no data
1722 * (and no SYN or FIN), use snd_max instead of snd_nxt
1723 * when filling in ti_seq. But if we are in persist
1724 * state, snd_max might reflect one byte beyond the
1725 * right edge of the window, so use snd_nxt in that
1726 * case, since we know we aren't doing a retransmission.
1727 * (retransmit and persist are mutually exclusive...)
1728 */
1729 if (sack_rxmit == 0) {
1730 if (len || (flags & (TH_SYN|TH_FIN)) || tp->t_timer[TCPT_PERSIST])
1731 th->th_seq = htonl(tp->snd_nxt);
1732 else
1733 th->th_seq = htonl(tp->snd_max);
1734 } else {
1735 th->th_seq = htonl(p->rxmit);
1736 p->rxmit += len;
1737 tp->sackhint.sack_bytes_rexmit += len;
1738 }
1739 th->th_ack = htonl(tp->rcv_nxt);
1740 tp->last_ack_sent = tp->rcv_nxt;
1741 #if MPTCP
1742 /* Initialize the ACK field to a value as 0 ack fields are dropped */
1743 if (early_data_sent) {
1744 th->th_ack = th->th_seq + 1;
1745 }
1746 #endif /* MPTCP */
1747 if (optlen) {
1748 bcopy(opt, th + 1, optlen);
1749 th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
1750 }
1751 th->th_flags = flags;
1752 /*
1753 * Calculate receive window. Don't shrink window,
1754 * but avoid silly window syndrome.
1755 */
1756 if (recwin < (int32_t)(so->so_rcv.sb_hiwat / 4) && recwin < (int)tp->t_maxseg)
1757 recwin = 0;
1758 if (recwin < (int32_t)(tp->rcv_adv - tp->rcv_nxt))
1759 recwin = (int32_t)(tp->rcv_adv - tp->rcv_nxt);
1760 if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0) {
1761 if (recwin > (int32_t)slowlink_wsize)
1762 recwin = slowlink_wsize;
1763 }
1764
1765 #if TRAFFIC_MGT
1766 if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(so)) {
1767 if (tcp_recv_throttle(tp)) {
1768 uint32_t min_iaj_win =
1769 tcp_min_iaj_win * tp->t_maxseg;
1770 if (tp->iaj_rwintop == 0 ||
1771 SEQ_LT(tp->iaj_rwintop, tp->rcv_adv))
1772 tp->iaj_rwintop = tp->rcv_adv;
1773 if (SEQ_LT(tp->iaj_rwintop,
1774 tp->rcv_nxt + min_iaj_win))
1775 tp->iaj_rwintop = tp->rcv_nxt + min_iaj_win;
1776 recwin = min(tp->iaj_rwintop - tp->rcv_nxt, recwin);
1777 }
1778 }
1779 #endif /* TRAFFIC_MGT */
1780
1781 if (recwin > (int32_t)(TCP_MAXWIN << tp->rcv_scale))
1782 recwin = (int32_t)(TCP_MAXWIN << tp->rcv_scale);
1783 th->th_win = htons((u_short) (recwin>>tp->rcv_scale));
1784
1785 /*
1786 * Adjust the RXWIN0SENT flag - indicate that we have advertised
1787 * a 0 window. This may cause the remote transmitter to stall. This
1788 * flag tells soreceive() to disable delayed acknowledgements when
1789 * draining the buffer. This can occur if the receiver is attempting
1790 * to read more data than can be buffered prior to transmitting on
1791 * the connection.
1792 */
1793 if (th->th_win == 0)
1794 tp->t_flags |= TF_RXWIN0SENT;
1795 else
1796 tp->t_flags &= ~TF_RXWIN0SENT;
1797 if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
1798 th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
1799 th->th_flags |= TH_URG;
1800 } else {
1801 /*
1802 * If no urgent pointer to send, then we pull
1803 * the urgent pointer to the left edge of the send window
1804 * so that it doesn't drift into the send window on sequence
1805 * number wraparound.
1806 */
1807 tp->snd_up = tp->snd_una; /* drag it along */
1808 }
1809
1810 /*
1811 * Put TCP length in extended header, and then
1812 * checksum extended header and data.
1813 */
1814 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
1815 #if INET6
1816 if (isipv6) {
1817 /*
1818 * ip6_plen does not need to be filled in now; it will be filled
1819 * in by ip6_output.
1820 */
1821 m->m_pkthdr.csum_flags = CSUM_TCPIPV6;
1822 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1823 if (len + optlen)
1824 th->th_sum = in_addword(th->th_sum,
1825 htons((u_short)(optlen + len)));
1826 }
1827 else
1828 #endif /* INET6 */
1829 {
1830 m->m_pkthdr.csum_flags = CSUM_TCP;
1831 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1832 if (len + optlen)
1833 th->th_sum = in_addword(th->th_sum,
1834 htons((u_short)(optlen + len)));
1835 }
1836
1837 /*
1838 * Enable TSO and specify the size of the segments.
1839 * The TCP pseudo header checksum is always provided.
1840 */
1841 if (tso) {
1842 #if INET6
1843 if (isipv6)
1844 m->m_pkthdr.csum_flags |= CSUM_TSO_IPV6;
1845 else
1846 #endif /* INET6 */
1847 m->m_pkthdr.csum_flags |= CSUM_TSO_IPV4;
1848
1849 m->m_pkthdr.tso_segsz = tp->t_maxopd - optlen;
1850 } else {
1851 m->m_pkthdr.tso_segsz = 0;
1852 }
1853
1854 /*
1855 * In transmit state, time the transmission and arrange for
1856 * the retransmit. In persist state, just set snd_max.
1857 */
1858 if (!(tp->t_flagsext & TF_FORCE)
1859 || tp->t_timer[TCPT_PERSIST] == 0) {
1860 tcp_seq startseq = tp->snd_nxt;
1861
1862 /*
1863 * Advance snd_nxt over sequence space of this segment.
1864 */
1865 if (flags & (TH_SYN|TH_FIN)) {
1866 if (flags & TH_SYN)
1867 tp->snd_nxt++;
1868 if ((flags & TH_FIN) &&
1869 !(tp->t_flags & TF_SENTFIN)) {
1870 tp->snd_nxt++;
1871 tp->t_flags |= TF_SENTFIN;
1872 }
1873 }
1874 if (sack_rxmit)
1875 goto timer;
1876 tp->snd_nxt += len;
1877 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
1878 tp->snd_max = tp->snd_nxt;
1879 /*
1880 * Time this transmission if not a retransmission and
1881 * not currently timing anything.
1882 */
1883 if (tp->t_rtttime == 0) {
1884 tp->t_rtttime = tcp_now;
1885 tp->t_rtseq = startseq;
1886 tcpstat.tcps_segstimed++;
1887 }
1888 }
1889
1890 /*
1891 * Set retransmit timer if not currently set,
1892 * and not doing an ack or a keep-alive probe.
1893 */
1894 timer:
1895 if (tp->t_timer[TCPT_REXMT] == 0 &&
1896 ((sack_rxmit && tp->snd_nxt != tp->snd_max) ||
1897 tp->snd_nxt != tp->snd_una || (flags & TH_FIN))) {
1898 if (tp->t_timer[TCPT_PERSIST]) {
1899 tp->t_timer[TCPT_PERSIST] = 0;
1900 tp->t_rxtshift = 0;
1901 tp->t_rxtstart = 0;
1902 tp->t_persist_stop = 0;
1903 }
1904 tp->t_timer[TCPT_REXMT] =
1905 OFFSET_FROM_START(tp, tp->t_rxtcur);
1906 }
1907
1908 /*
1909 * Set tail loss probe timeout if new data is being
1910 * transmitted. This will be supported only when
1911 * SACK option is enabled on a connection.
1912 *
1913 * Every time new data is sent PTO will get reset.
1914 */
1915 if (tcp_enable_tlp && tp->t_state == TCPS_ESTABLISHED &&
1916 SACK_ENABLED(tp) && !IN_FASTRECOVERY(tp)
1917 && tp->snd_nxt == tp->snd_max
1918 && SEQ_GT(tp->snd_nxt, tp->snd_una)
1919 && tp->t_rxtshift == 0
1920 && (tp->t_flagsext & (TF_SENT_TLPROBE|TF_PKTS_REORDERED)) == 0) {
1921 u_int32_t pto, srtt, new_rto = 0;
1922
1923 /*
1924 * Using SRTT alone to set PTO can cause spurious
1925 * retransmissions on wireless networks where there
1926 * is a lot of variance in RTT. Taking variance
1927 * into account will avoid this.
1928 */
1929 srtt = tp->t_srtt >> TCP_RTT_SHIFT;
1930 pto = ((TCP_REXMTVAL(tp)) * 3) >> 1;
1931 pto = max (2 * srtt, pto);
1932 if ((tp->snd_max - tp->snd_una) == tp->t_maxseg)
1933 pto = max(pto,
1934 (((3 * pto) >> 2) + tcp_delack * 2));
1935 else
1936 pto = max(10, pto);
1937
1938 /* if RTO is less than PTO, choose RTO instead */
1939 if (tp->t_rxtcur < pto) {
1940 /*
1941 * Schedule PTO instead of RTO in favor of
1942 * fast recovery.
1943 */
1944 pto = tp->t_rxtcur;
1945
1946 /* Reset the next RTO to be after PTO. */
1947 TCPT_RANGESET(new_rto,
1948 (pto + TCP_REXMTVAL(tp)),
1949 max(tp->t_rttmin, tp->t_rttcur + 2),
1950 TCPTV_REXMTMAX, 0);
1951 tp->t_timer[TCPT_REXMT] =
1952 OFFSET_FROM_START(tp, new_rto);
1953 }
1954 tp->t_timer[TCPT_PTO] = OFFSET_FROM_START(tp, pto);
1955 }
1956 } else {
1957 /*
1958 * Persist case, update snd_max but since we are in
1959 * persist mode (no window) we do not update snd_nxt.
1960 */
1961 int xlen = len;
1962 if (flags & TH_SYN)
1963 ++xlen;
1964 if ((flags & TH_FIN) &&
1965 !(tp->t_flags & TF_SENTFIN)) {
1966 ++xlen;
1967 tp->t_flags |= TF_SENTFIN;
1968 }
1969 if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max))
1970 tp->snd_max = tp->snd_nxt + len;
1971 }
1972
1973 #if TCPDEBUG
1974 /*
1975 * Trace.
1976 */
1977 if (so_options & SO_DEBUG)
1978 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
1979 #endif
1980
1981 /*
1982 * Fill in IP length and desired time to live and
1983 * send to IP level. There should be a better way
1984 * to handle ttl and tos; we could keep them in
1985 * the template, but need a way to checksum without them.
1986 */
1987 #if INET6
1988 /*
1989 * m->m_pkthdr.len should have been set before cksum calculation,
1990 * because in6_cksum() needs it.
1991 */
1992 if (isipv6) {
1993 /*
1994 * we separately set hoplimit for every segment, since the
1995 * user might want to change the value via setsockopt.
1996 * Also, desired default hop limit might be changed via
1997 * Neighbor Discovery.
1998 */
1999 ip6->ip6_hlim = in6_selecthlim(inp, inp->in6p_route.ro_rt ?
2000 inp->in6p_route.ro_rt->rt_ifp : NULL);
2001
2002 /* TODO: IPv6 IP6TOS_ECT bit on */
2003 KERNEL_DEBUG(DBG_LAYER_BEG,
2004 ((inp->inp_fport << 16) | inp->inp_lport),
2005 (((inp->in6p_laddr.s6_addr16[0] & 0xffff) << 16) |
2006 (inp->in6p_faddr.s6_addr16[0] & 0xffff)),
2007 sendalot,0,0);
2008 } else
2009 #endif /* INET6 */
2010 {
2011 ip->ip_len = m->m_pkthdr.len;
2012 ip->ip_ttl = inp->inp_ip_ttl; /* XXX */
2013 ip->ip_tos |= (inp->inp_ip_tos & ~IPTOS_ECN_MASK);/* XXX */
2014 KERNEL_DEBUG(DBG_LAYER_BEG,
2015 ((inp->inp_fport << 16) | inp->inp_lport),
2016 (((inp->inp_laddr.s_addr & 0xffff) << 16) |
2017 (inp->inp_faddr.s_addr & 0xffff)), 0,0,0);
2018 }
2019
2020 /*
2021 * See if we should do MTU discovery.
2022 * Look at the flag updated on the following criteria:
2023 * 1) Path MTU discovery is authorized by the sysctl
2024 * 2) The route isn't set yet (unlikely but could happen)
2025 * 3) The route is up
2026 * 4) the MTU is not locked (if it is, then discovery has been
2027 * disabled for that route)
2028 */
2029 #if INET6
2030 if (!isipv6)
2031 #endif /* INET6 */
2032 if (path_mtu_discovery && (tp->t_flags & TF_PMTUD))
2033 ip->ip_off |= IP_DF;
2034
2035 #if NECP
2036 {
2037 necp_kernel_policy_id policy_id;
2038 if (!necp_socket_is_allowed_to_send_recv(inp, &policy_id)) {
2039 m_freem(m);
2040 error = EHOSTUNREACH;
2041 goto out;
2042 }
2043
2044 necp_mark_packet_from_socket(m, inp, policy_id);
2045 }
2046 #endif /* NECP */
2047
2048 #if IPSEC
2049 if (inp->inp_sp != NULL)
2050 ipsec_setsocket(m, so);
2051 #endif /*IPSEC*/
2052
2053 /*
2054 * The socket is kept locked while sending out packets in ip_output, even if packet chaining is not active.
2055 */
2056 lost = 0;
2057
2058 /*
2059 * Embed the flow hash in pkt hdr and mark the packet as
2060 * capable of flow controlling
2061 */
2062 m->m_pkthdr.pkt_flowsrc = FLOWSRC_INPCB;
2063 m->m_pkthdr.pkt_flowid = inp->inp_flowhash;
2064 m->m_pkthdr.pkt_flags |= PKTF_FLOW_ID | PKTF_FLOW_LOCALSRC;
2065 #if MPTCP
2066 /* Disable flow advisory when using MPTCP. */
2067 if (!(tp->t_mpflags & TMPF_MPTCP_TRUE))
2068 #endif /* MPTCP */
2069 m->m_pkthdr.pkt_flags |= PKTF_FLOW_ADV;
2070 m->m_pkthdr.pkt_proto = IPPROTO_TCP;
2071
2072 m->m_nextpkt = NULL;
2073
2074 if (inp->inp_last_outifp != NULL &&
2075 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2076 /* Hint to prioritize this packet if
2077 * 1. if the packet has no data
2078 * 2. the interface supports transmit-start model and did
2079 * not disable ACK prioritization.
2080 * 3. Only ACK flag is set.
2081 * 4. there is no outstanding data on this connection.
2082 */
2083 if (tcp_prioritize_acks != 0 && len == 0 &&
2084 (inp->inp_last_outifp->if_eflags &
2085 (IFEF_TXSTART | IFEF_NOACKPRI)) == IFEF_TXSTART &&
2086 th->th_flags == TH_ACK && tp->snd_una == tp->snd_max &&
2087 tp->t_timer[TCPT_REXMT] == 0) {
2088 svc_flags |= PKT_SCF_TCP_ACK;
2089 }
2090 set_packet_service_class(m, so, MBUF_SC_UNSPEC, svc_flags);
2091 }
2092
2093 tp->t_pktlist_sentlen += len;
2094 tp->t_lastchain++;
2095
2096 #if INET6
2097 if (isipv6) {
2098 DTRACE_TCP5(send, struct mbuf *, m, struct inpcb *, inp,
2099 struct ip6 *, ip6, struct tcpcb *, tp, struct tcphdr *,
2100 th);
2101 } else
2102 #endif /* INET6 */
2103 {
2104 DTRACE_TCP5(send, struct mbuf *, m, struct inpcb *, inp,
2105 struct ip *, ip, struct tcpcb *, tp, struct tcphdr *, th);
2106 }
2107
2108 if (tp->t_pktlist_head != NULL) {
2109 tp->t_pktlist_tail->m_nextpkt = m;
2110 tp->t_pktlist_tail = m;
2111 } else {
2112 packchain_newlist++;
2113 tp->t_pktlist_head = tp->t_pktlist_tail = m;
2114 }
2115
2116 if ((lro_ackmore) && (!sackoptlen) && (!tp->t_timer[TCPT_PERSIST]) &&
2117 ((th->th_flags & TH_ACK) == TH_ACK) && (!len) &&
2118 (tp->t_state == TCPS_ESTABLISHED)) {
2119 /* For a pure ACK, see if you need to send more of them */
2120 mnext = tcp_send_lroacks(tp, m, th);
2121 if (mnext) {
2122 tp->t_pktlist_tail->m_nextpkt = mnext;
2123 if (mnext->m_nextpkt == NULL) {
2124 tp->t_pktlist_tail = mnext;
2125 tp->t_lastchain++;
2126 } else {
2127 struct mbuf *tail, *next;
2128 next = mnext->m_nextpkt;
2129 tail = next->m_nextpkt;
2130 while (tail) {
2131 next = tail;
2132 tail = tail->m_nextpkt;
2133 tp->t_lastchain++;
2134 }
2135 tp->t_pktlist_tail = next;
2136 }
2137 }
2138 }
2139
2140 if (sendalot == 0 || (tp->t_state != TCPS_ESTABLISHED) ||
2141 (tp->snd_cwnd <= (tp->snd_wnd / 8)) ||
2142 (tp->t_flags & (TH_PUSH | TF_ACKNOW)) ||
2143 (tp->t_flagsext & TF_FORCE) ||
2144 tp->t_lastchain >= tcp_packet_chaining) {
2145 error = 0;
2146 while (inp->inp_sndinprog_cnt == 0 &&
2147 tp->t_pktlist_head != NULL) {
2148 packetlist = tp->t_pktlist_head;
2149 packchain_listadd = tp->t_lastchain;
2150 packchain_sent++;
2151 lost = tp->t_pktlist_sentlen;
2152 TCP_PKTLIST_CLEAR(tp);
2153
2154 error = tcp_ip_output(so, tp, packetlist,
2155 packchain_listadd, tp_inp_options,
2156 (so_options & SO_DONTROUTE),
2157 (sack_rxmit | (sack_bytes_rxmt != 0)), recwin,
2158 #if INET6
2159 isipv6);
2160 #else /* INET6 */
2161 0);
2162 #endif /* !INET6 */
2163 if (error) {
2164 /*
2165 * Take into account the rest of unsent
2166 * packets in the packet list for this tcp
2167 * into "lost", since we're about to free
2168 * the whole list below.
2169 */
2170 lost += tp->t_pktlist_sentlen;
2171 break;
2172 } else {
2173 lost = 0;
2174 }
2175 }
2176 /* tcp was closed while we were in ip; resume close */
2177 if (inp->inp_sndinprog_cnt == 0 &&
2178 (tp->t_flags & TF_CLOSING)) {
2179 tp->t_flags &= ~TF_CLOSING;
2180 (void) tcp_close(tp);
2181 return (0);
2182 }
2183 } else {
2184 error = 0;
2185 packchain_looped++;
2186 tcpstat.tcps_sndtotal++;
2187
2188 goto again;
2189 }
2190 if (error) {
2191 /*
2192 * Assume that the packets were lost, so back out the
2193 * sequence number advance, if any. Note that the "lost"
2194 * variable represents the amount of user data sent during
2195 * the recent call to ip_output_list() plus the amount of
2196 * user data in the packet list for this tcp at the moment.
2197 */
2198 if (!(tp->t_flagsext & TF_FORCE)
2199 || tp->t_timer[TCPT_PERSIST] == 0) {
2200 /*
2201 * No need to check for TH_FIN here because
2202 * the TF_SENTFIN flag handles that case.
2203 */
2204 if ((flags & TH_SYN) == 0) {
2205 if (sack_rxmit) {
2206 if (SEQ_GT((p->rxmit - lost),
2207 tp->snd_una)) {
2208 p->rxmit -= lost;
2209 } else {
2210 lost = p->rxmit - tp->snd_una;
2211 p->rxmit = tp->snd_una;
2212 }
2213 tp->sackhint.sack_bytes_rexmit -= lost;
2214 } else {
2215 if (SEQ_GT((tp->snd_nxt - lost),
2216 tp->snd_una))
2217 tp->snd_nxt -= lost;
2218 else
2219 tp->snd_nxt = tp->snd_una;
2220 }
2221 }
2222 }
2223 out:
2224 if (tp->t_pktlist_head != NULL)
2225 m_freem_list(tp->t_pktlist_head);
2226 TCP_PKTLIST_CLEAR(tp);
2227
2228 if (error == ENOBUFS) {
2229 if (!tp->t_timer[TCPT_REXMT] &&
2230 !tp->t_timer[TCPT_PERSIST])
2231 tp->t_timer[TCPT_REXMT] =
2232 OFFSET_FROM_START(tp, tp->t_rxtcur);
2233 tp->snd_cwnd = tp->t_maxseg;
2234 tp->t_bytes_acked = 0;
2235 tcp_check_timer_state(tp);
2236 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
2237
2238 tcp_ccdbg_trace(tp, NULL, TCP_CC_OUTPUT_ERROR);
2239 return (0);
2240 }
2241 if (error == EMSGSIZE) {
2242 /*
2243 * ip_output() will have already fixed the route
2244 * for us. tcp_mtudisc() will, as its last action,
2245 * initiate retransmission, so it is important to
2246 * not do so here.
2247 *
2248 * If TSO was active we either got an interface
2249 * without TSO capabilities or TSO was turned off.
2250 * Disable it for this connection too and
2251 * immediately retry with MSS sized segments generated
2252 * by this function.
2253 */
2254 if (tso)
2255 tp->t_flags &= ~TF_TSO;
2256
2257 tcp_mtudisc(inp, 0);
2258 tcp_check_timer_state(tp);
2259
2260 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
2261 return 0;
2262 }
2263 /*
2264 * Unless this is due to interface restriction policy,
2265 * treat EHOSTUNREACH/ENETDOWN as a soft error.
2266 */
2267 if ((error == EHOSTUNREACH || error == ENETDOWN) &&
2268 TCPS_HAVERCVDSYN(tp->t_state) &&
2269 !inp_restricted_send(inp, inp->inp_last_outifp)) {
2270 tp->t_softerror = error;
2271 error = 0;
2272 }
2273 tcp_check_timer_state(tp);
2274 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
2275 return (error);
2276 }
2277
2278 tcpstat.tcps_sndtotal++;
2279
2280 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END,0,0,0,0,0);
2281 if (sendalot)
2282 goto again;
2283
2284 tcp_check_timer_state(tp);
2285 return (0);
2286 }
2287
2288 static int
2289 tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt,
2290 int cnt, struct mbuf *opt, int flags, int sack_in_progress, int recwin,
2291 boolean_t isipv6)
2292 {
2293 int error = 0;
2294 boolean_t chain;
2295 boolean_t unlocked = FALSE;
2296 boolean_t ifdenied = FALSE;
2297 struct inpcb *inp = tp->t_inpcb;
2298 struct ip_out_args ipoa =
2299 { IFSCOPE_NONE, { 0 }, IPOAF_SELECT_SRCIF|IPOAF_BOUND_SRCADDR, 0 };
2300 struct route ro;
2301 struct ifnet *outif = NULL;
2302 #if INET6
2303 struct ip6_out_args ip6oa =
2304 { IFSCOPE_NONE, { 0 }, IP6OAF_SELECT_SRCIF|IP6OAF_BOUND_SRCADDR, 0 };
2305 struct route_in6 ro6;
2306 struct flowadv *adv =
2307 (isipv6 ? &ip6oa.ip6oa_flowadv : &ipoa.ipoa_flowadv);
2308 #else /* INET6 */
2309 struct flowadv *adv = &ipoa.ipoa_flowadv;
2310 #endif /* !INET6 */
2311
2312 /* If socket was bound to an ifindex, tell ip_output about it */
2313 if (inp->inp_flags & INP_BOUND_IF) {
2314 #if INET6
2315 if (isipv6) {
2316 ip6oa.ip6oa_boundif = inp->inp_boundifp->if_index;
2317 ip6oa.ip6oa_flags |= IP6OAF_BOUND_IF;
2318 } else
2319 #endif /* INET6 */
2320 {
2321 ipoa.ipoa_boundif = inp->inp_boundifp->if_index;
2322 ipoa.ipoa_flags |= IPOAF_BOUND_IF;
2323 }
2324 }
2325
2326 if (INP_NO_CELLULAR(inp)) {
2327 #if INET6
2328 if (isipv6)
2329 ip6oa.ip6oa_flags |= IP6OAF_NO_CELLULAR;
2330 else
2331 #endif /* INET6 */
2332 ipoa.ipoa_flags |= IPOAF_NO_CELLULAR;
2333 }
2334 if (INP_NO_EXPENSIVE(inp)) {
2335 #if INET6
2336 if (isipv6)
2337 ip6oa.ip6oa_flags |= IP6OAF_NO_EXPENSIVE;
2338 else
2339 #endif /* INET6 */
2340 ipoa.ipoa_flags |= IPOAF_NO_EXPENSIVE;
2341
2342 }
2343 if (INP_AWDL_UNRESTRICTED(inp)) {
2344 #if INET6
2345 if (isipv6)
2346 ip6oa.ip6oa_flags |= IP6OAF_AWDL_UNRESTRICTED;
2347 else
2348 #endif /* INET6 */
2349 ipoa.ipoa_flags |= IPOAF_AWDL_UNRESTRICTED;
2350
2351 }
2352 #if INET6
2353 if (isipv6)
2354 flags |= IPV6_OUTARGS;
2355 else
2356 #endif /* INET6 */
2357 flags |= IP_OUTARGS;
2358
2359 /* Copy the cached route and take an extra reference */
2360 #if INET6
2361 if (isipv6)
2362 in6p_route_copyout(inp, &ro6);
2363 else
2364 #endif /* INET6 */
2365 inp_route_copyout(inp, &ro);
2366
2367 /*
2368 * Data sent (as far as we can tell).
2369 * If this advertises a larger window than any other segment,
2370 * then remember the size of the advertised window.
2371 * Make sure ACK/DELACK conditions are cleared before
2372 * we unlock the socket.
2373 */
2374 if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
2375 tp->rcv_adv = tp->rcv_nxt + recwin;
2376 tp->last_ack_sent = tp->rcv_nxt;
2377 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
2378 tp->t_timer[TCPT_DELACK] = 0;
2379 tp->t_unacksegs = 0;
2380
2381 /* Increment the count of outstanding send operations */
2382 inp->inp_sndinprog_cnt++;
2383
2384 /*
2385 * If allowed, unlock TCP socket while in IP
2386 * but only if the connection is established and
2387 * in a normal mode where reentrancy on the tcpcb won't be
2388 * an issue:
2389 * - there is no SACK episode
2390 * - we're not in Fast Recovery mode
2391 * - if we're not sending from an upcall.
2392 */
2393 if (tcp_output_unlocked && !so->so_upcallusecount &&
2394 (tp->t_state == TCPS_ESTABLISHED) && (sack_in_progress == 0) &&
2395 !IN_FASTRECOVERY(tp)) {
2396
2397 unlocked = TRUE;
2398 socket_unlock(so, 0);
2399 }
2400
2401 /*
2402 * Don't send down a chain of packets when:
2403 * - TCP chaining is disabled
2404 * - there is an IPsec rule set
2405 * - there is a non default rule set for the firewall
2406 */
2407
2408 chain = tcp_packet_chaining > 1
2409 #if IPSEC
2410 && ipsec_bypass
2411 #endif
2412 #if IPFIREWALL
2413 && (fw_enable == 0 || fw_bypass)
2414 #endif
2415 ; // I'm important, not extraneous
2416
2417
2418 while (pkt != NULL) {
2419 struct mbuf *npkt = pkt->m_nextpkt;
2420
2421 if (!chain) {
2422 pkt->m_nextpkt = NULL;
2423 /*
2424 * If we are not chaining, make sure to set the packet
2425 * list count to 0 so that IP takes the right path;
2426 * this is important for cases such as IPSec where a
2427 * single mbuf might result in multiple mbufs as part
2428 * of the encapsulation. If a non-zero count is passed
2429 * down to IP, the head of the chain might change and
2430 * we could end up skipping it (thus generating bogus
2431 * packets). Fixing it in IP would be desirable, but
2432 * for now this would do it.
2433 */
2434 cnt = 0;
2435 }
2436 #if INET6
2437 if (isipv6) {
2438 error = ip6_output_list(pkt, cnt,
2439 inp->in6p_outputopts, &ro6, flags, NULL, NULL,
2440 &ip6oa);
2441 ifdenied = (ip6oa.ip6oa_retflags & IP6OARF_IFDENIED);
2442 } else {
2443 #endif /* INET6 */
2444 error = ip_output_list(pkt, cnt, opt, &ro, flags, NULL,
2445 &ipoa);
2446 ifdenied = (ipoa.ipoa_retflags & IPOARF_IFDENIED);
2447 }
2448
2449 if (chain || error) {
2450 /*
2451 * If we sent down a chain then we are done since
2452 * the callee had taken care of everything; else
2453 * we need to free the rest of the chain ourselves.
2454 */
2455 if (!chain)
2456 m_freem_list(npkt);
2457 break;
2458 }
2459 pkt = npkt;
2460 }
2461
2462 if (unlocked)
2463 socket_lock(so, 0);
2464
2465 /*
2466 * Enter flow controlled state if the connection is established
2467 * and is not in recovery.
2468 *
2469 * A connection will enter suspended state even if it is in
2470 * recovery.
2471 */
2472 if (((adv->code == FADV_FLOW_CONTROLLED && !IN_FASTRECOVERY(tp)) ||
2473 adv->code == FADV_SUSPENDED) &&
2474 !(tp->t_flags & TF_CLOSING) &&
2475 tp->t_state == TCPS_ESTABLISHED) {
2476 int rc;
2477 rc = inp_set_fc_state(inp, adv->code);
2478
2479 if (rc == 1)
2480 tcp_ccdbg_trace(tp, NULL,
2481 ((adv->code == FADV_FLOW_CONTROLLED) ?
2482 TCP_CC_FLOW_CONTROL : TCP_CC_SUSPEND));
2483 }
2484
2485 /*
2486 * When an interface queue gets suspended, some of the
2487 * packets are dropped. Return ENOBUFS, to update the
2488 * pcb state.
2489 */
2490 if (adv->code == FADV_SUSPENDED)
2491 error = ENOBUFS;
2492
2493 VERIFY(inp->inp_sndinprog_cnt > 0);
2494 if ( --inp->inp_sndinprog_cnt == 0)
2495 inp->inp_flags &= ~(INP_FC_FEEDBACK);
2496
2497 #if INET6
2498 if (isipv6) {
2499 if (ro6.ro_rt != NULL && (outif = ro6.ro_rt->rt_ifp) !=
2500 inp->in6p_last_outifp)
2501 inp->in6p_last_outifp = outif;
2502 } else
2503 #endif /* INET6 */
2504 if (ro.ro_rt != NULL && (outif = ro.ro_rt->rt_ifp) !=
2505 inp->inp_last_outifp)
2506 inp->inp_last_outifp = outif;
2507
2508 if (error != 0 && ifdenied &&
2509 (INP_NO_CELLULAR(inp) || INP_NO_EXPENSIVE(inp)))
2510 soevent(inp->inp_socket,
2511 (SO_FILT_HINT_LOCKED|SO_FILT_HINT_IFDENIED));
2512
2513 /* Synchronize cached PCB route & options */
2514 #if INET6
2515 if (isipv6)
2516 in6p_route_copyin(inp, &ro6);
2517 else
2518 #endif /* INET6 */
2519 inp_route_copyin(inp, &ro);
2520
2521 if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift == 0 &&
2522 tp->t_inpcb->inp_route.ro_rt != NULL) {
2523 /* If we found the route and there is an rtt on it
2524 * reset the retransmit timer
2525 */
2526 tcp_getrt_rtt(tp, tp->t_inpcb->in6p_route.ro_rt);
2527 tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);
2528 }
2529 return (error);
2530 }
2531
2532 void
2533 tcp_setpersist(tp)
2534 register struct tcpcb *tp;
2535 {
2536 int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
2537
2538 /* If a PERSIST_TIMER option was set we will limit the
2539 * time the persist timer will be active for that connection
2540 * in order to avoid DOS by using zero window probes.
2541 * see rdar://5805356
2542 */
2543
2544 if ((tp->t_persist_timeout != 0) &&
2545 (tp->t_timer[TCPT_PERSIST] == 0) &&
2546 (tp->t_persist_stop == 0)) {
2547 tp->t_persist_stop = tcp_now + tp->t_persist_timeout;
2548 }
2549
2550 /*
2551 * Start/restart persistance timer.
2552 */
2553 TCPT_RANGESET(tp->t_timer[TCPT_PERSIST],
2554 t * tcp_backoff[tp->t_rxtshift],
2555 TCPTV_PERSMIN, TCPTV_PERSMAX, 0);
2556 tp->t_timer[TCPT_PERSIST] = OFFSET_FROM_START(tp, tp->t_timer[TCPT_PERSIST]);
2557
2558 if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
2559 tp->t_rxtshift++;
2560 }
2561
2562 /*
2563 * Send as many acks as data coalesced. Every other packet when stretch
2564 * ACK is not enabled. Every 8 packets, if stretch ACK is enabled.
2565 */
2566 static struct mbuf*
2567 tcp_send_lroacks(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th)
2568 {
2569 struct mbuf *mnext = NULL, *ack_chain = NULL, *tail = NULL;
2570 int count = 0;
2571 tcp_seq org_ack = ntohl(th->th_ack);
2572 tcp_seq prev_ack = 0;
2573 int tack_offset = 28; /* XXX IPv6 and IP options not supported */
2574 int twin_offset = 34; /* XXX IPv6 and IP options not supported */
2575 int ack_size = (tp->t_flags & TF_STRETCHACK) ?
2576 (maxseg_unacked * tp->t_maxseg) : (tp->t_maxseg << 1);
2577 int segs_acked = (tp->t_flags & TF_STRETCHACK) ? maxseg_unacked : 2;
2578 struct mbuf *prev_ack_pkt = NULL;
2579 struct socket *so = tp->t_inpcb->inp_socket;
2580 unsigned short winsz = ntohs(th->th_win);
2581 unsigned int scaled_win = winsz<<tp->rcv_scale;
2582 tcp_seq win_rtedge = org_ack + scaled_win;
2583
2584 count = tp->t_lropktlen/tp->t_maxseg;
2585
2586 prev_ack = (org_ack - tp->t_lropktlen) + ack_size;
2587 if (prev_ack < org_ack) {
2588 ack_chain = m_dup(m, M_DONTWAIT);
2589 if (ack_chain) {
2590 th->th_ack = htonl(prev_ack);
2591 /* Keep adv window constant for duplicated ACK packets */
2592 scaled_win = win_rtedge - prev_ack;
2593 if (scaled_win > (int32_t)(TCP_MAXWIN << tp->rcv_scale))
2594 scaled_win = (int32_t)(TCP_MAXWIN << tp->rcv_scale);
2595 th->th_win = htons(scaled_win>>tp->rcv_scale);
2596 if (lrodebug == 5) {
2597 printf("%s: win = %d winsz = %d sc = %d"
2598 " lro_len %d %d\n",
2599 __func__, scaled_win>>tp->rcv_scale, winsz,
2600 tp->rcv_scale, tp->t_lropktlen, count);
2601 }
2602 tail = ack_chain;
2603 count -= segs_acked; /* accounts for prev_ack packet */
2604 count = (count <= segs_acked) ? 0 : count - segs_acked;
2605 tcpstat.tcps_sndacks++;
2606 so_tc_update_stats(m, so, m_get_service_class(m));
2607 } else {
2608 return NULL;
2609 }
2610 }
2611 else {
2612 tp->t_lropktlen = 0;
2613 return NULL;
2614 }
2615
2616 prev_ack_pkt = ack_chain;
2617
2618 while (count > 0) {
2619 if ((prev_ack + ack_size) < org_ack) {
2620 prev_ack += ack_size;
2621 } else {
2622 /*
2623 * The last ACK sent must have the ACK number that TCP
2624 * thinks is the last sent ACK number.
2625 */
2626 prev_ack = org_ack;
2627 }
2628 mnext = m_dup(prev_ack_pkt, M_DONTWAIT);
2629 if (mnext) {
2630 /* Keep adv window constant for duplicated ACK packets */
2631 scaled_win = win_rtedge - prev_ack;
2632 if (scaled_win > (int32_t)(TCP_MAXWIN << tp->rcv_scale))
2633 scaled_win = (int32_t)(TCP_MAXWIN << tp->rcv_scale);
2634 winsz = htons(scaled_win>>tp->rcv_scale);
2635 if (lrodebug == 5) {
2636 printf("%s: winsz = %d ack %x count %d\n",
2637 __func__, scaled_win>>tp->rcv_scale,
2638 prev_ack, count);
2639 }
2640 bcopy(&winsz, mtod(prev_ack_pkt, caddr_t) + twin_offset, 2);
2641 HTONL(prev_ack);
2642 bcopy(&prev_ack, mtod(prev_ack_pkt, caddr_t) + tack_offset, 4);
2643 NTOHL(prev_ack);
2644 tail->m_nextpkt = mnext;
2645 tail = mnext;
2646 count -= segs_acked;
2647 tcpstat.tcps_sndacks++;
2648 so_tc_update_stats(m, so, m_get_service_class(m));
2649 } else {
2650 if (lrodebug == 5) {
2651 printf("%s: failed to alloc mbuf.\n", __func__);
2652 }
2653 break;
2654 }
2655 prev_ack_pkt = mnext;
2656 }
2657 tp->t_lropktlen = 0;
2658 return ack_chain;
2659 }
2660
2661 static int
2662 tcp_recv_throttle (struct tcpcb *tp)
2663 {
2664 uint32_t base_rtt, newsize;
2665 int32_t qdelay;
2666 struct sockbuf *sbrcv = &tp->t_inpcb->inp_socket->so_rcv;
2667
2668 if (tcp_use_rtt_recvbg == 1 &&
2669 TSTMP_SUPPORTED(tp)) {
2670 /*
2671 * Timestamps are supported on this connection. Use
2672 * RTT to look for an increase in latency.
2673 */
2674
2675 /*
2676 * If the connection is already being throttled, leave it
2677 * in that state until rtt comes closer to base rtt
2678 */
2679 if (tp->t_flagsext & TF_RECV_THROTTLE)
2680 return (1);
2681
2682 base_rtt = get_base_rtt(tp);
2683
2684 if (base_rtt != 0 && tp->t_rttcur != 0) {
2685 qdelay = tp->t_rttcur - base_rtt;
2686 /*
2687 * if latency increased on a background flow,
2688 * return 1 to start throttling.
2689 */
2690 if (qdelay > target_qdelay) {
2691 tp->t_flagsext |= TF_RECV_THROTTLE;
2692
2693 /*
2694 * Reduce the recv socket buffer size to
2695 * minimize latecy.
2696 */
2697 if (sbrcv->sb_idealsize >
2698 tcp_recv_throttle_minwin) {
2699 newsize = sbrcv->sb_idealsize >> 1;
2700 /* Set a minimum of 16 K */
2701 newsize =
2702 max(newsize,
2703 tcp_recv_throttle_minwin);
2704 sbrcv->sb_idealsize = newsize;
2705 }
2706 return (1);
2707 } else {
2708 return (0);
2709 }
2710 }
2711 }
2712
2713 /*
2714 * Timestamps are not supported or there is no good RTT
2715 * measurement. Use IPDV in this case.
2716 */
2717 if (tp->acc_iaj > tcp_acc_iaj_react_limit)
2718 return (1);
2719
2720 return (0);
2721 }