/*
 * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_output.c	8.4 (Berkeley) 5/24/95
 * $FreeBSD: src/sys/netinet/tcp_output.c,v 1.39.2.10 2001/07/07 04:30:38 silby Exp $
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */

#define _IP_VHL


#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>

#include <net/route.h>
#include <net/ntstat.h>
#include <net/if_var.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/dlil.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/in_tclass.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <mach/sdt.h>
#if INET6
#include <netinet6/in6_pcb.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#endif
#include <netinet/tcp.h>
#define TCPOUTFLAGS
#include <netinet/tcp_cache.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_cc.h>
#if TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
#include <sys/kdebug.h>
#include <mach/sdt.h>

#if IPSEC
#include <netinet6/ipsec.h>
#endif /*IPSEC*/

#if CONFIG_MACF_NET
#include <security/mac_framework.h>
#endif /* CONFIG_MACF_NET */

#include <netinet/lro_ext.h>
#if MPTCP
#include <netinet/mptcp_var.h>
#include <netinet/mptcp.h>
#include <netinet/mptcp_opt.h>
#endif

#include <corecrypto/ccaes.h>

#define DBG_LAYER_BEG		NETDBG_CODE(DBG_NETTCP, 1)
#define DBG_LAYER_END		NETDBG_CODE(DBG_NETTCP, 3)
#define DBG_FNC_TCP_OUTPUT	NETDBG_CODE(DBG_NETTCP, (4 << 8) | 1)

int path_mtu_discovery = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery,
	CTLFLAG_RW | CTLFLAG_LOCKED, &path_mtu_discovery, 1,
	"Enable Path MTU Discovery");

int ss_fltsz = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize,
	CTLFLAG_RW | CTLFLAG_LOCKED, &ss_fltsz, 1,
	"Slow start flight size");

int ss_fltsz_local = 8; /* starts with eight segments max */
SYSCTL_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize,
	CTLFLAG_RW | CTLFLAG_LOCKED, &ss_fltsz_local, 1,
	"Slow start flight size for local networks");

int tcp_do_tso = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW | CTLFLAG_LOCKED,
	&tcp_do_tso, 0, "Enable TCP Segmentation Offload");

int tcp_ecn_setup_percentage = 50;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, ecn_setup_percentage,
	CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_ecn_setup_percentage, 0,
	"Max ECN setup percentage");

static int
sysctl_change_ecn_setting SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int i, err = 0, changed = 0;
	struct ifnet *ifp;

	err = sysctl_io_number(req, tcp_ecn_outbound, sizeof(int32_t),
	    &i, &changed);
	if (err != 0 || req->newptr == USER_ADDR_NULL)
		return(err);

	if (changed) {
		if ((tcp_ecn_outbound == 0 || tcp_ecn_outbound == 1) &&
		    (i == 0 || i == 1)) {
			tcp_ecn_outbound = i;
			return(err);
		}
		if (tcp_ecn_outbound == 2 && (i == 0 || i == 1)) {
			/*
			 * Reset ECN enable flags on non-cellular
			 * interfaces so that the system default will take
			 * over
			 */
			ifnet_head_lock_shared();
			TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
				if (!IFNET_IS_CELLULAR(ifp)) {
					ifnet_lock_exclusive(ifp);
					ifp->if_eflags &= ~IFEF_ECN_DISABLE;
					ifp->if_eflags &= ~IFEF_ECN_ENABLE;
					ifnet_lock_done(ifp);
				}
			}
			ifnet_head_done();
		} else {
			/*
			 * Set ECN enable flags on non-cellular
			 * interfaces
			 */
			ifnet_head_lock_shared();
			TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
				if (!IFNET_IS_CELLULAR(ifp)) {
					ifnet_lock_exclusive(ifp);
					ifp->if_eflags |= IFEF_ECN_ENABLE;
					ifp->if_eflags &= ~IFEF_ECN_DISABLE;
					ifnet_lock_done(ifp);
				}
			}
			ifnet_head_done();
		}
		tcp_ecn_outbound = i;
	}
	/* Change the other one too as the work is done */
	if (i == 2 || tcp_ecn_inbound == 2)
		tcp_ecn_inbound = i;
	return (err);
}
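
/*
 * Illustrative note on the handler above: the ECN-mode sysctls accept
 * 0 (off), 1 (on) and 2 (system default, subject to heuristics).  A
 * hypothetical invocation forcing outbound ECN on would be:
 *
 *	sysctl -w net.inet.tcp.ecn_initiate_out=1
 *
 * Moving from mode 2 to an explicit 0/1 clears the per-interface
 * IFEF_ECN_ENABLE/IFEF_ECN_DISABLE flags on non-cellular interfaces,
 * while selecting mode 2 sets IFEF_ECN_ENABLE on them, so the chosen
 * policy takes effect uniformly.
 */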

int tcp_ecn_outbound = 2;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, ecn_initiate_out,
	CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_ecn_outbound, 0,
	sysctl_change_ecn_setting, "IU",
	"Initiate ECN for outbound connections");

int tcp_ecn_inbound = 2;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, ecn_negotiate_in,
	CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_ecn_inbound, 0,
	sysctl_change_ecn_setting, "IU",
	"Initiate ECN for inbound connections");

int tcp_packet_chaining = 50;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, packetchain,
	CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_packet_chaining, 0,
	"Enable TCP output packet chaining");

int tcp_output_unlocked = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, socket_unlocked_on_output,
	CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_output_unlocked, 0,
	"Unlock TCP when sending packets down to IP");

int tcp_do_rfc3390 = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390,
	CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_rfc3390, 1,
246 "Calculate intial slowstart cwnd depending on MSS");

int tcp_min_iaj_win = MIN_IAJ_WIN;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, min_iaj_win,
	CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_min_iaj_win, 1,
	"Minimum recv win based on inter-packet arrival jitter");

int tcp_acc_iaj_react_limit = ACC_IAJ_REACT_LIMIT;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, acc_iaj_react_limit,
	CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_acc_iaj_react_limit, 1,
	"Accumulated IAJ when receiver starts to react");

uint32_t tcp_do_autosendbuf = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, doautosndbuf,
	CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_autosendbuf, 1,
	"Enable send socket buffer auto-tuning");

uint32_t tcp_autosndbuf_inc = 8 * 1024;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, autosndbufinc,
	CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_autosndbuf_inc, 1,
266 "Increment in send socket bufffer size");

uint32_t tcp_autosndbuf_max = 512 * 1024;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, autosndbufmax,
	CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_autosndbuf_max, 1,
	"Maximum send socket buffer size");

uint32_t tcp_prioritize_acks = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, ack_prioritize,
	CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_prioritize_acks, 1,
	"Prioritize pure acks");

uint32_t tcp_use_rtt_recvbg = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rtt_recvbg,
	CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_use_rtt_recvbg, 1,
	"Use RTT for bg recv algorithm");

uint32_t tcp_recv_throttle_minwin = 16 * 1024;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, recv_throttle_minwin,
	CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_recv_throttle_minwin, 1,
	"Minimum recv win for throttling");

int32_t tcp_enable_tlp = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, enable_tlp,
	CTLFLAG_RW | CTLFLAG_LOCKED,
	&tcp_enable_tlp, 1, "Enable Tail loss probe");

static int32_t packchain_newlist = 0;
static int32_t packchain_looped = 0;
static int32_t packchain_sent = 0;

/* temporary: for testing */
#if IPSEC
extern int ipsec_bypass;
#endif

extern int slowlink_wsize;	/* window correction for slow links */
#if IPFIREWALL
extern int fw_enable;		/* firewall check for packet chaining */
extern int fw_bypass;		/* firewall check: disable packet chaining if there are rules */
#endif /* IPFIREWALL */

extern u_int32_t dlil_filter_disable_tso_count;
extern u_int32_t kipf_count;
extern int tcp_recv_bg;

static int tcp_ip_output(struct socket *, struct tcpcb *, struct mbuf *, int,
    struct mbuf *, int, int, int32_t, boolean_t);
static struct mbuf* tcp_send_lroacks(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th);
static int tcp_recv_throttle(struct tcpcb *tp);

static int32_t tcp_tfo_check(struct tcpcb *tp, int32_t len)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	unsigned int optlen = 0;
	unsigned int cookie_len;

	if (tp->t_flags & TF_NOOPT)
		goto fallback;

	if (!tcp_heuristic_do_tfo(tp)) {
		tp->t_tfo_stats |= TFO_S_HEURISTICS_DISABLE;
		tcpstat.tcps_tfo_heuristics_disable++;
		goto fallback;
	}

	optlen += TCPOLEN_MAXSEG;

	if (tp->t_flags & TF_REQ_SCALE)
		optlen += 4;

#if MPTCP
	if ((so->so_flags & SOF_MP_SUBFLOW) && mptcp_enable &&
	    tp->t_rxtshift <= mptcp_mpcap_retries)
		optlen += sizeof(struct mptcp_mpcapable_opt_common) + sizeof(mptcp_key_t);
#endif /* MPTCP */

	if (tp->t_flags & TF_REQ_TSTMP)
		optlen += TCPOLEN_TSTAMP_APPA;

	if (SACK_ENABLED(tp))
		optlen += TCPOLEN_SACK_PERMITTED;

	/* Now, decide whether to use TFO or not */

	/* Don't even bother trying if there is no space at all... */
	if (MAX_TCPOPTLEN - optlen < TCPOLEN_FASTOPEN_REQ)
		goto fallback;

	cookie_len = tcp_cache_get_cookie_len(tp);
	if (cookie_len == 0)
		/* No cookie, so we request one */
		return (0);

	/* Do not send SYN+data if there is more in the queue than MSS */
	if (so->so_snd.sb_cc > (tp->t_maxopd - MAX_TCPOPTLEN))
		goto fallback;

	/* Ok, everything looks good. We can go on and do TFO */
	return (len);

fallback:
	tp->t_flagsext &= ~TF_FASTOPEN;
	return (0);
}
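
/*
 * Worked example for the option-space accounting above (illustrative,
 * assuming the usual option sizes): a SYN carrying MSS
 * (TCPOLEN_MAXSEG = 4), window scale (4 including its NOP),
 * timestamps (TCPOLEN_TSTAMP_APPA = 12) and SACK-permitted
 * (TCPOLEN_SACK_PERMITTED = 2) uses 22 of the 40 bytes of TCP option
 * space, so MAX_TCPOPTLEN - optlen = 18 and the TCPOLEN_FASTOPEN_REQ
 * test passes with room to spare for a cookie.
 */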

/* Returns the number of bytes written to the TCP option-space */
static unsigned
tcp_tfo_write_cookie_rep(struct tcpcb *tp, unsigned optlen, u_char *opt)
{
	u_char out[CCAES_BLOCK_SIZE];
	unsigned ret = 0;
	u_char *bp;

	if ((MAX_TCPOPTLEN - optlen) <
	    (TCPOLEN_FASTOPEN_REQ + TFO_COOKIE_LEN_DEFAULT))
		return (ret);

	tcp_tfo_gen_cookie(tp->t_inpcb, out, sizeof(out));

	bp = opt + optlen;

	*bp++ = TCPOPT_FASTOPEN;
	*bp++ = 2 + TFO_COOKIE_LEN_DEFAULT;
	memcpy(bp, out, TFO_COOKIE_LEN_DEFAULT);
	ret += 2 + TFO_COOKIE_LEN_DEFAULT;

	tp->t_tfo_stats |= TFO_S_COOKIE_SENT;
	tcpstat.tcps_tfo_cookie_sent++;

	return (ret);
}
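
/*
 * For reference, the cookie reply written above is the TCP Fast Open
 * option of RFC 7413 (kind 34):
 *
 *	+---------+-------------+----------------------------+
 *	| kind=34 | len = 2 + L |     cookie (L bytes)       |
 *	+---------+-------------+----------------------------+
 *
 * with L = TFO_COOKIE_LEN_DEFAULT; the kind/length pair accounts for
 * the "2 +" in both the length byte and the returned option size.
 */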

static unsigned
tcp_tfo_write_cookie(struct tcpcb *tp, unsigned optlen, int32_t *len,
    u_char *opt)
{
	u_int8_t tfo_len = MAX_TCPOPTLEN - optlen - TCPOLEN_FASTOPEN_REQ;
	unsigned ret = 0;
	int res;
	u_char *bp;

	bp = opt + optlen;

	/*
	 * The cookie will be copied in the appropriate place within the
	 * TCP-option space. That way we avoid the need for an intermediate
	 * variable.
	 */
	res = tcp_cache_get_cookie(tp, bp + TCPOLEN_FASTOPEN_REQ, &tfo_len);
	if (res == 0) {
		*bp++ = TCPOPT_FASTOPEN;
		*bp++ = TCPOLEN_FASTOPEN_REQ;
		ret += TCPOLEN_FASTOPEN_REQ;

		tp->t_tfo_flags |= TFO_F_COOKIE_REQ;

		tp->t_tfo_stats |= TFO_S_COOKIE_REQ;
		tcpstat.tcps_tfo_cookie_req++;
	} else {
		*bp++ = TCPOPT_FASTOPEN;
		*bp++ = TCPOLEN_FASTOPEN_REQ + tfo_len;

		ret += TCPOLEN_FASTOPEN_REQ + tfo_len;

		tp->t_tfo_flags |= TFO_F_COOKIE_SENT;

		/* If there is some data, let's track it */
		if (*len) {
			tp->t_tfo_stats |= TFO_S_SYN_DATA_SENT;
			tcpstat.tcps_tfo_syn_data_sent++;
		}
	}

	return (ret);
}

static inline bool
tcp_send_ecn_flags_on_syn(struct tcpcb *tp, struct socket *so)
{
	return(!((tp->ecn_flags & TE_SETUPSENT) ||
	    (so->so_flags & SOF_MP_SUBFLOW) ||
	    (tp->t_flagsext & TF_FASTOPEN)));
}

void
tcp_set_ecn(struct tcpcb *tp, struct ifnet *ifp)
{
	boolean_t inbound;

	/*
	 * Socket option has precedence
	 */
	if (tp->ecn_flags & TE_ECN_MODE_ENABLE) {
		tp->ecn_flags |= TE_ENABLE_ECN;
		goto check_heuristic;
	}

	if (tp->ecn_flags & TE_ECN_MODE_DISABLE) {
		tp->ecn_flags &= ~TE_ENABLE_ECN;
		return;
	}
	/*
	 * Per interface setting comes next
	 */
	if (ifp != NULL) {
		if (ifp->if_eflags & IFEF_ECN_ENABLE) {
			tp->ecn_flags |= TE_ENABLE_ECN;
			goto check_heuristic;
		}

		if (ifp->if_eflags & IFEF_ECN_DISABLE) {
			tp->ecn_flags &= ~TE_ENABLE_ECN;
			return;
		}
	}
	/*
	 * System wide settings come last
	 */
	inbound = (tp->t_inpcb->inp_socket->so_head != NULL);
	if ((inbound && tcp_ecn_inbound == 1) ||
	    (!inbound && tcp_ecn_outbound == 1)) {
		tp->ecn_flags |= TE_ENABLE_ECN;
		goto check_heuristic;
	} else {
		tp->ecn_flags &= ~TE_ENABLE_ECN;
	}

	return;

check_heuristic:
	if (!tcp_heuristic_do_ecn(tp))
		tp->ecn_flags &= ~TE_ENABLE_ECN;

	/*
	 * If the interface setting, system-level setting and heuristics
	 * allow ECN to be enabled, randomly select a percentage of
	 * connections (bounded by tcp_ecn_setup_percentage) to enable it
	 */
	if ((tp->ecn_flags & (TE_ECN_MODE_ENABLE | TE_ECN_MODE_DISABLE
	    | TE_ENABLE_ECN)) == TE_ENABLE_ECN) {
		/*
		 * Use the random value in iss for randomizing
		 * this selection
		 */
		if ((tp->iss % 100) >= tcp_ecn_setup_percentage)
			tp->ecn_flags &= ~TE_ENABLE_ECN;
	}
}
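
/*
 * To summarize the precedence implemented above: an explicit per-socket
 * TE_ECN_MODE_* setting wins, then the per-interface IFEF_ECN_* flags,
 * then the tcp_ecn_inbound/tcp_ecn_outbound sysctls, and finally the
 * heuristics.  When ECN is enabled by default rather than by an
 * explicit mode, only connections whose (iss % 100) falls below
 * tcp_ecn_setup_percentage negotiate it; e.g. with the default of 50,
 * roughly half of the eligible connections end up using ECN.
 */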

/*
 * Tcp output routine: figure out what should be sent and send it.
 *
 * Returns:	0			Success
 *		EADDRNOTAVAIL
 *		ENOBUFS
 *		EMSGSIZE
 *		EHOSTUNREACH
 *		ENETDOWN
 *	ip_output_list:ENOMEM
 *	ip_output_list:EADDRNOTAVAIL
 *	ip_output_list:ENETUNREACH
 *	ip_output_list:EHOSTUNREACH
 *	ip_output_list:EACCES
 *	ip_output_list:EMSGSIZE
 *	ip_output_list:ENOBUFS
 *	ip_output_list:???		[ignorable: mostly IPSEC/firewall/DLIL]
 *	ip6_output_list:EINVAL
 *	ip6_output_list:EOPNOTSUPP
 *	ip6_output_list:EHOSTUNREACH
 *	ip6_output_list:EADDRNOTAVAIL
 *	ip6_output_list:ENETUNREACH
 *	ip6_output_list:EMSGSIZE
 *	ip6_output_list:ENOBUFS
 *	ip6_output_list:???		[ignorable: mostly IPSEC/firewall/DLIL]
 */
int
tcp_output(struct tcpcb *tp)
{
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
	int32_t len, recwin, sendwin, off;
	int flags, error;
	struct mbuf *m;
	struct ip *ip = NULL;
	struct ipovly *ipov = NULL;
#if INET6
	struct ip6_hdr *ip6 = NULL;
#endif /* INET6 */
	struct tcphdr *th;
	u_char opt[TCP_MAXOLEN];
	unsigned ipoptlen, optlen, hdrlen;
	int idle, sendalot, lost = 0;
	int i, sack_rxmit;
	int tso = 0;
	int sack_bytes_rxmt;
	tcp_seq old_snd_nxt = 0;
	struct sackhole *p;
#if IPSEC
	unsigned ipsec_optlen = 0;
#endif /* IPSEC */
	int idle_time = 0;
	struct mbuf *packetlist = NULL;
	struct mbuf *tp_inp_options = inp->inp_depend4.inp4_options;
#if INET6
	int isipv6 = inp->inp_vflag & INP_IPV6;
#else
	int isipv6 = 0;
#endif
	short packchain_listadd = 0;
	int so_options = so->so_options;
	struct rtentry *rt;
	u_int32_t svc_flags = 0, allocated_len;
	u_int32_t lro_ackmore = (tp->t_lropktlen != 0) ? 1 : 0;
	struct mbuf *mnext = NULL;
	int sackoptlen = 0;
#if MPTCP
	unsigned int *dlenp = NULL;
	u_int8_t *finp = NULL;
	u_int32_t *sseqp = NULL;
	u_int64_t dss_val = 0;
	boolean_t mptcp_acknow = FALSE;
	boolean_t early_data_sent = FALSE;
#endif /* MPTCP */
	boolean_t cell = FALSE;
	boolean_t wifi = FALSE;
	boolean_t wired = FALSE;
	boolean_t sack_rescue_rxt = FALSE;
	int sotc = so->so_traffic_class;

	/*
	 * Determine length of data that should be transmitted,
	 * and flags that will be used.
	 * If there is some data or critical controls (SYN, RST)
	 * to send, then transmit; otherwise, investigate further.
	 */
	idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);

	/* Since idle_time is a signed integer, the following integer
	 * subtraction will take care of wrap around of tcp_now
	 */
	idle_time = tcp_now - tp->t_rcvtime;
	if (idle && idle_time >= TCP_IDLETIMEOUT(tp)) {
		if (CC_ALGO(tp)->after_idle != NULL &&
		    (tp->tcp_cc_index != TCP_CC_ALGO_CUBIC_INDEX ||
		    idle_time >= TCP_CC_CWND_NONVALIDATED_PERIOD)) {
			CC_ALGO(tp)->after_idle(tp);
			tcp_ccdbg_trace(tp, NULL, TCP_CC_IDLE_TIMEOUT);
		}

		/*
		 * Do some other tasks that need to be done after
		 * idle time
		 */
		if (!SLIST_EMPTY(&tp->t_rxt_segments))
			tcp_rxtseg_clean(tp);

		/* If stretch ack was auto-disabled, re-evaluate it */
		tcp_cc_after_idle_stretchack(tp);
	}
	tp->t_flags &= ~TF_LASTIDLE;
	if (idle) {
		if (tp->t_flags & TF_MORETOCOME) {
			tp->t_flags |= TF_LASTIDLE;
			idle = 0;
		}
	}
#if MPTCP
	if (tp->t_mpflags & TMPF_RESET) {
		tcp_check_timer_state(tp);
		/*
		 * Once a RST has been sent for an MPTCP subflow,
		 * the subflow socket stays around until deleted.
		 * No packets such as FINs must be sent after RST.
		 */
		return (0);
	}
#endif /* MPTCP */

again:
	KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);

#if INET6
	if (isipv6) {
		KERNEL_DEBUG(DBG_LAYER_BEG,
		    ((inp->inp_fport << 16) | inp->inp_lport),
		    (((inp->in6p_laddr.s6_addr16[0] & 0xffff) << 16) |
		    (inp->in6p_faddr.s6_addr16[0] & 0xffff)),
		    sendalot, 0, 0);
	} else
#endif

	{
		KERNEL_DEBUG(DBG_LAYER_BEG,
		    ((inp->inp_fport << 16) | inp->inp_lport),
		    (((inp->inp_laddr.s_addr & 0xffff) << 16) |
		    (inp->inp_faddr.s_addr & 0xffff)),
		    sendalot, 0, 0);
	}
	/*
	 * If the route generation id changed, we need to check that our
	 * local (source) IP address is still valid. If it isn't, either
	 * return an error or silently do nothing (assuming the address will
	 * come back before the TCP connection times out).
	 */
	rt = inp->inp_route.ro_rt;
	if (rt != NULL && ROUTE_UNUSABLE(&tp->t_inpcb->inp_route)) {
		struct ifnet *ifp;
		struct in_ifaddr *ia = NULL;
		struct in6_ifaddr *ia6 = NULL;
		int found_srcaddr = 0;

		/* disable multipages at the socket */
		somultipages(so, FALSE);

		/* Disable TSO for the socket until we know more */
		tp->t_flags &= ~TF_TSO;

		soif2kcl(so, FALSE);

		if (isipv6) {
			ia6 = ifa_foraddr6(&inp->in6p_laddr);
			if (ia6 != NULL)
				found_srcaddr = 1;
		} else {
			ia = ifa_foraddr(inp->inp_laddr.s_addr);
			if (ia != NULL)
				found_srcaddr = 1;
		}

		/* check that the source address is still valid */
		if (found_srcaddr == 0) {
			soevent(so,
			    (SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR));

			if (tp->t_state >= TCPS_CLOSE_WAIT) {
				tcp_drop(tp, EADDRNOTAVAIL);
				return(EADDRNOTAVAIL);
			}

			/* Set retransmit timer if it wasn't set,
			 * reset Persist timer and shift register as the
			 * advertised peer window may not be valid anymore
			 */

			if (!tp->t_timer[TCPT_REXMT]) {
				tp->t_timer[TCPT_REXMT] =
				    OFFSET_FROM_START(tp, tp->t_rxtcur);
				if (tp->t_timer[TCPT_PERSIST]) {
					tp->t_timer[TCPT_PERSIST] = 0;
					tp->t_persist_stop = 0;
					TCP_RESET_REXMT_STATE(tp);
				}
			}

			if (tp->t_pktlist_head != NULL)
				m_freem_list(tp->t_pktlist_head);
			TCP_PKTLIST_CLEAR(tp);

			/* drop connection if source address isn't available */
			if (so->so_flags & SOF_NOADDRAVAIL) {
				tcp_drop(tp, EADDRNOTAVAIL);
				return(EADDRNOTAVAIL);
			} else {
				tcp_check_timer_state(tp);
				return(0); /* silently ignore, keep data in socket: address may be back */
			}
		}
		if (ia != NULL)
			IFA_REMREF(&ia->ia_ifa);

		if (ia6 != NULL)
			IFA_REMREF(&ia6->ia_ifa);

		/*
		 * Address is still valid; check for multipages capability
		 * again in case the outgoing interface has changed.
		 */
		RT_LOCK(rt);
		if ((ifp = rt->rt_ifp) != NULL) {
			somultipages(so, (ifp->if_hwassist & IFNET_MULTIPAGES));
			tcp_set_tso(tp, ifp);
			soif2kcl(so, (ifp->if_eflags & IFEF_2KCL));
			tcp_set_ecn(tp, ifp);
		}
		if (rt->rt_flags & RTF_UP)
			RT_GENID_SYNC(rt);
		/*
		 * See if we should do MTU discovery. Don't do it if:
		 *	1) it is disabled via the sysctl
		 *	2) the route isn't up
		 *	3) the MTU is locked (if it is, then discovery
		 *	   has been disabled)
		 */

		if (!path_mtu_discovery || ((rt != NULL) &&
		    (!(rt->rt_flags & RTF_UP) ||
		    (rt->rt_rmx.rmx_locks & RTV_MTU))))
			tp->t_flags &= ~TF_PMTUD;
		else
			tp->t_flags |= TF_PMTUD;

		RT_UNLOCK(rt);
	}

	if (rt != NULL) {
		cell = IFNET_IS_CELLULAR(rt->rt_ifp);
		wifi = (!cell && IFNET_IS_WIFI(rt->rt_ifp));
		wired = (!wifi && IFNET_IS_WIRED(rt->rt_ifp));
	}

	/*
	 * If we've recently taken a timeout, snd_max will be greater than
	 * snd_nxt.  There may be SACK information that allows us to avoid
	 * resending already delivered data.  Adjust snd_nxt accordingly.
	 */
	if (SACK_ENABLED(tp) && SEQ_LT(tp->snd_nxt, tp->snd_max))
		tcp_sack_adjust(tp);
	sendalot = 0;
	off = tp->snd_nxt - tp->snd_una;
	sendwin = min(tp->snd_wnd, tp->snd_cwnd);

	if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0)
		sendwin = min(sendwin, slowlink_wsize);

	flags = tcp_outflags[tp->t_state];
	/*
	 * Send any SACK-generated retransmissions.  If we're explicitly
	 * trying to send out new data (when sendalot is 1), bypass this
	 * function. If we retransmit in fast recovery mode, decrement
	 * snd_cwnd, since we're replacing a (future) new transmission
	 * with a retransmission now, and we previously incremented
	 * snd_cwnd in tcp_input().
	 */
	/*
	 * Still in SACK recovery, reset the rxmit flag to zero.
	 */
	sack_rxmit = 0;
	sack_bytes_rxmt = 0;
	len = 0;
	p = NULL;
	if (SACK_ENABLED(tp) && IN_FASTRECOVERY(tp) &&
	    (p = tcp_sack_output(tp, &sack_bytes_rxmt))) {
		int32_t cwin;

		cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
		if (cwin < 0)
			cwin = 0;
		/* Do not retransmit SACK segments beyond snd_recover */
		if (SEQ_GT(p->end, tp->snd_recover)) {
			/*
			 * (At least) part of sack hole extends beyond
			 * snd_recover. Check to see if we can rexmit data
			 * for this hole.
			 */
			if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
				/*
				 * Can't rexmit any more data for this hole.
				 * That data will be rexmitted in the next
				 * sack recovery episode, when snd_recover
				 * moves past p->rxmit.
				 */
				p = NULL;
				goto after_sack_rexmit;
			} else
				/* Can rexmit part of the current hole */
				len = ((int32_t)min(cwin,
				    tp->snd_recover - p->rxmit));
		} else {
			len = ((int32_t)min(cwin, p->end - p->rxmit));
		}
		if (len > 0) {
			off = p->rxmit - tp->snd_una;
			sack_rxmit = 1;
			sendalot = 1;
			tcpstat.tcps_sack_rexmits++;
			tcpstat.tcps_sack_rexmit_bytes +=
			    min(len, tp->t_maxseg);
		} else {
			len = 0;
		}
	}
after_sack_rexmit:
	/*
	 * Get standard flags, and add SYN or FIN if requested by 'hidden'
	 * state flags.
	 */
	if (tp->t_flags & TF_NEEDFIN)
		flags |= TH_FIN;
	if (tp->t_flags & TF_NEEDSYN)
		flags |= TH_SYN;

	/*
	 * If in persist timeout with window of 0, send 1 byte.
	 * Otherwise, if window is small but nonzero
	 * and timer expired, we will send what we can
	 * and go to transmit state.
	 */
	if (tp->t_flagsext & TF_FORCE) {
		if (sendwin == 0) {
			/*
			 * If we still have some data to send, then
			 * clear the FIN bit.  Usually this would
			 * happen below when it realizes that we
			 * aren't sending all the data.  However,
			 * if we have exactly 1 byte of unsent data,
			 * then it won't clear the FIN bit below,
			 * and if we are in persist state, we wind
			 * up sending the packet without recording
			 * that we sent the FIN bit.
			 *
			 * We can't just blindly clear the FIN bit,
			 * because if we don't have any more data
			 * to send then the probe will be the FIN
			 * itself.
			 */
			if (off < so->so_snd.sb_cc)
				flags &= ~TH_FIN;
			sendwin = 1;
		} else {
			tp->t_timer[TCPT_PERSIST] = 0;
			tp->t_persist_stop = 0;
			TCP_RESET_REXMT_STATE(tp);
		}
	}
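
	/*
	 * Illustration: with the peer advertising a zero window, one
	 * byte of unsent data and the persist timer firing, the code
	 * above forces sendwin to 1 so a single-byte probe goes out;
	 * the FIN bit is held back unless the probe is itself the FIN.
	 */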

	/*
	 * If snd_nxt == snd_max and we have transmitted a FIN, the
	 * offset will be > 0 even if so_snd.sb_cc is 0, resulting in
	 * a negative length.  This can also occur when TCP opens up
	 * its congestion window while receiving additional duplicate
	 * acks after fast-retransmit because TCP will reset snd_nxt
	 * to snd_max after the fast-retransmit.
	 *
	 * In the normal retransmit-FIN-only case, however, snd_nxt will
	 * be set to snd_una, the offset will be 0, and the length may
	 * wind up 0.
	 *
	 * If sack_rxmit is true we are retransmitting from the scoreboard
	 * in which case len is already set.
	 */
	if (sack_rxmit == 0) {
		if (sack_bytes_rxmt == 0) {
			len = min(so->so_snd.sb_cc, sendwin) - off;
		} else {
			int32_t cwin;

			cwin = tp->snd_cwnd -
			    (tp->snd_nxt - tp->sack_newdata) -
			    sack_bytes_rxmt;
			if (cwin < 0)
				cwin = 0;
			/*
			 * We are inside of a SACK recovery episode and are
			 * sending new data, having retransmitted all the
			 * data possible in the scoreboard.
			 */
			len = min(so->so_snd.sb_cc, tp->snd_wnd)
			    - off;
			/*
			 * Don't remove this (len > 0) check !
			 * We explicitly check for len > 0 here (although it
			 * isn't really necessary), to work around a gcc
			 * optimization issue - to force gcc to compute
			 * len above. Without this check, the computation
			 * of len is bungled by the optimizer.
			 */
			if (len > 0) {
				len = imin(len, cwin);
			} else {
				len = 0;
			}
			/*
			 * At this point SACK recovery can not send any
			 * data from scoreboard or any new data. Check
			 * if we can do a rescue retransmit towards the
			 * tail end of recovery window.
			 */
			if (len == 0 && cwin > 0 &&
			    SEQ_LT(tp->snd_fack, tp->snd_recover) &&
			    !(tp->t_flagsext & TF_RESCUE_RXT)) {
				len = min((tp->snd_recover - tp->snd_fack),
				    tp->t_maxseg);
				len = imin(len, cwin);
				old_snd_nxt = tp->snd_nxt;
				sack_rescue_rxt = TRUE;
				tp->snd_nxt = tp->snd_recover - len;
				/*
				 * If FIN has been sent, snd_max
				 * must have been advanced to cover it.
				 */
				if ((tp->t_flags & TF_SENTFIN) &&
				    tp->snd_max == tp->snd_recover)
					tp->snd_nxt--;

				off = tp->snd_nxt - tp->snd_una;
				sendalot = 0;
				tp->t_flagsext |= TF_RESCUE_RXT;
			}
		}
	}

#if MPTCP
	if ((tp->t_mpflags & TMPF_FASTJOIN_SEND) &&
	    (tp->t_state == TCPS_SYN_SENT) &&
	    (!(tp->t_flags & TF_CLOSING)) &&
	    (so->so_snd.sb_cc != 0) &&
	    (tp->t_rxtshift == 0)) {
		flags &= ~TH_SYN;
		flags |= TH_ACK;
		off = 0;
		len = min(so->so_snd.sb_cc, tp->t_maxseg);
		early_data_sent = TRUE;
	} else if (early_data_sent) {
		/* for now, we allow only one data segment to be sent */
		return (0);
	}
#endif /* MPTCP */
	/*
	 * Lop off SYN bit if it has already been sent.  However, if this
	 * is SYN-SENT state and if segment contains data and if we don't
	 * know that foreign host supports TAO, suppress sending segment.
	 */
	if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
		if (tp->t_state != TCPS_SYN_RECEIVED || tfo_enabled(tp))
			flags &= ~TH_SYN;
		off--;
		len++;
		if (len > 0 && tp->t_state == TCPS_SYN_SENT) {
			while (inp->inp_sndinprog_cnt == 0 &&
			    tp->t_pktlist_head != NULL) {
				packetlist = tp->t_pktlist_head;
				packchain_listadd = tp->t_lastchain;
				packchain_sent++;
				TCP_PKTLIST_CLEAR(tp);

				error = tcp_ip_output(so, tp, packetlist,
				    packchain_listadd, tp_inp_options,
				    (so_options & SO_DONTROUTE),
				    (sack_rxmit | (sack_bytes_rxmt != 0)), 0,
				    isipv6);
			}

			/*
			 * tcp was closed while we were in ip,
			 * resume close
			 */
			if (inp->inp_sndinprog_cnt == 0 &&
			    (tp->t_flags & TF_CLOSING)) {
				tp->t_flags &= ~TF_CLOSING;
				(void) tcp_close(tp);
			} else {
				tcp_check_timer_state(tp);
			}
			KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END,
			    0, 0, 0, 0, 0);
			return(0);
		}
	}

	/*
	 * Be careful not to send data and/or FIN on SYN segments.
	 * This measure is needed to prevent interoperability problems
	 * with not fully conformant TCP implementations.
	 *
	 * In case of TFO, we handle the setting of the len in
	 * tcp_tfo_check. In case TFO is not enabled, never ever send
	 * SYN+data.
	 */
	if ((flags & TH_SYN) && !tfo_enabled(tp)) {
		len = 0;
		flags &= ~TH_FIN;
	}

	if ((flags & TH_SYN) && tp->t_state <= TCPS_SYN_SENT && tfo_enabled(tp))
		len = tcp_tfo_check(tp, len);

	/*
	 * The check here used to be (len < 0). Sometimes len is zero
	 * when the congestion window is closed and we need to check
	 * if persist timer has to be set in that case. But don't set
	 * persist until connection is established.
	 */
	if (len <= 0 && !(flags & TH_SYN)) {
		/*
		 * If FIN has been sent but not acked,
		 * but we haven't been called to retransmit,
		 * len will be < 0.  Otherwise, window shrank
		 * after we sent into it.  If window shrank to 0,
		 * cancel pending retransmit, pull snd_nxt back
		 * to (closed) window, and set the persist timer
		 * if it isn't already going.  If the window didn't
		 * close completely, just wait for an ACK.
		 */
		len = 0;
		if (sendwin == 0) {
			tp->t_timer[TCPT_REXMT] = 0;
			tp->t_timer[TCPT_PTO] = 0;
			TCP_RESET_REXMT_STATE(tp);
			tp->snd_nxt = tp->snd_una;
			off = 0;
			if (tp->t_timer[TCPT_PERSIST] == 0)
				tcp_setpersist(tp);
		}
	}

	/*
	 * Automatic sizing of send socket buffer. Increase the send
	 * socket buffer size if all of the following criteria are met
	 *	1. the receiver has enough buffer space for this data
	 *	2. send buffer is filled to 7/8th with data (so we actually
	 *	   have data to make use of it);
	 *	3. our send window (slow start and congestion controlled) is
	 *	   larger than sent but unacknowledged data in send buffer.
	 */
	if (tcp_do_autosendbuf == 1 &&
	    !INP_WAIT_FOR_IF_FEEDBACK(inp) && !IN_FASTRECOVERY(tp) &&
	    (so->so_snd.sb_flags & (SB_AUTOSIZE | SB_TRIM)) == SB_AUTOSIZE &&
	    tcp_cansbgrow(&so->so_snd)) {
		if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
		    so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) &&
		    sendwin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) {
			if (sbreserve(&so->so_snd,
			    min(so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
			    tcp_autosndbuf_max)) == 1) {
				so->so_snd.sb_idealsize = so->so_snd.sb_hiwat;
			}
		}
	}
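
	/*
	 * Worked example (numbers for illustration only): with sb_hiwat
	 * at 64KB, growth requires the peer's window to be at least
	 * ~4/5 of the buffer (snd_wnd / 4 * 5 >= sb_hiwat), at least
	 * 56KB (7/8) of the buffer to be occupied, and the send window
	 * to cover all of the unsent data.  Each time the test passes,
	 * the buffer grows by tcp_autosndbuf_inc (8KB by default) up to
	 * tcp_autosndbuf_max (512KB by default).
	 */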

	/*
	 * Truncate to the maximum segment length or enable TCP Segmentation
	 * Offloading (if supported by hardware) and ensure that FIN is removed
	 * if the length no longer contains the last data byte.
	 *
	 * TSO may only be used if we are in a pure bulk sending state.
	 * The presence of TCP-MD5, SACK retransmits, SACK advertisements,
	 * ipfw rules and IP options, as well as disabling hardware checksum
	 * offload, prevents using TSO. With TSO the TCP header is the same
	 * (except for the sequence number) for all generated packets. This
	 * makes it impossible to transmit any options which vary per generated
	 * segment or packet.
	 *
	 * The length of TSO bursts is limited to TCP_MAXWIN. That limit and
	 * removal of FIN (if not already caught here) are handled later after
	 * the exact length of the TCP options are known.
	 */
#if IPSEC
	/*
	 * Pre-calculate here as we save another lookup into the darknesses
	 * of IPsec that way and can actually decide if TSO is ok.
	 */
	if (ipsec_bypass == 0)
		ipsec_optlen = ipsec_hdrsiz_tcp(tp);
#endif
	if (len > tp->t_maxseg) {
		if ((tp->t_flags & TF_TSO) && tcp_do_tso && hwcksum_tx &&
		    ip_use_randomid && kipf_count == 0 &&
		    dlil_filter_disable_tso_count == 0 &&
		    tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
		    sack_bytes_rxmt == 0 &&
		    inp->inp_options == NULL &&
		    inp->in6p_options == NULL
#if IPSEC
		    && ipsec_optlen == 0
#endif
#if IPFIREWALL
		    && (fw_enable == 0 || fw_bypass)
#endif
		    ) {
			tso = 1;
			sendalot = 0;
		} else {
			len = tp->t_maxseg;
			sendalot = 1;
			tso = 0;
		}
	}

	/* Send one segment or less as a tail loss probe */
	if (tp->t_flagsext & TF_SENT_TLPROBE) {
		len = min(len, tp->t_maxseg);
		sendalot = 0;
		tso = 0;
	}

#if MPTCP
	if ((so->so_flags & SOF_MP_SUBFLOW) &&
	    !(tp->t_mpflags & TMPF_TCP_FALLBACK)) {
		int newlen = len;
		if ((tp->t_state >= TCPS_ESTABLISHED) &&
		    ((tp->t_mpflags & TMPF_SND_MPPRIO) ||
		    (tp->t_mpflags & TMPF_SND_REM_ADDR) ||
		    (tp->t_mpflags & TMPF_SND_MPFAIL) ||
		    (tp->t_mpflags & TMPF_MPCAP_RETRANSMIT))) {
			if (len > 0) {
				len = 0;
			}
			sendalot = 1;
			mptcp_acknow = TRUE;
		} else {
			mptcp_acknow = FALSE;
		}
		/*
		 * The contiguous bytes in the subflow socket buffer can be
		 * discontiguous at the MPTCP level. Since only one DSS
		 * option can be sent in one packet, reduce length to match
		 * the contiguous MPTCP level. Set sendalot to send remainder.
		 */
		if (len > 0)
			newlen = mptcp_adj_sendlen(so, off, len);
		if (newlen < len) {
			len = newlen;
			sendalot = 1;
		}
	}
#endif /* MPTCP */

	/*
	 * If the socket is capable of doing unordered send,
	 * pull the amount of data that can be sent from the
	 * unordered priority queues to the serial queue in
	 * the socket buffer. If bytes are not yet available
	 * in the highest priority message, we may not be able
	 * to send any new data.
	 */
	if (so->so_flags & SOF_ENABLE_MSGS) {
		if ((off + len) >
		    so->so_msg_state->msg_serial_bytes) {
			sbpull_unordered_data(so, off, len);

			/* check if len needs to be modified */
			if ((off + len) >
			    so->so_msg_state->msg_serial_bytes) {
				len = so->so_msg_state->msg_serial_bytes - off;
				if (len <= 0) {
					len = 0;
					tcpstat.tcps_msg_sndwaithipri++;
				}
			}
		}
	}

	if (sack_rxmit) {
		if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc))
			flags &= ~TH_FIN;
	} else {
		if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
			flags &= ~TH_FIN;
	}
	/*
	 * Compare available window to amount of window
	 * known to peer (as advertised window less
	 * next expected input).  If the difference is at least two
	 * max size segments, or at least 25% of the maximum possible
	 * window, then we want to send a window update to the peer.
	 * Skip this if the connection is in T/TCP half-open state.
	 */
	recwin = tcp_sbspace(tp);
#if MPTCP
	if (so->so_flags & SOF_MP_SUBFLOW) {
		struct mptcb *mp_tp = tptomptp(tp);

		if (mp_tp != NULL) {
			MPT_LOCK(mp_tp);
			recwin = imin(recwin, (int)mp_tp->mpt_rcvwnd);
			MPT_UNLOCK(mp_tp);
		}
	}
#endif

	if (recwin < (int32_t)(so->so_rcv.sb_hiwat / 4) &&
	    recwin < (int)tp->t_maxseg)
		recwin = 0;
	if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0) {
		if (recwin > (int32_t)slowlink_wsize)
			recwin = slowlink_wsize;
	}

#if TRAFFIC_MGT
	if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(so)) {
		if (tcp_recv_throttle(tp)) {
			uint32_t min_iaj_win =
			    tcp_min_iaj_win * tp->t_maxseg;
			if (tp->iaj_rwintop == 0 ||
			    SEQ_LT(tp->iaj_rwintop, tp->rcv_adv))
				tp->iaj_rwintop = tp->rcv_adv;
			if (SEQ_LT(tp->iaj_rwintop,
			    tp->rcv_nxt + min_iaj_win))
				tp->iaj_rwintop = tp->rcv_nxt + min_iaj_win;
			recwin = min(tp->iaj_rwintop - tp->rcv_nxt, recwin);
		}
	}
#endif /* TRAFFIC_MGT */

	if (recwin > (int32_t)(TCP_MAXWIN << tp->rcv_scale))
		recwin = (int32_t)(TCP_MAXWIN << tp->rcv_scale);
	if (recwin < (int32_t)(tp->rcv_adv - tp->rcv_nxt))
		recwin = (int32_t)(tp->rcv_adv - tp->rcv_nxt);

	/*
	 * Sender silly window avoidance.  We transmit under the following
	 * conditions when len is non-zero:
	 *
	 *	- we've timed out (e.g. persist timer)
	 *	- we need to retransmit
	 *	- We have a full segment (or more with TSO)
	 *	- This is the last buffer in a write()/send() and we are
	 *	  either idle or running NODELAY
	 *	- we have more than 1/2 the maximum send window's worth of
	 *	  data (the receiver may be limiting the window size)
	 */
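	/*
	 * Example (illustrative): with t_maxseg = 1448, a 512-byte
	 * chunk that completes the data in the send buffer is sent
	 * immediately when the connection is idle or TF_NODELAY is
	 * set; the same 512 bytes with more data queued behind them
	 * wait to be coalesced into a full segment, unless a
	 * retransmit, persist force or the window threshold applies.
	 */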
	if (len) {
		if (tp->t_flagsext & TF_FORCE)
			goto send;
		if (SEQ_LT(tp->snd_nxt, tp->snd_max))
			goto send;
		if (sack_rxmit)
			goto send;

		/*
		 * Send new data on the connection only if it is
		 * not flow controlled
		 */
		if (!INP_WAIT_FOR_IF_FEEDBACK(inp) ||
		    tp->t_state != TCPS_ESTABLISHED) {
			if (len >= tp->t_maxseg)
				goto send;
			if (!(tp->t_flags & TF_MORETOCOME) &&
			    (idle || tp->t_flags & TF_NODELAY ||
			    (tp->t_flags & TF_MAXSEGSNT) ||
			    ALLOW_LIMITED_TRANSMIT(tp)) &&
			    (tp->t_flags & TF_NOPUSH) == 0 &&
			    len + off >= so->so_snd.sb_cc)
				goto send;
			if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
				goto send;
		} else {
			tcpstat.tcps_fcholdpacket++;
		}
	}

	if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN)) {
		/*
		 * "adv" is the amount we can increase the window,
		 * taking into account that we are limited by
		 * TCP_MAXWIN << tp->rcv_scale.
		 */
		int32_t adv, oldwin = 0;
		adv = imin(recwin, (int)TCP_MAXWIN << tp->rcv_scale) -
		    (tp->rcv_adv - tp->rcv_nxt);

		if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt))
			oldwin = tp->rcv_adv - tp->rcv_nxt;

		if (adv >= (int32_t) (2 * tp->t_maxseg)) {
			/*
			 * Update only if the resulting scaled value of
			 * the window changed, or if there is a change in
			 * the sequence since the last ack. This avoids
			 * what appears as dupe ACKS (see rdar://5640997)
			 *
			 * If streaming is detected avoid sending too many
			 * window updates. We will depend on the delack
			 * timer to send a window update when needed.
			 */
			if (!(tp->t_flags & TF_STRETCHACK) &&
			    (tp->last_ack_sent != tp->rcv_nxt ||
			    ((oldwin + adv) >> tp->rcv_scale) >
			    (oldwin >> tp->rcv_scale))) {
				goto send;
			}

		}
		if (4 * adv >= (int32_t) so->so_rcv.sb_hiwat)
			goto send;

		/*
		 * Make sure that the delayed ack timer is set if
		 * we delayed sending a window update because of
		 * streaming detection.
		 */
		if ((tp->t_flags & TF_STRETCHACK) &&
		    !(tp->t_flags & TF_DELACK)) {
			tp->t_flags |= TF_DELACK;
			tp->t_timer[TCPT_DELACK] =
			    OFFSET_FROM_START(tp, tcp_delack);
		}
	}
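
	/*
	 * Worked example (illustrative): with t_maxseg = 1448 and
	 * sb_hiwat = 128KB, the tests above fire a window update once
	 * the window can open by adv >= 2 * 1448 = 2896 bytes (subject
	 * to the scaled-value/sequence checks), or once
	 * 4 * adv >= 131072, i.e. the window can open by a quarter of
	 * the receive buffer.
	 */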

	/*
	 * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
	 * is also a catch-all for the retransmit timer timeout case.
	 */
	if (tp->t_flags & TF_ACKNOW)
		goto send;
	if ((flags & TH_RST) ||
	    ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))
		goto send;
	if (SEQ_GT(tp->snd_up, tp->snd_una))
		goto send;
#if MPTCP
	if (mptcp_acknow)
		goto send;
#endif /* MPTCP */
	/*
	 * If our state indicates that FIN should be sent
	 * and we have not yet done so, then we need to send.
	 */
	if ((flags & TH_FIN) &&
	    (!(tp->t_flags & TF_SENTFIN) || tp->snd_nxt == tp->snd_una))
		goto send;
	/*
	 * In SACK, it is possible for tcp_output to fail to send a segment
	 * after the retransmission timer has been turned off.  Make sure
	 * that the retransmission timer is set.
	 */
	if (SACK_ENABLED(tp) && (tp->t_state >= TCPS_ESTABLISHED) &&
	    SEQ_GT(tp->snd_max, tp->snd_una) &&
	    tp->t_timer[TCPT_REXMT] == 0 &&
	    tp->t_timer[TCPT_PERSIST] == 0) {
		tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp,
		    tp->t_rxtcur);
		goto just_return;
	}
	/*
	 * TCP window updates are not reliable, rather a polling protocol
	 * using ``persist'' packets is used to ensure receipt of window
	 * updates.  The three ``states'' for the output side are:
	 *	idle			not doing retransmits or persists
	 *	persisting		to move a small or zero window
	 *	(re)transmitting	and thereby not persisting
	 *
	 * tp->t_timer[TCPT_PERSIST]
	 *	is set when we are in persist state.
	 * tp->t_force
	 *	is set when we are called to send a persist packet.
	 * tp->t_timer[TCPT_REXMT]
	 *	is set when we are retransmitting
	 * The output side is idle when both timers are zero.
	 *
	 * If send window is too small, there is data to transmit, and no
	 * retransmit or persist is pending, then go to persist state.
	 * If nothing happens soon, send when timer expires:
	 * if window is nonzero, transmit what we can,
	 * otherwise force out a byte.
	 */
	if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 &&
	    tp->t_timer[TCPT_PERSIST] == 0) {
		TCP_RESET_REXMT_STATE(tp);
		tcp_setpersist(tp);
	}
just_return:
	/*
	 * If there is no reason to send a segment, just return;
	 * but if there are some packets left in the packet list, send them now.
	 */
	while (inp->inp_sndinprog_cnt == 0 &&
	    tp->t_pktlist_head != NULL) {
		packetlist = tp->t_pktlist_head;
		packchain_listadd = tp->t_lastchain;
		packchain_sent++;
		TCP_PKTLIST_CLEAR(tp);

		error = tcp_ip_output(so, tp, packetlist,
		    packchain_listadd,
		    tp_inp_options, (so_options & SO_DONTROUTE),
		    (sack_rxmit | (sack_bytes_rxmt != 0)), recwin,
		    isipv6);
	}
	/* tcp was closed while we were in ip; resume close */
	if (inp->inp_sndinprog_cnt == 0 &&
	    (tp->t_flags & TF_CLOSING)) {
		tp->t_flags &= ~TF_CLOSING;
		(void) tcp_close(tp);
	} else {
		tcp_check_timer_state(tp);
	}
	KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
	return (0);

send:
	/*
	 * Set TF_MAXSEGSNT flag if the segment size is greater than
	 * the max segment size.
	 */
	if (len > 0) {
		if (len >= tp->t_maxseg)
			tp->t_flags |= TF_MAXSEGSNT;
		else
			tp->t_flags &= ~TF_MAXSEGSNT;
	}
	/*
	 * Before ESTABLISHED, force sending of initial options
	 * unless TCP set not to do any options.
	 * NOTE: we assume that the IP/TCP header plus TCP options
	 * always fit in a single mbuf, leaving room for a maximum
	 * link header, i.e.
	 *	max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
	 */
	optlen = 0;
#if INET6
	if (isipv6)
		hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
	else
#endif
		hdrlen = sizeof (struct tcpiphdr);
	if (flags & TH_SYN) {
		tp->snd_nxt = tp->iss;
		if ((tp->t_flags & TF_NOOPT) == 0) {
			u_short mss;

			opt[0] = TCPOPT_MAXSEG;
			opt[1] = TCPOLEN_MAXSEG;
			mss = htons((u_short) tcp_mssopt(tp));
			(void)memcpy(opt + 2, &mss, sizeof(mss));
			optlen = TCPOLEN_MAXSEG;

			if ((tp->t_flags & TF_REQ_SCALE) &&
			    ((flags & TH_ACK) == 0 ||
			    (tp->t_flags & TF_RCVD_SCALE))) {
				*((u_int32_t *)(void *)(opt + optlen)) = htonl(
				    TCPOPT_NOP << 24 |
				    TCPOPT_WINDOW << 16 |
				    TCPOLEN_WINDOW << 8 |
				    tp->request_r_scale);
				optlen += 4;
			}
#if MPTCP
			if (mptcp_enable && (so->so_flags & SOF_MP_SUBFLOW)) {
				optlen = mptcp_setup_syn_opts(so, flags, opt,
				    optlen);
			}
#endif /* MPTCP */
		}
	}

	/*
	 * Send a timestamp and echo-reply if this is a SYN and our side
	 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
	 * and our peer have sent timestamps in our SYN's.
	 */
	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
	    (flags & TH_RST) == 0 &&
	    ((flags & TH_ACK) == 0 ||
	    (tp->t_flags & TF_RCVD_TSTMP))) {
		u_int32_t *lp = (u_int32_t *)(void *)(opt + optlen);

		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(tcp_now);
		*lp = htonl(tp->ts_recent);
		optlen += TCPOLEN_TSTAMP_APPA;
	}

	/* Note the timestamp for receive buffer autosizing */
	if (tp->rfbuf_ts == 0 && (so->so_rcv.sb_flags & SB_AUTOSIZE))
		tp->rfbuf_ts = tcp_now;

	if (SACK_ENABLED(tp) && ((tp->t_flags & TF_NOOPT) == 0)) {
		/*
		 * Tack on the SACK permitted option *last*.
		 * And do padding of options after tacking this on.
		 * This is because if MSS, TS, WinScale and Signatures are
		 * all present, we have just 2 bytes left for the SACK
		 * permitted option, which is just enough.
		 */
		/*
		 * If this is the first SYN of connection (not a SYN
		 * ACK), include SACK permitted option.  If this is a
		 * SYN ACK, include SACK permitted option if peer has
		 * already done so. This is only for active connect,
		 * since the syncache takes care of the passive connect.
		 */
		if ((flags & TH_SYN) &&
		    (!(flags & TH_ACK) || (tp->t_flags & TF_SACK_PERMIT))) {
			u_char *bp;
			bp = (u_char *)opt + optlen;

			*bp++ = TCPOPT_SACK_PERMITTED;
			*bp++ = TCPOLEN_SACK_PERMITTED;
			optlen += TCPOLEN_SACK_PERMITTED;
		}
	}
#if MPTCP
	if (so->so_flags & SOF_MP_SUBFLOW) {
		/*
		 * It's important to piggyback ACKs with data, as ACK-only
		 * packets may get lost and data packets that don't send
		 * Data ACKs still advance the subflow level ACK and
		 * therefore make it hard for the remote end to recover in
		 * low cwnd situations.
		 */
		if (len != 0) {
			tp->t_mpflags |= (TMPF_SEND_DSN |
			    TMPF_MPTCP_ACKNOW);
		} else {
			tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
		}
		optlen = mptcp_setup_opts(tp, off, &opt[0], optlen, flags,
		    len, &dlenp, &finp, &dss_val, &sseqp, &mptcp_acknow);
		tp->t_mpflags &= ~TMPF_SEND_DSN;
	}
#endif /* MPTCP */

	if (tfo_enabled(tp) && !(tp->t_flags & TF_NOOPT) &&
	    (flags & (TH_SYN | TH_ACK)) == TH_SYN)
		optlen += tcp_tfo_write_cookie(tp, optlen, &len, opt);

	if (tfo_enabled(tp) &&
	    (flags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK) &&
	    (tp->t_tfo_flags & TFO_F_OFFER_COOKIE))
		optlen += tcp_tfo_write_cookie_rep(tp, optlen, opt);

	if (SACK_ENABLED(tp) && ((tp->t_flags & TF_NOOPT) == 0)) {
		/*
		 * Send SACKs if necessary.  This should be the last
		 * option processed.  Only as many SACKs are sent as
		 * are permitted by the maximum options size.
		 *
		 * In general, SACK blocks consume 8*n+2 bytes.
		 * So a full size SACK blocks option is 34 bytes
		 * (to generate 4 SACK blocks).  At a minimum,
		 * we need 10 bytes (to generate 1 SACK block).
		 * If TCP Timestamps (12 bytes) and TCP Signatures
		 * (18 bytes) are both present, we'll just have
		 * 10 bytes for SACK options 40 - (12 + 18).
		 */
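		/*
		 * Worked example (illustrative): each SACK block is
		 * TCPOLEN_SACK = 8 bytes behind the shared 2-byte
		 * kind/length header.  With timestamps (12 bytes)
		 * already in place, 40 - 12 = 28 bytes remain, so
		 * nsack = (28 - 2) / 8 = 3 blocks at most, giving a
		 * 2 + 3*8 = 26 byte SACK option before padding.
		 */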
		if (TCPS_HAVEESTABLISHED(tp->t_state) &&
		    (tp->t_flags & TF_SACK_PERMIT) &&
		    (tp->rcv_numsacks > 0 || TCP_SEND_DSACK_OPT(tp)) &&
		    MAX_TCPOPTLEN - optlen - 2 >= TCPOLEN_SACK) {
			int nsack, padlen;
			u_char *bp = (u_char *)opt + optlen;
			u_int32_t *lp;

			nsack = (MAX_TCPOPTLEN - optlen - 2) / TCPOLEN_SACK;
			nsack = min(nsack, (tp->rcv_numsacks +
			    (TCP_SEND_DSACK_OPT(tp) ? 1 : 0)));
			sackoptlen = (2 + nsack * TCPOLEN_SACK);

			/*
			 * First we need to pad options so that the
			 * SACK blocks can start at a 4-byte boundary
			 * (sack option and length are at a 2 byte offset).
			 */
			padlen = (MAX_TCPOPTLEN - optlen - sackoptlen) % 4;
			optlen += padlen;
			while (padlen-- > 0)
				*bp++ = TCPOPT_NOP;

			tcpstat.tcps_sack_send_blocks++;
			*bp++ = TCPOPT_SACK;
			*bp++ = sackoptlen;
			lp = (u_int32_t *)(void *)bp;

			/*
			 * First block of SACK option should represent
			 * DSACK. Prefer to send SACK information if there
			 * is space for only one SACK block. This will
			 * allow for faster recovery.
			 */
			if (TCP_SEND_DSACK_OPT(tp) && nsack > 0 &&
			    (tp->rcv_numsacks == 0 || nsack > 1)) {
				*lp++ = htonl(tp->t_dsack_lseq);
				*lp++ = htonl(tp->t_dsack_rseq);
				tcpstat.tcps_dsack_sent++;
				tp->t_dsack_sent++;
				nsack--;
			}
			VERIFY(nsack == 0 || tp->rcv_numsacks >= nsack);
			for (i = 0; i < nsack; i++) {
				struct sackblk sack = tp->sackblks[i];
				*lp++ = htonl(sack.start);
				*lp++ = htonl(sack.end);
			}
			optlen += sackoptlen;
		}
	}

	/* Pad TCP options to a 4 byte boundary */
	if (optlen < MAX_TCPOPTLEN && (optlen % sizeof(u_int32_t))) {
		int pad = sizeof(u_int32_t) - (optlen % sizeof(u_int32_t));
		u_char *bp = (u_char *)opt + optlen;

		optlen += pad;
		while (pad) {
			*bp++ = TCPOPT_EOL;
			pad--;
		}
	}
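
	/*
	 * Example (illustrative): an optlen of 26 leaves 26 % 4 = 2
	 * trailing bytes, so pad = 2 and two TCPOPT_EOL bytes are
	 * appended, bringing the option block to a 4-byte-aligned 28
	 * bytes.
	 */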

	/*
	 * RFC 3168 states that:
	 * - If you ever sent an ECN-setup SYN/SYN-ACK you must be prepared
	 *   to handle the TCP ECE flag, even if you also later send a
	 *   non-ECN-setup SYN/SYN-ACK.
	 * - If you ever send a non-ECN-setup SYN/SYN-ACK, you must not set
	 *   the IP ECT flag.
	 *
	 * It is not clear how the ECE flag would ever be set if you never
	 * set the IP ECT flag on outbound packets. All the same, we use
	 * TE_SETUPSENT to indicate that we have committed to handling
	 * the TCP ECE flag correctly. We use TE_SENDIPECT to indicate
	 * whether or not we should set the IP ECT flag on outbound packets.
	 *
	 * For a SYN-ACK, send an ECN setup SYN-ACK
	 */
	if ((flags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK) &&
	    (tp->ecn_flags & TE_ENABLE_ECN)) {
		if (tp->ecn_flags & TE_SETUPRECEIVED) {
			if (tcp_send_ecn_flags_on_syn(tp, so)) {
				/*
				 * Setting TH_ECE makes this an ECN-setup
				 * SYN-ACK
				 */
				flags |= TH_ECE;

				/*
				 * Record that we sent the ECN-setup and
				 * default to setting IP ECT.
				 */
				tp->ecn_flags |= (TE_SETUPSENT|TE_SENDIPECT);
				tcpstat.tcps_ecn_server_setup++;
				tcpstat.tcps_ecn_server_success++;
			} else {
				/*
				 * We sent an ECN-setup SYN-ACK but it was
				 * dropped. Fall back to a non-ECN-setup
				 * SYN-ACK and clear the flag to indicate
				 * that we should not send data with IP ECT
				 * set.
				 *
				 * Pretend we didn't receive an
				 * ECN-setup SYN.
				 *
				 * We already incremented the counter
				 * assuming that the ECN setup will
				 * succeed. Decrement
				 * tcps_ecn_server_success here to
				 * correct it.
				 */
				if (tp->ecn_flags & TE_SETUPSENT) {
					tcpstat.tcps_ecn_lost_synack++;
					tcpstat.tcps_ecn_server_success--;
					tp->ecn_flags |= TE_LOST_SYNACK;
				}

				tp->ecn_flags &=
				    ~(TE_SETUPRECEIVED | TE_SENDIPECT |
				    TE_SENDCWR);
			}
		}
	} else if ((flags & (TH_SYN | TH_ACK)) == TH_SYN &&
	    (tp->ecn_flags & TE_ENABLE_ECN)) {
		if (tcp_send_ecn_flags_on_syn(tp, so)) {
			/*
			 * Setting TH_ECE and TH_CWR makes this an
			 * ECN-setup SYN
			 */
			flags |= (TH_ECE | TH_CWR);
			tcpstat.tcps_ecn_client_setup++;
			tp->ecn_flags |= TE_CLIENT_SETUP;

			/*
			 * Record that we sent the ECN-setup and default to
			 * setting IP ECT.
			 */
			tp->ecn_flags |= (TE_SETUPSENT | TE_SENDIPECT);
		} else {
			/*
			 * We sent an ECN-setup SYN but it was dropped.
			 * Fall back to non-ECN and clear the flag indicating
			 * we should send data with IP ECT set.
			 */
			if (tp->ecn_flags & TE_SETUPSENT) {
				tcpstat.tcps_ecn_lost_syn++;
				tp->ecn_flags |= TE_LOST_SYN;
			}
			tp->ecn_flags &= ~TE_SENDIPECT;
		}
	}
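
	/*
	 * For reference (RFC 3168, as implemented above): an ECN-setup
	 * SYN carries TH_ECE|TH_CWR, an ECN-setup SYN-ACK answers with
	 * TH_ECE only, and a peer that ignores both simply leaves the
	 * flags clear, in which case the connection proceeds without
	 * ECN.
	 */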
1745
1746 /*
1747 * Check if we should set the TCP CWR flag.
1748 * CWR flag is sent when we reduced the congestion window because
1749 * we received a TCP ECE or we performed a fast retransmit. We
1750 * never set the CWR flag on retransmitted packets. We only set
1751 * the CWR flag on data packets. Pure acks don't have this set.
1752 */
1753 if ((tp->ecn_flags & TE_SENDCWR) != 0 && len != 0 &&
1754 !SEQ_LT(tp->snd_nxt, tp->snd_max) && !sack_rxmit) {
1755 flags |= TH_CWR;
1756 tp->ecn_flags &= ~TE_SENDCWR;
1757 }
1758
1759 /*
1760 * Check if we should set the TCP ECE flag.
1761 */
1762 if ((tp->ecn_flags & TE_SENDECE) != 0 && len == 0) {
1763 flags |= TH_ECE;
1764 tcpstat.tcps_ecn_sent_ece++;
1765 }
1766
1767
1768 hdrlen += optlen;
1769
1770 /* Reset DSACK sequence numbers */
1771 tp->t_dsack_lseq = 0;
1772 tp->t_dsack_rseq = 0;
1773
1774 #if INET6
1775 if (isipv6)
1776 ipoptlen = ip6_optlen(inp);
1777 else
1778 #endif
1779 {
1780 if (tp_inp_options) {
1781 ipoptlen = tp_inp_options->m_len -
1782 offsetof(struct ipoption, ipopt_list);
1783 } else {
1784 ipoptlen = 0;
1785 }
1786 }
1787 #if IPSEC
1788 ipoptlen += ipsec_optlen;
1789 #endif
1790
1791 /*
1792 * Adjust data length if insertion of options will
1793 * bump the packet length beyond the t_maxopd length.
1794 * Clear the FIN bit because we cut off the tail of
1795 * the segment.
1796 *
1797 * When doing TSO, limit a burst to TCP_MAXWIN minus the
1798 * IP, TCP and options length to keep ip->ip_len from
1799 * overflowing. Prevent the last segment from being
1800 * fractional, so that all segments are equal sized, and set
1801 * the flag to continue sending. TSO is disabled when
1802 * IP options or IPsec are present.
1803 */
1804 if (len + optlen + ipoptlen > tp->t_maxopd) {
1805 /*
1806 * If there is still more to send,
1807 * don't close the connection.
1808 */
1809 flags &= ~TH_FIN;
1810 if (tso) {
1811 int32_t tso_maxlen;
1812
1813 tso_maxlen = tp->tso_max_segment_size ?
1814 tp->tso_max_segment_size : TCP_MAXWIN;
1815
1816 if (len > tso_maxlen - hdrlen - optlen) {
1817 len = tso_maxlen - hdrlen - optlen;
1818 len = len - (len % (tp->t_maxopd - optlen));
1819 sendalot = 1;
1820 } else if (tp->t_flags & TF_NEEDFIN) {
1821 sendalot = 1;
1822 }
1823 } else {
1824 len = tp->t_maxopd - optlen - ipoptlen;
1825 sendalot = 1;
1826 }
1827 }
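	/*
	 * Illustrative arithmetic for the TSO clamp above, with
	 * made-up but typical values: tso_maxlen = TCP_MAXWIN (65535),
	 * hdrlen + optlen = 64, optlen = 12 (timestamps) and
	 * t_maxopd = 1460:
	 *
	 *	cap = 65535 - 64 = 65471
	 *	len = 65471 - (65471 % (1460 - 12)) = 65160
	 *
	 * i.e. exactly 45 segments of 1448 payload bytes each, so the
	 * burst splits into equal sized segments with none fractional.
	 */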
1828 #if MPTCP
1829 /* Adjust the length in the DSS option, if it is greater than len */
1830 if (dlenp) {
1831 /*
1832 * To test this path without SACK, artificially
1833 * decrement len with something like
1834 * if (len > 10)
1835 len -= 10;
1836 */
1837 if (ntohs(*dlenp) > len) {
1838 *dlenp = htons(len);
1839 /* Unset the FIN flag, if len was adjusted */
1840 if (finp) {
1841 *finp &= ~MDSS_F;
1842 }
1843 sendalot = 1;
1844 }
1845 }
1846 #endif /* MPTCP */
1847
1848 if (max_linkhdr + hdrlen > MCLBYTES)
1849 panic("tcphdr too big");
1850
1851 /* Check if there is enough data in the send socket
1852 * buffer to start measuring bandwidth
1853 */
1854 if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 &&
1855 (tp->t_bwmeas != NULL) &&
1856 (tp->t_flagsext & TF_BWMEAS_INPROGRESS) == 0 &&
1857 (so->so_snd.sb_cc - (tp->snd_max - tp->snd_una)) >=
1858 tp->t_bwmeas->bw_minsize) {
1859 tp->t_bwmeas->bw_size = min(
1860 (so->so_snd.sb_cc - (tp->snd_max - tp->snd_una)),
1861 tp->t_bwmeas->bw_maxsize);
1862 tp->t_flagsext |= TF_BWMEAS_INPROGRESS;
1863 tp->t_bwmeas->bw_start = tp->snd_max;
1864 tp->t_bwmeas->bw_ts = tcp_now;
1865 }
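	/*
	 * For instance (illustrative values): with bw_minsize at 8KB
	 * and 32KB of not-yet-sent data queued beyond snd_max, a
	 * measurement of min(32KB, bw_maxsize) bytes starts at snd_max
	 * and is timestamped with tcp_now, so the sustained send rate
	 * can be computed once those bytes are acknowledged.
	 */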
1866
1867 VERIFY(inp->inp_flowhash != 0);
1868 /*
1869 * Grab a header mbuf, attaching a copy of data to
1870 * be transmitted, and initialize the header from
1871 * the template for sends on this connection.
1872 */
1873 if (len) {
1874 tp->t_pmtud_lastseg_size = len + optlen + ipoptlen;
1875 if ((tp->t_flagsext & TF_FORCE) && len == 1)
1876 tcpstat.tcps_sndprobe++;
1877 else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
1878 tcpstat.tcps_sndrexmitpack++;
1879 tcpstat.tcps_sndrexmitbyte += len;
1880 if (nstat_collect) {
1881 nstat_route_tx(inp->inp_route.ro_rt, 1,
1882 len, NSTAT_TX_FLAG_RETRANSMIT);
1883 INP_ADD_STAT(inp, cell, wifi, wired,
1884 txpackets, 1);
1885 INP_ADD_STAT(inp, cell, wifi, wired,
1886 txbytes, len);
1887 tp->t_stat.txretransmitbytes += len;
1888 tp->t_stat.rxmitpkts++;
1889 }
1890 } else {
1891 tcpstat.tcps_sndpack++;
1892 tcpstat.tcps_sndbyte += len;
1893
1894 if (nstat_collect) {
1895 INP_ADD_STAT(inp, cell, wifi, wired,
1896 txpackets, 1);
1897 INP_ADD_STAT(inp, cell, wifi, wired,
1898 txbytes, len);
1899 }
1900 inp_decr_sndbytes_unsent(so, len);
1901 }
1902 #if MPTCP
1903 if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
1904 tcpstat.tcps_mp_sndpacks++;
1905 tcpstat.tcps_mp_sndbytes += len;
1906 }
1907 #endif /* MPTCP */
1908 /*
1909 * Try to use the new interface that allocates all
1910 * the necessary mbuf hdrs under one mbuf lock and
1911 * avoids rescanning the socket mbuf list if
1912 * certain conditions are met. This routine can't
1913 * be used in the following cases:
1914 * 1) the protocol headers exceed the capacity
1915 * of a single mbuf header's data area (no cluster attached)
1916 * 2) the length of the data being transmitted plus
1917 * the protocol headers fits into a single mbuf header's
1918 * data area (no cluster attached)
1919 */
1920 m = NULL;
1921
1922 /* minimum length we are going to allocate */
1923 allocated_len = MHLEN;
1924 if (MHLEN < hdrlen + max_linkhdr) {
1925 MGETHDR(m, M_DONTWAIT, MT_HEADER);
1926 if (m == NULL) {
1927 error = ENOBUFS;
1928 goto out;
1929 }
1930 MCLGET(m, M_DONTWAIT);
1931 if ((m->m_flags & M_EXT) == 0) {
1932 m_freem(m);
1933 error = ENOBUFS;
1934 goto out;
1935 }
1936 m->m_data += max_linkhdr;
1937 m->m_len = hdrlen;
1938 allocated_len = MCLBYTES;
1939 }
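		/*
		 * Sizing note (typical values, not guaranteed): an mbuf
		 * header's internal data area (MHLEN) is small relative
		 * to a cluster (MCLBYTES, usually 2KB), so protocol
		 * headers plus link-layer headroom that would overflow
		 * a plain mbuf header get a cluster attached up front.
		 */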
1940 if (len <= allocated_len - hdrlen - max_linkhdr) {
1941 if (m == NULL) {
1942 VERIFY(allocated_len <= MHLEN);
1943 MGETHDR(m, M_DONTWAIT, MT_HEADER);
1944 if (m == NULL) {
1945 error = ENOBUFS;
1946 goto out;
1947 }
1948 m->m_data += max_linkhdr;
1949 m->m_len = hdrlen;
1950 }
1951 /* Make sure we still have data left to be sent at this point */
1952 if (so->so_snd.sb_mb == NULL || off < 0) {
1953 if (m != NULL) m_freem(m);
1954 error = 0; /* should we return an error? */
1955 goto out;
1956 }
1957 m_copydata(so->so_snd.sb_mb, off, (int) len,
1958 mtod(m, caddr_t) + hdrlen);
1959 m->m_len += len;
1960 } else {
1961 uint32_t copymode;
1962 /*
1963 * Retain packet header metadata at the socket
1964 * buffer if this is an MPTCP subflow,
1965 * otherwise move it.
1966 */
1967 copymode = M_COPYM_MOVE_HDR;
1968 #if MPTCP
1969 if (so->so_flags & SOF_MP_SUBFLOW) {
1970 copymode = M_COPYM_NOOP_HDR;
1971 }
1972 #endif /* MPTCP */
1973 if (m != NULL) {
1974 m->m_next = m_copym_mode(so->so_snd.sb_mb,
1975 off, (int)len, M_DONTWAIT, copymode);
1976 if (m->m_next == NULL) {
1977 (void) m_free(m);
1978 error = ENOBUFS;
1979 goto out;
1980 }
1981 } else {
1982 /*
1983 * make sure we still have data left
1984 * to be sent at this point
1985 */
1986 if (so->so_snd.sb_mb == NULL) {
1987 error = 0; /* should we return an error? */
1988 goto out;
1989 }
1990
1991 /*
1992 * m_copym_with_hdrs will always return the
1993 * last mbuf pointer and the offset into it that
1994 * it acted on to fulfill the current request,
1995 * whether a valid 'hint' was passed in or not.
1996 */
1997 if ((m = m_copym_with_hdrs(so->so_snd.sb_mb,
1998 off, len, M_DONTWAIT, NULL, NULL,
1999 copymode)) == NULL) {
2000 error = ENOBUFS;
2001 goto out;
2002 }
2003 m->m_data += max_linkhdr;
2004 m->m_len = hdrlen;
2005 }
2006 }
2007 /*
2008 * If we're sending everything we've got, set PUSH.
2009 * (This will keep happy those implementations which only
2010 * give data to the user when a buffer fills or
2011 * a PUSH comes in.)
2012 *
2013 * On SYN-segments we should not add the PUSH-flag.
2014 */
2015 if (off + len == so->so_snd.sb_cc && !(flags & TH_SYN))
2016 flags |= TH_PUSH;
2017 } else {
2018 if (tp->t_flags & TF_ACKNOW)
2019 tcpstat.tcps_sndacks++;
2020 else if (flags & (TH_SYN|TH_FIN|TH_RST))
2021 tcpstat.tcps_sndctrl++;
2022 else if (SEQ_GT(tp->snd_up, tp->snd_una))
2023 tcpstat.tcps_sndurg++;
2024 else
2025 tcpstat.tcps_sndwinup++;
2026
2027 MGETHDR(m, M_DONTWAIT, MT_HEADER); /* MAC-OK */
2028 if (m == NULL) {
2029 error = ENOBUFS;
2030 goto out;
2031 }
2032 if (MHLEN < (hdrlen + max_linkhdr)) {
2033 MCLGET(m, M_DONTWAIT);
2034 if ((m->m_flags & M_EXT) == 0) {
2035 m_freem(m);
2036 error = ENOBUFS;
2037 goto out;
2038 }
2039 }
2040 m->m_data += max_linkhdr;
2041 m->m_len = hdrlen;
2042 }
2043 m->m_pkthdr.rcvif = 0;
2044 #if MPTCP
2045 /* Before opt is copied to the mbuf, set the csum field */
2046 mptcp_output_csum(tp, m, len, hdrlen, dss_val, sseqp);
2047 #endif /* MPTCP */
2048 #if CONFIG_MACF_NET
2049 mac_mbuf_label_associate_inpcb(inp, m);
2050 #endif
2051 #if INET6
2052 if (isipv6) {
2053 ip6 = mtod(m, struct ip6_hdr *);
2054 th = (struct tcphdr *)(void *)(ip6 + 1);
2055 tcp_fillheaders(tp, ip6, th);
2056 if ((tp->ecn_flags & TE_SENDIPECT) != 0 && len &&
2057 !SEQ_LT(tp->snd_nxt, tp->snd_max) && !sack_rxmit) {
2058 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
2059 }
2060 svc_flags |= PKT_SCF_IPV6;
2061 #if PF_ECN
2062 m_pftag(m)->pftag_hdr = (void *)ip6;
2063 m_pftag(m)->pftag_flags |= PF_TAG_HDR_INET6;
2064 #endif /* PF_ECN */
2065 } else
2066 #endif /* INET6 */
2067 {
2068 ip = mtod(m, struct ip *);
2069 ipov = (struct ipovly *)ip;
2070 th = (struct tcphdr *)(void *)(ip + 1);
2071 /* this picks up the pseudo header (w/o the length) */
2072 tcp_fillheaders(tp, ip, th);
2073 if ((tp->ecn_flags & TE_SENDIPECT) != 0 && len &&
2074 !SEQ_LT(tp->snd_nxt, tp->snd_max) &&
2075 !sack_rxmit && !(flags & TH_SYN)) {
2076 ip->ip_tos |= IPTOS_ECN_ECT0;
2077 }
2078 #if PF_ECN
2079 m_pftag(m)->pftag_hdr = (void *)ip;
2080 m_pftag(m)->pftag_flags |= PF_TAG_HDR_INET;
2081 #endif /* PF_ECN */
2082 }
2083
2084 /*
2085 * Fill in fields, remembering maximum advertised
2086 * window for use in delaying messages about window sizes.
2087 * If resending a FIN, be sure not to use a new sequence number.
2088 */
2089 if ((flags & TH_FIN) && (tp->t_flags & TF_SENTFIN) &&
2090 tp->snd_nxt == tp->snd_max)
2091 tp->snd_nxt--;
2092 /*
2093 * If we are doing retransmissions, then snd_nxt will
2094 * not reflect the first unsent octet. For ACK only
2095 * packets, we do not want the sequence number of the
2096 * retransmitted packet, we want the sequence number
2097 * of the next unsent octet. So, if there is no data
2098 * (and no SYN or FIN), use snd_max instead of snd_nxt
2099 * when filling in ti_seq. But if we are in persist
2100 * state, snd_max might reflect one byte beyond the
2101 * right edge of the window, so use snd_nxt in that
2102 * case, since we know we aren't doing a retransmission.
2103 * (retransmit and persist are mutually exclusive...)
2104 *
2105 * Note the state of this retransmit segment to detect spurious
2106 * retransmissions.
2107 */
2108 if (sack_rxmit == 0) {
2109 if (len || (flags & (TH_SYN|TH_FIN)) ||
2110 tp->t_timer[TCPT_PERSIST]) {
2111 th->th_seq = htonl(tp->snd_nxt);
2112 if (len > 0) {
2113 m->m_pkthdr.tx_start_seq = tp->snd_nxt;
2114 m->m_pkthdr.pkt_flags |= PKTF_START_SEQ;
2115 }
2116 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
2117 if (SACK_ENABLED(tp) && len > 1) {
2118 tcp_rxtseg_insert(tp, tp->snd_nxt,
2119 (tp->snd_nxt + len - 1));
2120 }
2121 if (len > 0)
2122 m->m_pkthdr.pkt_flags |=
2123 PKTF_TCP_REXMT;
2124 }
2125 } else {
2126 th->th_seq = htonl(tp->snd_max);
2127 }
2128 } else {
2129 th->th_seq = htonl(p->rxmit);
2130 if (len > 0) {
2131 m->m_pkthdr.pkt_flags |=
2132 (PKTF_TCP_REXMT | PKTF_START_SEQ);
2133 m->m_pkthdr.tx_start_seq = p->rxmit;
2134 }
2135 tcp_rxtseg_insert(tp, p->rxmit, (p->rxmit + len - 1));
2136 p->rxmit += len;
2137 tp->sackhint.sack_bytes_rexmit += len;
2138 }
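	/*
	 * Concrete example of the sequence selection above (made-up
	 * numbers): with snd_una = 1000, snd_nxt rewound to 1000 for a
	 * retransmission, and snd_max = 3000, a pure ACK advertises
	 * seq 3000 (the next unsent octet) rather than the rewound
	 * snd_nxt, while a SACK-driven retransmit uses the hole's
	 * p->rxmit value instead.
	 */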
2139 th->th_ack = htonl(tp->rcv_nxt);
2140 tp->last_ack_sent = tp->rcv_nxt;
2141 #if MPTCP
2142 /* Initialize the ACK field to a valid value, as segments with a 0 ACK field are dropped */
2143 if (early_data_sent) {
2144 th->th_ack = th->th_seq + 1;
2145 }
2146 #endif /* MPTCP */
2147 if (optlen) {
2148 bcopy(opt, th + 1, optlen);
2149 th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
2150 }
2151 th->th_flags = flags;
2152 th->th_win = htons((u_short) (recwin>>tp->rcv_scale));
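	/*
	 * Units sanity check (illustrative): th_off counts 32-bit
	 * words, so a 20-byte base header plus 12 bytes of options
	 * yields th_off = 8.  th_win carries the receive window
	 * right-shifted by rcv_scale; e.g. recwin = 262144 with
	 * rcv_scale = 3 goes on the wire as 32768, and the peer
	 * shifts it back left by 3.
	 */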
2153
2154 /*
2155 * Adjust the RXWIN0SENT flag to indicate whether we have advertised
2156 * a 0 window. This may cause the remote transmitter to stall. This
2157 * flag tells soreceive() to disable delayed acknowledgements when
2158 * draining the buffer. This can occur if the receiver is attempting
2159 * to read more data than can be buffered prior to transmitting on
2160 * the connection.
2161 */
2162 if (th->th_win == 0)
2163 tp->t_flags |= TF_RXWIN0SENT;
2164 else
2165 tp->t_flags &= ~TF_RXWIN0SENT;
2166 if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
2167 th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
2168 th->th_flags |= TH_URG;
2169 } else {
2170 /*
2171 * If no urgent pointer to send, then we pull
2172 * the urgent pointer to the left edge of the send window
2173 * so that it doesn't drift into the send window on sequence
2174 * number wraparound.
2175 */
2176 tp->snd_up = tp->snd_una; /* drag it along */
2177 }
2178
2179 /*
2180 * Put TCP length in extended header, and then
2181 * checksum extended header and data.
2182 */
2183 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
2184
2185 /*
2186 * If this is potentially the last packet on the stream, then mark
2187 * it in order to enable some optimizations in the underlying
2188 * layers
2189 */
2190 if (tp->t_state != TCPS_ESTABLISHED &&
2191 (tp->t_state == TCPS_CLOSING || tp->t_state == TCPS_TIME_WAIT
2192 || tp->t_state == TCPS_LAST_ACK || (th->th_flags & TH_RST)))
2193 m->m_pkthdr.pkt_flags |= PKTF_LAST_PKT;
2194
2195 #if INET6
2196 if (isipv6) {
2197 /*
2198 * ip6_plen need not be filled in now; it will be filled
2199 * in by ip6_output.
2200 */
2201 m->m_pkthdr.csum_flags = CSUM_TCPIPV6;
2202 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
2203 if (len + optlen)
2204 th->th_sum = in_addword(th->th_sum,
2205 htons((u_short)(optlen + len)));
2206 }
2207 else
2208 #endif /* INET6 */
2209 {
2210 m->m_pkthdr.csum_flags = CSUM_TCP;
2211 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
2212 if (len + optlen)
2213 th->th_sum = in_addword(th->th_sum,
2214 htons((u_short)(optlen + len)));
2215 }
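	/*
	 * In both branches the template header is seeded with the
	 * pseudo-header sum, so only the variable part of the length
	 * (options plus payload) is folded in here via in_addword();
	 * the CSUM_TCP/CSUM_TCPIPV6 flags leave the final
	 * one's-complement sum over the actual bytes to the driver or
	 * the delayed-checksum path.
	 */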
2216
2217 /*
2218 * Enable TSO and specify the size of the segments.
2219 * The TCP pseudo header checksum is always provided.
2220 */
2221 if (tso) {
2222 #if INET6
2223 if (isipv6)
2224 m->m_pkthdr.csum_flags |= CSUM_TSO_IPV6;
2225 else
2226 #endif /* INET6 */
2227 m->m_pkthdr.csum_flags |= CSUM_TSO_IPV4;
2228
2229 m->m_pkthdr.tso_segsz = tp->t_maxopd - optlen;
2230 } else {
2231 m->m_pkthdr.tso_segsz = 0;
2232 }
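	/*
	 * tso_segsz is the per-segment payload the hardware should
	 * cut: t_maxopd minus the TCP options, e.g. 1460 - 12 = 1448
	 * with timestamps (illustrative MSS), so the NIC replicates
	 * the prototype headers and slices the payload on that
	 * boundary.
	 */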
2233
2234 /*
2235 * In transmit state, time the transmission and arrange for
2236 * the retransmit. In persist state, just set snd_max.
2237 */
2238 if (!(tp->t_flagsext & TF_FORCE)
2239 || tp->t_timer[TCPT_PERSIST] == 0) {
2240 tcp_seq startseq = tp->snd_nxt;
2241
2242 /*
2243 * Advance snd_nxt over sequence space of this segment.
2244 */
2245 if (flags & (TH_SYN|TH_FIN)) {
2246 if (flags & TH_SYN)
2247 tp->snd_nxt++;
2248 if ((flags & TH_FIN) &&
2249 !(tp->t_flags & TF_SENTFIN)) {
2250 tp->snd_nxt++;
2251 tp->t_flags |= TF_SENTFIN;
2252 }
2253 }
2254 if (sack_rxmit)
2255 goto timer;
2256 if (sack_rescue_rxt == TRUE) {
2257 tp->snd_nxt = old_snd_nxt;
2258 sack_rescue_rxt = FALSE;
2259 tcpstat.tcps_pto_in_recovery++;
2260 } else {
2261 tp->snd_nxt += len;
2262 }
2263 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
2264 tp->snd_max = tp->snd_nxt;
2265 /*
2266 * Time this transmission if not a retransmission and
2267 * not currently timing anything.
2268 */
2269 if (tp->t_rtttime == 0) {
2270 tp->t_rtttime = tcp_now;
2271 tp->t_rtseq = startseq;
2272 tcpstat.tcps_segstimed++;
2273
2274 /* update variables related to pipe ack */
2275 tp->t_pipeack_lastuna = tp->snd_una;
2276 }
2277 }
2278
2279 /*
2280 * Set retransmit timer if not currently set,
2281 * and not doing an ack or a keep-alive probe.
2282 */
2283 timer:
2284 if (tp->t_timer[TCPT_REXMT] == 0 &&
2285 ((sack_rxmit && tp->snd_nxt != tp->snd_max) ||
2286 tp->snd_nxt != tp->snd_una || (flags & TH_FIN))) {
2287 if (tp->t_timer[TCPT_PERSIST]) {
2288 tp->t_timer[TCPT_PERSIST] = 0;
2289 tp->t_persist_stop = 0;
2290 TCP_RESET_REXMT_STATE(tp);
2291 }
2292 tp->t_timer[TCPT_REXMT] =
2293 OFFSET_FROM_START(tp, tp->t_rxtcur);
2294 }
2295
2296 /*
2297 * Set tail loss probe timeout if new data is being
2298 * transmitted. This is supported only when the
2299 * SACK option is enabled on a connection.
2300 *
2301 * Every time new data is sent, the PTO is reset.
2302 */
2303 if (tcp_enable_tlp && tp->t_state == TCPS_ESTABLISHED &&
2304 SACK_ENABLED(tp) && !IN_FASTRECOVERY(tp)
2305 && tp->snd_nxt == tp->snd_max
2306 && SEQ_GT(tp->snd_nxt, tp->snd_una)
2307 && tp->t_rxtshift == 0
2308 && (tp->t_flagsext & (TF_SENT_TLPROBE|TF_PKTS_REORDERED)) == 0) {
2309 u_int32_t pto, srtt, new_rto = 0;
2310
2311 /*
2312 * Using SRTT alone to set PTO can cause spurious
2313 * retransmissions on wireless networks where there
2314 * is a lot of variance in RTT. Taking variance
2315 * into account will avoid this.
2316 */
2317 srtt = tp->t_srtt >> TCP_RTT_SHIFT;
2318 pto = ((TCP_REXMTVAL(tp)) * 3) >> 1;
2319 pto = max(2 * srtt, pto);
2320 if ((tp->snd_max - tp->snd_una) == tp->t_maxseg)
2321 pto = max(pto,
2322 (((3 * pto) >> 2) + tcp_delack * 2));
2323 else
2324 pto = max(10, pto);
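			/*
			 * Worked example in timer ticks (illustrative):
			 * if TCP_REXMTVAL(tp) = 200 and srtt = 100, then
			 * pto = (200 * 3) >> 1 = 300 and max(2 * 100, 300)
			 * keeps 300.  With exactly one MSS outstanding and
			 * tcp_delack = 100, pto becomes
			 * max(300, (3 * 300 >> 2) + 200) = 425, leaving
			 * room for a delayed ACK before probing.
			 */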
2325
2326 /* if RTO is less than PTO, choose RTO instead */
2327 if (tp->t_rxtcur < pto) {
2328 /*
2329 * Schedule PTO instead of RTO in favor of
2330 * fast recovery.
2331 */
2332 pto = tp->t_rxtcur;
2333
2334 /* Reset the next RTO to be after PTO. */
2335 TCPT_RANGESET(new_rto,
2336 (pto + TCP_REXMTVAL(tp)),
2337 max(tp->t_rttmin, tp->t_rttcur + 2),
2338 TCPTV_REXMTMAX, 0);
2339 tp->t_timer[TCPT_REXMT] =
2340 OFFSET_FROM_START(tp, new_rto);
2341 }
2342 tp->t_timer[TCPT_PTO] = OFFSET_FROM_START(tp, pto);
2343 }
2344 } else {
2345 /*
2346 * Persist case, update snd_max but since we are in
2347 * persist mode (no window) we do not update snd_nxt.
2348 */
2349 int xlen = len;
2350 if (flags & TH_SYN)
2351 ++xlen;
2352 if ((flags & TH_FIN) &&
2353 !(tp->t_flags & TF_SENTFIN)) {
2354 ++xlen;
2355 tp->t_flags |= TF_SENTFIN;
2356 }
2357 if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max))
2358 tp->snd_max = tp->snd_nxt + len;
2359 }
2360
2361 #if TCPDEBUG
2362 /*
2363 * Trace.
2364 */
2365 if (so_options & SO_DEBUG)
2366 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
2367 #endif
2368
2369 /*
2370 * Fill in IP length and desired time to live and
2371 * send to IP level. There should be a better way
2372 * to handle ttl and tos; we could keep them in
2373 * the template, but need a way to checksum without them.
2374 */
2375 #if INET6
2376 /*
2377 * m->m_pkthdr.len should have been set before checksum calculation,
2378 * because in6_cksum() needs it.
2379 */
2380 if (isipv6) {
2381 /*
2382 * we separately set hoplimit for every segment, since the
2383 * user might want to change the value via setsockopt.
2384 * Also, desired default hop limit might be changed via
2385 * Neighbor Discovery.
2386 */
2387 ip6->ip6_hlim = in6_selecthlim(inp, inp->in6p_route.ro_rt ?
2388 inp->in6p_route.ro_rt->rt_ifp : NULL);
2389
2390 /* TODO: IPv6 IP6TOS_ECT bit on */
2391 KERNEL_DEBUG(DBG_LAYER_BEG,
2392 ((inp->inp_fport << 16) | inp->inp_lport),
2393 (((inp->in6p_laddr.s6_addr16[0] & 0xffff) << 16) |
2394 (inp->in6p_faddr.s6_addr16[0] & 0xffff)),
2395 sendalot,0,0);
2396 } else
2397 #endif /* INET6 */
2398 {
2399 ip->ip_len = m->m_pkthdr.len;
2400 ip->ip_ttl = inp->inp_ip_ttl; /* XXX */
2401 ip->ip_tos |= (inp->inp_ip_tos & ~IPTOS_ECN_MASK);/* XXX */
2402 KERNEL_DEBUG(DBG_LAYER_BEG,
2403 ((inp->inp_fport << 16) | inp->inp_lport),
2404 (((inp->inp_laddr.s_addr & 0xffff) << 16) |
2405 (inp->inp_faddr.s_addr & 0xffff)), 0,0,0);
2406 }
2407
2408 /*
2409 * See if we should do MTU discovery.
2410 * The flag is updated based on the following criteria:
2411 * 1) Path MTU discovery is authorized by the sysctl
2412 * 2) The route isn't set yet (unlikely but could happen)
2413 * 3) The route is up
2414 * 4) The MTU is not locked (if it is, then discovery has been
2415 * disabled for that route)
2416 */
2417 #if INET6
2418 if (!isipv6)
2419 #endif /* INET6 */
2420 if (path_mtu_discovery && (tp->t_flags & TF_PMTUD))
2421 ip->ip_off |= IP_DF;
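	/*
	 * With IP_DF set, a router in front of a smaller-MTU link will
	 * refuse to fragment and return ICMP "fragmentation needed",
	 * which surfaces here as EMSGSIZE from the output path; the
	 * EMSGSIZE handling below then relies on tcp_mtudisc() to
	 * shrink the MSS and retransmit.
	 */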
2422
2423 #if NECP
2424 {
2425 necp_kernel_policy_id policy_id;
2426 u_int32_t route_rule_id;
2427 if (!necp_socket_is_allowed_to_send_recv(inp, &policy_id, &route_rule_id)) {
2428 m_freem(m);
2429 error = EHOSTUNREACH;
2430 goto out;
2431 }
2432 necp_mark_packet_from_socket(m, inp, policy_id, route_rule_id);
2433
2434 if (net_qos_policy_restricted != 0) {
2435 necp_socket_update_qos_marking(inp, inp->inp_route.ro_rt,
2436 NULL, route_rule_id);
2437 }
2438 }
2439 #endif /* NECP */
2440
2441 #if IPSEC
2442 if (inp->inp_sp != NULL)
2443 ipsec_setsocket(m, so);
2444 #endif /*IPSEC*/
2445
2446 /*
2447 * The socket is kept locked while sending out packets in ip_output, even if packet chaining is not active.
2448 */
2449 lost = 0;
2450
2451 /*
2452 * Embed the flow hash in pkt hdr and mark the packet as
2453 * capable of flow controlling
2454 */
2455 m->m_pkthdr.pkt_flowsrc = FLOWSRC_INPCB;
2456 m->m_pkthdr.pkt_flowid = inp->inp_flowhash;
2457 m->m_pkthdr.pkt_flags |= PKTF_FLOW_ID | PKTF_FLOW_LOCALSRC;
2458 #if MPTCP
2459 /* Disable flow advisory when using MPTCP. */
2460 if (!(tp->t_mpflags & TMPF_MPTCP_TRUE))
2461 #endif /* MPTCP */
2462 m->m_pkthdr.pkt_flags |= PKTF_FLOW_ADV;
2463 m->m_pkthdr.pkt_proto = IPPROTO_TCP;
2464
2465 m->m_nextpkt = NULL;
2466
2467 if (inp->inp_last_outifp != NULL &&
2468 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2469 /* Hint to prioritize this packet if
2470 * 1. the packet has no data
2471 * 2. the interface supports the transmit-start model and did
2472 * not disable ACK prioritization
2473 * 3. only the ACK flag is set
2474 * 4. there is no outstanding data on this connection
2475 */
2476 if (tcp_prioritize_acks != 0 && len == 0 &&
2477 (inp->inp_last_outifp->if_eflags &
2478 (IFEF_TXSTART | IFEF_NOACKPRI)) == IFEF_TXSTART) {
2479 if (th->th_flags == TH_ACK &&
2480 tp->snd_una == tp->snd_max &&
2481 tp->t_timer[TCPT_REXMT] == 0)
2482 svc_flags |= PKT_SCF_TCP_ACK;
2483 if (th->th_flags & TH_SYN)
2484 svc_flags |= PKT_SCF_TCP_SYN;
2485 }
2486 set_packet_service_class(m, so, sotc, svc_flags);
2487 } else {
2488 /*
2489 * Optimization for loopback just set the mbuf
2490 * service class
2491 */
2492 (void) m_set_service_class(m, so_tc2msc(sotc));
2493 }
2494
2495 tp->t_pktlist_sentlen += len;
2496 tp->t_lastchain++;
2497
2498 #if INET6
2499 if (isipv6) {
2500 DTRACE_TCP5(send, struct mbuf *, m, struct inpcb *, inp,
2501 struct ip6 *, ip6, struct tcpcb *, tp, struct tcphdr *,
2502 th);
2503 } else
2504 #endif /* INET6 */
2505 {
2506 DTRACE_TCP5(send, struct mbuf *, m, struct inpcb *, inp,
2507 struct ip *, ip, struct tcpcb *, tp, struct tcphdr *, th);
2508 }
2509
2510 if (tp->t_pktlist_head != NULL) {
2511 tp->t_pktlist_tail->m_nextpkt = m;
2512 tp->t_pktlist_tail = m;
2513 } else {
2514 packchain_newlist++;
2515 tp->t_pktlist_head = tp->t_pktlist_tail = m;
2516 }
2517
2518 if ((lro_ackmore) && (!sackoptlen) && (!tp->t_timer[TCPT_PERSIST]) &&
2519 ((th->th_flags & TH_ACK) == TH_ACK) && (!len) &&
2520 (tp->t_state == TCPS_ESTABLISHED)) {
2521 /* For a pure ACK, see if you need to send more of them */
2522 mnext = tcp_send_lroacks(tp, m, th);
2523 if (mnext) {
2524 tp->t_pktlist_tail->m_nextpkt = mnext;
2525 if (mnext->m_nextpkt == NULL) {
2526 tp->t_pktlist_tail = mnext;
2527 tp->t_lastchain++;
2528 } else {
2529 struct mbuf *tail, *next;
2530 next = mnext->m_nextpkt;
2531 tail = next->m_nextpkt;
2532 while (tail) {
2533 next = tail;
2534 tail = tail->m_nextpkt;
2535 tp->t_lastchain++;
2536 }
2537 tp->t_pktlist_tail = next;
2538 }
2539 }
2540 }
2541
2542 if (sendalot == 0 || (tp->t_state != TCPS_ESTABLISHED) ||
2543 (tp->snd_cwnd <= (tp->snd_wnd / 8)) ||
2544 (tp->t_flags & (TH_PUSH | TF_ACKNOW)) ||
2545 (tp->t_flagsext & TF_FORCE) ||
2546 tp->t_lastchain >= tcp_packet_chaining) {
2547 error = 0;
2548 while (inp->inp_sndinprog_cnt == 0 &&
2549 tp->t_pktlist_head != NULL) {
2550 packetlist = tp->t_pktlist_head;
2551 packchain_listadd = tp->t_lastchain;
2552 packchain_sent++;
2553 lost = tp->t_pktlist_sentlen;
2554 TCP_PKTLIST_CLEAR(tp);
2555
2556 error = tcp_ip_output(so, tp, packetlist,
2557 packchain_listadd, tp_inp_options,
2558 (so_options & SO_DONTROUTE),
2559 (sack_rxmit | (sack_bytes_rxmt != 0)), recwin,
2560 isipv6);
2561 if (error) {
2562 /*
2563 * Add the rest of the unsent
2564 * packets in the packet list for this tcp
2565 * to "lost", since we're about to free
2566 * the whole list below.
2567 */
2568 lost += tp->t_pktlist_sentlen;
2569 break;
2570 } else {
2571 lost = 0;
2572 }
2573 }
2574 /* tcp was closed while we were in ip; resume close */
2575 if (inp->inp_sndinprog_cnt == 0 &&
2576 (tp->t_flags & TF_CLOSING)) {
2577 tp->t_flags &= ~TF_CLOSING;
2578 (void) tcp_close(tp);
2579 return (0);
2580 }
2581 } else {
2582 error = 0;
2583 packchain_looped++;
2584 tcpstat.tcps_sndtotal++;
2585
2586 goto again;
2587 }
2588 if (error) {
2589 /*
2590 * Assume that the packets were lost, so back out the
2591 * sequence number advance, if any. Note that the "lost"
2592 * variable represents the amount of user data sent during
2593 * the recent call to ip_output_list() plus the amount of
2594 * user data in the packet list for this tcp at the moment.
2595 */
2596 if (!(tp->t_flagsext & TF_FORCE)
2597 || tp->t_timer[TCPT_PERSIST] == 0) {
2598 /*
2599 * No need to check for TH_FIN here because
2600 * the TF_SENTFIN flag handles that case.
2601 */
2602 if ((flags & TH_SYN) == 0) {
2603 if (sack_rxmit) {
2604 if (SEQ_GT((p->rxmit - lost),
2605 tp->snd_una)) {
2606 p->rxmit -= lost;
2607 } else {
2608 lost = p->rxmit - tp->snd_una;
2609 p->rxmit = tp->snd_una;
2610 }
2611 tp->sackhint.sack_bytes_rexmit -= lost;
2612 } else {
2613 if (SEQ_GT((tp->snd_nxt - lost),
2614 tp->snd_una))
2615 tp->snd_nxt -= lost;
2616 else
2617 tp->snd_nxt = tp->snd_una;
2618 }
2619 }
2620 }
2621 out:
2622 if (tp->t_pktlist_head != NULL)
2623 m_freem_list(tp->t_pktlist_head);
2624 TCP_PKTLIST_CLEAR(tp);
2625
2626 if (error == ENOBUFS) {
2627 if (!tp->t_timer[TCPT_REXMT] &&
2628 !tp->t_timer[TCPT_PERSIST])
2629 tp->t_timer[TCPT_REXMT] =
2630 OFFSET_FROM_START(tp, tp->t_rxtcur);
2631 tp->snd_cwnd = tp->t_maxseg;
2632 tp->t_bytes_acked = 0;
2633 tcp_check_timer_state(tp);
2634 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
2635
2636 tcp_ccdbg_trace(tp, NULL, TCP_CC_OUTPUT_ERROR);
2637 return (0);
2638 }
2639 if (error == EMSGSIZE) {
2640 /*
2641 * ip_output() will have already fixed the route
2642 * for us. tcp_mtudisc() will, as its last action,
2643 * initiate retransmission, so it is important to
2644 * not do so here.
2645 *
2646 * If TSO was active, we either got an interface
2647 * without TSO capabilities or TSO was turned off.
2648 * Disable it for this connection too, and
2649 * immediately retry with MSS-sized segments generated
2650 * by this function.
2651 */
2652 if (tso)
2653 tp->t_flags &= ~TF_TSO;
2654
2655 tcp_mtudisc(inp, 0);
2656 tcp_check_timer_state(tp);
2657
2658 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
2659 return 0;
2660 }
2661 /*
2662 * Unless this is due to interface restriction policy,
2663 * treat EHOSTUNREACH/ENETDOWN as a soft error.
2664 */
2665 if ((error == EHOSTUNREACH || error == ENETDOWN) &&
2666 TCPS_HAVERCVDSYN(tp->t_state) &&
2667 !inp_restricted_send(inp, inp->inp_last_outifp)) {
2668 tp->t_softerror = error;
2669 error = 0;
2670 }
2671 tcp_check_timer_state(tp);
2672 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
2673 return (error);
2674 }
2675
2676 tcpstat.tcps_sndtotal++;
2677
2678 KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END,0,0,0,0,0);
2679 if (sendalot)
2680 goto again;
2681
2682 tcp_check_timer_state(tp);
2683 return (0);
2684 }
2685
2686 static int
2687 tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt,
2688 int cnt, struct mbuf *opt, int flags, int sack_in_progress, int recwin,
2689 boolean_t isipv6)
2690 {
2691 int error = 0;
2692 boolean_t chain;
2693 boolean_t unlocked = FALSE;
2694 boolean_t ifdenied = FALSE;
2695 struct inpcb *inp = tp->t_inpcb;
2696 struct ip_out_args ipoa =
2697 { IFSCOPE_NONE, { 0 }, IPOAF_SELECT_SRCIF|IPOAF_BOUND_SRCADDR, 0,
2698 SO_TC_UNSPEC, _NET_SERVICE_TYPE_UNSPEC };
2699 struct route ro;
2700 struct ifnet *outif = NULL;
2701 #if INET6
2702 struct ip6_out_args ip6oa =
2703 { IFSCOPE_NONE, { 0 }, IP6OAF_SELECT_SRCIF|IP6OAF_BOUND_SRCADDR, 0,
2704 SO_TC_UNSPEC, _NET_SERVICE_TYPE_UNSPEC };
2705 struct route_in6 ro6;
2706 struct flowadv *adv =
2707 (isipv6 ? &ip6oa.ip6oa_flowadv : &ipoa.ipoa_flowadv);
2708 #else /* INET6 */
2709 struct flowadv *adv = &ipoa.ipoa_flowadv;
2710 #endif /* !INET6 */
2711
2712 /* If socket was bound to an ifindex, tell ip_output about it */
2713 if (inp->inp_flags & INP_BOUND_IF) {
2714 #if INET6
2715 if (isipv6) {
2716 ip6oa.ip6oa_boundif = inp->inp_boundifp->if_index;
2717 ip6oa.ip6oa_flags |= IP6OAF_BOUND_IF;
2718 } else
2719 #endif /* INET6 */
2720 {
2721 ipoa.ipoa_boundif = inp->inp_boundifp->if_index;
2722 ipoa.ipoa_flags |= IPOAF_BOUND_IF;
2723 }
2724 }
2725
2726 if (INP_NO_CELLULAR(inp)) {
2727 #if INET6
2728 if (isipv6)
2729 ip6oa.ip6oa_flags |= IP6OAF_NO_CELLULAR;
2730 else
2731 #endif /* INET6 */
2732 ipoa.ipoa_flags |= IPOAF_NO_CELLULAR;
2733 }
2734 if (INP_NO_EXPENSIVE(inp)) {
2735 #if INET6
2736 if (isipv6)
2737 ip6oa.ip6oa_flags |= IP6OAF_NO_EXPENSIVE;
2738 else
2739 #endif /* INET6 */
2740 ipoa.ipoa_flags |= IPOAF_NO_EXPENSIVE;
2741
2742 }
2743 if (INP_AWDL_UNRESTRICTED(inp)) {
2744 #if INET6
2745 if (isipv6)
2746 ip6oa.ip6oa_flags |= IP6OAF_AWDL_UNRESTRICTED;
2747 else
2748 #endif /* INET6 */
2749 ipoa.ipoa_flags |= IPOAF_AWDL_UNRESTRICTED;
2750
2751 }
2752 #if INET6
2753 if (INP_INTCOPROC_ALLOWED(inp) && isipv6) {
2754 ip6oa.ip6oa_flags |= IP6OAF_INTCOPROC_ALLOWED;
2755 }
2756 if (isipv6) {
2757 ip6oa.ip6oa_sotc = so->so_traffic_class;
2758 ip6oa.ip6oa_netsvctype = so->so_netsvctype;
2759 } else
2760 #endif /* INET6 */
2761 {
2762 ipoa.ipoa_sotc = so->so_traffic_class;
2763 ipoa.ipoa_netsvctype = so->so_netsvctype;
2764 }
2765 if ((so->so_flags1 & SOF1_QOSMARKING_ALLOWED)) {
2766 #if INET6
2767 if (isipv6)
2768 ip6oa.ip6oa_flags |= IP6OAF_QOSMARKING_ALLOWED;
2769 else
2770 #endif /* INET6 */
2771 ipoa.ipoa_flags |= IPOAF_QOSMARKING_ALLOWED;
2772 }
2773 #if INET6
2774 if (isipv6)
2775 flags |= IPV6_OUTARGS;
2776 else
2777 #endif /* INET6 */
2778 flags |= IP_OUTARGS;
2779
2780 /* Copy the cached route and take an extra reference */
2781 #if INET6
2782 if (isipv6)
2783 in6p_route_copyout(inp, &ro6);
2784 else
2785 #endif /* INET6 */
2786 inp_route_copyout(inp, &ro);
2787
2788 /*
2789 * Data sent (as far as we can tell).
2790 * If this advertises a larger window than any other segment,
2791 * then remember the size of the advertised window.
2792 * Make sure ACK/DELACK conditions are cleared before
2793 * we unlock the socket.
2794 */
2795 if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
2796 tp->rcv_adv = tp->rcv_nxt + recwin;
2797 tp->last_ack_sent = tp->rcv_nxt;
2798 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
2799 tp->t_timer[TCPT_DELACK] = 0;
2800 tp->t_unacksegs = 0;
2801
2802 /* Increment the count of outstanding send operations */
2803 inp->inp_sndinprog_cnt++;
2804
2805 /*
2806 * If allowed, unlock TCP socket while in IP
2807 * but only if the connection is established and
2808 * in a normal mode where reentrancy on the tcpcb won't be
2809 * an issue:
2810 * - there is no SACK episode
2811 * - we're not in Fast Recovery mode
2812 * - if we're not sending from an upcall.
2813 */
2814 if (tcp_output_unlocked && !so->so_upcallusecount &&
2815 (tp->t_state == TCPS_ESTABLISHED) && (sack_in_progress == 0) &&
2816 !IN_FASTRECOVERY(tp)) {
2817
2818 unlocked = TRUE;
2819 socket_unlock(so, 0);
2820 }
2821
2822 /*
2823 * Don't send down a chain of packets when:
2824 * - TCP chaining is disabled
2825 * - there is an IPsec rule set
2826 * - there is a non-default rule set for the firewall
2827 */
2828
2829 chain = tcp_packet_chaining > 1
2830 #if IPSEC
2831 && ipsec_bypass
2832 #endif
2833 #if IPFIREWALL
2834 && (fw_enable == 0 || fw_bypass)
2835 #endif
2836 ; // I'm important, not extraneous
2837
2838
2839 while (pkt != NULL) {
2840 struct mbuf *npkt = pkt->m_nextpkt;
2841
2842 if (!chain) {
2843 pkt->m_nextpkt = NULL;
2844 /*
2845 * If we are not chaining, make sure to set the packet
2846 * list count to 0 so that IP takes the right path;
2847 * this is important for cases such as IPSec where a
2848 * single mbuf might result in multiple mbufs as part
2849 * of the encapsulation. If a non-zero count is passed
2850 * down to IP, the head of the chain might change and
2851 * we could end up skipping it (thus generating bogus
2852 * packets). Fixing it in IP would be desirable, but
2853 * for now this will do.
2854 */
2855 cnt = 0;
2856 }
2857 #if INET6
2858 if (isipv6) {
2859 error = ip6_output_list(pkt, cnt,
2860 inp->in6p_outputopts, &ro6, flags, NULL, NULL,
2861 &ip6oa);
2862 ifdenied = (ip6oa.ip6oa_retflags & IP6OARF_IFDENIED);
2863 } else {
2864 #endif /* INET6 */
2865 error = ip_output_list(pkt, cnt, opt, &ro, flags, NULL,
2866 &ipoa);
2867 ifdenied = (ipoa.ipoa_retflags & IPOARF_IFDENIED);
2868 }
2869
2870 if (chain || error) {
2871 /*
2872 * If we sent down a chain then we are done since
2873 * the callee had taken care of everything; else
2874 * we need to free the rest of the chain ourselves.
2875 */
2876 if (!chain)
2877 m_freem_list(npkt);
2878 break;
2879 }
2880 pkt = npkt;
2881 }
2882
2883 if (unlocked)
2884 socket_lock(so, 0);
2885
2886 /*
2887 * Enter flow controlled state if the connection is established
2888 * and is not in recovery.
2889 *
2890 * A connection will enter suspended state even if it is in
2891 * recovery.
2892 */
2893 if (((adv->code == FADV_FLOW_CONTROLLED && !IN_FASTRECOVERY(tp)) ||
2894 adv->code == FADV_SUSPENDED) &&
2895 !(tp->t_flags & TF_CLOSING) &&
2896 tp->t_state == TCPS_ESTABLISHED) {
2897 int rc;
2898 rc = inp_set_fc_state(inp, adv->code);
2899
2900 if (rc == 1)
2901 tcp_ccdbg_trace(tp, NULL,
2902 ((adv->code == FADV_FLOW_CONTROLLED) ?
2903 TCP_CC_FLOW_CONTROL : TCP_CC_SUSPEND));
2904 }
2905
2906 /*
2907 * When an interface queue gets suspended, some of the
2908 * packets are dropped. Return ENOBUFS to update the
2909 * pcb state.
2910 */
2911 if (adv->code == FADV_SUSPENDED)
2912 error = ENOBUFS;
2913
2914 VERIFY(inp->inp_sndinprog_cnt > 0);
2915 if (--inp->inp_sndinprog_cnt == 0)
2916 inp->inp_flags &= ~(INP_FC_FEEDBACK);
2917
2918 #if INET6
2919 if (isipv6) {
2920 if (ro6.ro_rt != NULL)
2921 outif = ro6.ro_rt->rt_ifp;
2922 } else
2923 #endif /* INET6 */
2924 if (ro.ro_rt != NULL)
2925 outif = ro.ro_rt->rt_ifp;
2926
2927 if (outif != NULL && outif != inp->inp_last_outifp &&
2928 so->so_snd.sb_cc > 0) {
2929 /* Update the send byte count */
2930 if (so->so_snd.sb_flags & SB_SNDBYTE_CNT) {
2931 inp_decr_sndbytes_total(so, so->so_snd.sb_cc);
2932 inp_decr_sndbytes_allunsent(so, tp->snd_una);
2933 so->so_snd.sb_flags &= ~SB_SNDBYTE_CNT;
2934 }
2935 inp->inp_last_outifp = outif;
2936 }
2937
2938 if (error != 0 && ifdenied &&
2939 (INP_NO_CELLULAR(inp) || INP_NO_EXPENSIVE(inp)))
2940 soevent(so,
2941 (SO_FILT_HINT_LOCKED|SO_FILT_HINT_IFDENIED));
2942
2943 /* Synchronize cached PCB route & options */
2944 #if INET6
2945 if (isipv6)
2946 in6p_route_copyin(inp, &ro6);
2947 else
2948 #endif /* INET6 */
2949 inp_route_copyin(inp, &ro);
2950
2951 if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift == 0 &&
2952 tp->t_inpcb->inp_route.ro_rt != NULL) {
2953 /* If we found the route and there is an rtt on it,
2954 * reset the retransmit timer
2955 */
2956 tcp_getrt_rtt(tp, tp->t_inpcb->in6p_route.ro_rt);
2957 tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);
2958 }
2959 return (error);
2960 }
2961
2962 void
2963 tcp_setpersist(struct tcpcb *tp)
2964 {
2965 int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
2966
2967 /* If a PERSIST_TIMER option was set, we will limit the
2968 * time the persist timer will be active for that connection
2969 * in order to avoid a DoS via zero-window probes.
2970 * See rdar://5805356
2971 */
2972
2973 if ((tp->t_persist_timeout != 0) &&
2974 (tp->t_timer[TCPT_PERSIST] == 0) &&
2975 (tp->t_persist_stop == 0)) {
2976 tp->t_persist_stop = tcp_now + tp->t_persist_timeout;
2977 }
2978
2979 /*
2980 * Start/restart persistence timer.
2981 */
2982 TCPT_RANGESET(tp->t_timer[TCPT_PERSIST],
2983 t * tcp_backoff[tp->t_rxtshift],
2984 TCPTV_PERSMIN, TCPTV_PERSMAX, 0);
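	/*
	 * Backoff example (illustrative, with the conventional
	 * tcp_backoff table {1, 2, 4, 8, ...}): if
	 * t = ((t_srtt >> 2) + t_rttvar) >> 1 works out to 50 ticks
	 * and t_rxtshift = 2, the next zero-window probe fires after
	 * 50 * 4 = 200 ticks, subject to the
	 * [TCPTV_PERSMIN, TCPTV_PERSMAX] clamp.
	 */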
2985 tp->t_timer[TCPT_PERSIST] = OFFSET_FROM_START(tp, tp->t_timer[TCPT_PERSIST]);
2986
2987 if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
2988 tp->t_rxtshift++;
2989 }
2990
2991 /*
2992 * Send as many ACKs as the coalesced data warrants: one for every other
2993 * packet when stretch ACK is not enabled, one for every 8 packets when it is.
2994 */
2995 static struct mbuf*
2996 tcp_send_lroacks(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th)
2997 {
2998 struct mbuf *mnext = NULL, *ack_chain = NULL, *tail = NULL;
2999 int count = 0;
3000 tcp_seq org_ack = ntohl(th->th_ack);
3001 tcp_seq prev_ack = 0;
3002 int tack_offset = 28; /* IPv6 and IP options not supported */
3003 int twin_offset = 34; /* IPv6 and IP options not supported */
3004 int ack_size = (tp->t_flags & TF_STRETCHACK) ?
3005 (maxseg_unacked * tp->t_maxseg) : (tp->t_maxseg << 1);
3006 int segs_acked = (tp->t_flags & TF_STRETCHACK) ? maxseg_unacked : 2;
3007 struct mbuf *prev_ack_pkt = NULL;
3008 struct socket *so = tp->t_inpcb->inp_socket;
3009 unsigned short winsz = ntohs(th->th_win);
3010 unsigned int scaled_win = winsz<<tp->rcv_scale;
3011 tcp_seq win_rtedge = org_ack + scaled_win;
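	/*
	 * Spacing example (illustrative): 8 coalesced 1460-byte
	 * segments give t_lropktlen = 11680.  Without stretch ACK,
	 * ack_size = 2 * 1460 = 2920, so duplicate ACKs are generated
	 * at org_ack - 8760, org_ack - 5840, ... stepping forward by
	 * ack_size until the final ACK carries org_ack itself.
	 */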
3012
3013 count = tp->t_lropktlen/tp->t_maxseg;
3014
3015 prev_ack = (org_ack - tp->t_lropktlen) + ack_size;
3016 if (prev_ack < org_ack) {
3017 ack_chain = m_dup(m, M_DONTWAIT);
3018 if (ack_chain) {
3019 th->th_ack = htonl(prev_ack);
3020 /* Keep adv window constant for duplicated ACK packets */
3021 scaled_win = win_rtedge - prev_ack;
3022 if (scaled_win > (int32_t)(TCP_MAXWIN << tp->rcv_scale))
3023 scaled_win = (int32_t)(TCP_MAXWIN << tp->rcv_scale);
3024 th->th_win = htons(scaled_win>>tp->rcv_scale);
3025 if (lrodebug == 5) {
3026 printf("%s: win = %d winsz = %d sc = %d"
3027 " lro_len %d %d\n",
3028 __func__, scaled_win>>tp->rcv_scale, winsz,
3029 tp->rcv_scale, tp->t_lropktlen, count);
3030 }
3031 tail = ack_chain;
3032 count -= segs_acked; /* accounts for prev_ack packet */
3033 count = (count <= segs_acked) ? 0 : count - segs_acked;
3034 tcpstat.tcps_sndacks++;
3035 so_tc_update_stats(m, so, m_get_service_class(m));
3036 } else {
3037 return NULL;
3038 }
3039 }
3040 else {
3041 tp->t_lropktlen = 0;
3042 return NULL;
3043 }
3044
3045 prev_ack_pkt = ack_chain;
3046
3047 while (count > 0) {
3048 if ((prev_ack + ack_size) < org_ack) {
3049 prev_ack += ack_size;
3050 } else {
3051 /*
3052 * The last ACK sent must have the ACK number that TCP
3053 * thinks is the last sent ACK number.
3054 */
3055 prev_ack = org_ack;
3056 }
3057 mnext = m_dup(prev_ack_pkt, M_DONTWAIT);
3058 if (mnext) {
3059 /* Keep adv window constant for duplicated ACK packets */
3060 scaled_win = win_rtedge - prev_ack;
3061 if (scaled_win > (int32_t)(TCP_MAXWIN << tp->rcv_scale))
3062 scaled_win = (int32_t)(TCP_MAXWIN << tp->rcv_scale);
3063 winsz = htons(scaled_win>>tp->rcv_scale);
3064 if (lrodebug == 5) {
3065 printf("%s: winsz = %d ack %x count %d\n",
3066 __func__, scaled_win>>tp->rcv_scale,
3067 prev_ack, count);
3068 }
3069 bcopy(&winsz, mtod(prev_ack_pkt, caddr_t) + twin_offset, 2);
3070 HTONL(prev_ack);
3071 bcopy(&prev_ack, mtod(prev_ack_pkt, caddr_t) + tack_offset, 4);
3072 NTOHL(prev_ack);
3073 tail->m_nextpkt = mnext;
3074 tail = mnext;
3075 count -= segs_acked;
3076 tcpstat.tcps_sndacks++;
3077 so_tc_update_stats(m, so, m_get_service_class(m));
3078 } else {
3079 if (lrodebug == 5) {
3080 printf("%s: failed to alloc mbuf.\n", __func__);
3081 }
3082 break;
3083 }
3084 prev_ack_pkt = mnext;
3085 }
3086 tp->t_lropktlen = 0;
3087 return ack_chain;
3088 }
3089
3090 static int
3091 tcp_recv_throttle (struct tcpcb *tp)
3092 {
3093 uint32_t base_rtt, newsize;
3094 struct sockbuf *sbrcv = &tp->t_inpcb->inp_socket->so_rcv;
3095
3096 if (tcp_use_rtt_recvbg == 1 &&
3097 TSTMP_SUPPORTED(tp)) {
3098 /*
3099 * Timestamps are supported on this connection. Use
3100 * RTT to look for an increase in latency.
3101 */
3102
3103 /*
3104 * If the connection is already being throttled, leave it
3105 * in that state until rtt comes closer to base rtt
3106 */
3107 if (tp->t_flagsext & TF_RECV_THROTTLE)
3108 return (1);
3109
3110 base_rtt = get_base_rtt(tp);
3111
3112 if (base_rtt != 0 && tp->t_rttcur != 0) {
3113 /*
3114 * if latency increased on a background flow,
3115 * return 1 to start throttling.
3116 */
3117 if (tp->t_rttcur > (base_rtt + target_qdelay)) {
3118 tp->t_flagsext |= TF_RECV_THROTTLE;
3119 if (tp->t_recv_throttle_ts == 0)
3120 tp->t_recv_throttle_ts = tcp_now;
3121 /*
3122 * Reduce the recv socket buffer size to
3123 * minimize latency.
3124 */
3125 if (sbrcv->sb_idealsize >
3126 tcp_recv_throttle_minwin) {
3127 newsize = sbrcv->sb_idealsize >> 1;
3128 /* Set a minimum of 16 K */
3129 newsize =
3130 max(newsize,
3131 tcp_recv_throttle_minwin);
3132 sbrcv->sb_idealsize = newsize;
3133 }
3134 return (1);
3135 } else {
3136 return (0);
3137 }
3138 }
3139 }
3140
3141 /*
3142 * Timestamps are not supported or there is no good RTT
3143 * measurement. Use IPDV in this case.
3144 */
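	/*
	 * acc_iaj is the accumulated inter-arrival jitter of received
	 * segments; when it exceeds tcp_acc_iaj_react_limit, the
	 * growing delay variation is treated as a sign of queue
	 * buildup and the background flow is throttled.
	 */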
3145 if (tp->acc_iaj > tcp_acc_iaj_react_limit)
3146 return (1);
3147
3148 return (0);
3149 }