apple/xnu (xnu-201) - bsd/netinet/tcp_output.c
/*
 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_output.c	8.4 (Berkeley) 5/24/95
 */

#if ISFB31
#include "opt_tcpdebug.h"
#endif
#define _IP_VHL

#include <stddef.h>

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>

#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#if INET6
#include <netinet/ip6.h>
#include <netinet/ip_var.h>
#include <netinet6/ip6_var.h>
#endif
#include <netinet/in_pcb.h>
#include <netinet/tcp.h>
#define	TCPOUTFLAGS
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#if TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
#include <sys/kdebug.h>

#define DBG_LAYER_BEG		NETDBG_CODE(DBG_NETTCP, 1)
#define DBG_LAYER_END		NETDBG_CODE(DBG_NETTCP, 3)
#define DBG_FNC_TCP_OUTPUT	NETDBG_CODE(DBG_NETTCP, (4 << 8) | 1)


#ifdef notyet
extern struct mbuf *m_copypack();
#endif


/*
 * Tcp output routine: figure out what should be sent and send it.
 */
int
tcp_output(tp)
	register struct tcpcb *tp;
{
	register struct socket *so = tp->t_inpcb->inp_socket;
	register long len, win;
	int off, flags, error;
	register struct mbuf *m;
	struct ip *ip = NULL;
	struct ipovly *ipov = NULL;
#if INET6
	struct ip6_hdr *ip6 = NULL;
#endif /* INET6 */
	struct tcphdr *th;
	u_char opt[TCP_MAXOLEN];
	unsigned ipoptlen, optlen, hdrlen;
	int idle, sendalot;
	struct rmxp_tao *taop;
	struct rmxp_tao tao_noncached;
#if INET6
	int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV4) == 0;
#endif

	KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0);
	KERNEL_DEBUG(DBG_LAYER_BEG,
	    ((tp->t_template->tt_dport << 16) | tp->t_template->tt_sport),
	    (((tp->t_template->tt_src.s_addr & 0xffff) << 16) |
	     (tp->t_template->tt_dst.s_addr & 0xffff)),
	    0,0,0);

	/*
	 * Determine length of data that should be transmitted,
	 * and flags that will be used.
	 * If there is some data or critical controls (SYN, RST)
	 * to send, then transmit; otherwise, investigate further.
	 */
	idle = (tp->snd_max == tp->snd_una);
	if (idle && tp->t_idle >= tp->t_rxtcur)
		/*
		 * We have been idle for "a while" and no acks are
		 * expected to clock out any data we send --
		 * slow start to get ack "clock" running again.
		 */
		tp->snd_cwnd = tp->t_maxseg;

	/* Never send data that's already been acked */
	if (SEQ_GT(tp->snd_una, tp->snd_nxt))
		tp->snd_nxt = tp->snd_una;
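	/*
	 * Loop back to "again" whenever sendalot is set below, i.e. when
	 * more data remains after this segment or option insertion forced
	 * us to shorten it.
	 */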
again:
	sendalot = 0;
	off = tp->snd_nxt - tp->snd_una;
	win = min(tp->snd_wnd, tp->snd_cwnd);

	flags = tcp_outflags[tp->t_state];
	/*
	 * Get standard flags, and add SYN or FIN if requested by 'hidden'
	 * state flags.
	 */
	if (tp->t_flags & TF_NEEDFIN)
		flags |= TH_FIN;
	if (tp->t_flags & TF_NEEDSYN)
		flags |= TH_SYN;

	/*
	 * If in persist timeout with window of 0, send 1 byte.
	 * Otherwise, if window is small but nonzero
	 * and timer expired, we will send what we can
	 * and go to transmit state.
	 */
	if (tp->t_force) {
		if (win == 0) {
			/*
			 * If we still have some data to send, then
			 * clear the FIN bit.  Usually this would
			 * happen below when it realizes that we
			 * aren't sending all the data.  However,
			 * if we have exactly 1 byte of unsent data,
			 * then it won't clear the FIN bit below,
			 * and if we are in persist state, we wind
			 * up sending the packet without recording
			 * that we sent the FIN bit.
			 *
			 * We can't just blindly clear the FIN bit,
			 * because if we don't have any more data
			 * to send then the probe will be the FIN
			 * itself.
			 */
			if (off < so->so_snd.sb_cc)
				flags &= ~TH_FIN;
			win = 1;
		} else {
			tp->t_timer[TCPT_PERSIST] = 0;
			tp->t_rxtshift = 0;
		}
	}

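	/*
	 * The amount of new data we may send is limited by what is queued
	 * in the send buffer and by the usable window, less the "off"
	 * bytes that are already in flight.
	 */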
	len = (long)ulmin(so->so_snd.sb_cc, win) - off;

	if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) {
		taop = &tao_noncached;
		bzero(taop, sizeof(*taop));
	}

	/*
	 * Lop off the SYN bit if it has already been sent.  However, if we
	 * are in SYN-SENT state, the segment contains data, and we don't
	 * know that the foreign host supports TAO, suppress sending the
	 * segment.
	 */
	if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
		flags &= ~TH_SYN;
		off--, len++;
		if (len > 0 && tp->t_state == TCPS_SYN_SENT &&
		    taop->tao_ccsent == 0) {
			KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
			return 0;
		}
	}

	/*
	 * Be careful not to send data and/or FIN on SYN segments
	 * in cases when no CC option will be sent.
	 * This measure is needed to prevent interoperability problems
	 * with not fully conformant TCP implementations.
	 */
	if ((flags & TH_SYN) &&
	    ((tp->t_flags & TF_NOOPT) || !(tp->t_flags & TF_REQ_CC) ||
	     ((flags & TH_ACK) && !(tp->t_flags & TF_RCVD_CC)))) {
		len = 0;
		flags &= ~TH_FIN;
	}

	if (len < 0) {
		/*
		 * If FIN has been sent but not acked,
		 * but we haven't been called to retransmit,
		 * len will be -1.  Otherwise, window shrank
		 * after we sent into it.  If window shrank to 0,
		 * cancel pending retransmit, pull snd_nxt back
		 * to (closed) window, and set the persist timer
		 * if it isn't already going.  If the window didn't
		 * close completely, just wait for an ACK.
		 */
		len = 0;
		if (win == 0) {
			tp->t_timer[TCPT_REXMT] = 0;
			tp->t_rxtshift = 0;
			tp->snd_nxt = tp->snd_una;
			if (tp->t_timer[TCPT_PERSIST] == 0)
				tcp_setpersist(tp);
		}
	}
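	/* Send at most one MSS worth of data per segment; loop for the rest. */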
	if (len > tp->t_maxseg) {
		len = tp->t_maxseg;
		sendalot = 1;
	}
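	/*
	 * Clear FIN unless this segment ends at the last byte currently
	 * queued in the send buffer.
	 */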
	if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
		flags &= ~TH_FIN;

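	/* From here on, "win" is the receive window we are willing to advertise. */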
	win = sbspace(&so->so_rcv);

	/*
	 * Sender silly window avoidance.  If there is data to send, transmit
	 * when we can send a full-sized segment, when this segment would
	 * empty the send buffer (and we are idle or using TF_NODELAY), or
	 * when we are being forced to send; otherwise don't bother.
	 * If peer's buffer is tiny, then send
	 * when window is at least half open.
	 * If retransmitting (possibly after persist timer forced us
	 * to send into a small window), then must resend.
	 */
	if (len) {
		if (len == tp->t_maxseg)
			goto send;
		if (!(tp->t_flags & TF_MORETOCOME) &&
		    (idle || tp->t_flags & TF_NODELAY) &&
		    (tp->t_flags & TF_NOPUSH) == 0 &&
		    len + off >= so->so_snd.sb_cc)
			goto send;
		if (tp->t_force)
			goto send;
		if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
			goto send;
		if (SEQ_LT(tp->snd_nxt, tp->snd_max))
			goto send;
	}

	/*
	 * Compare available window to amount of window
	 * known to peer (as advertised window less
	 * next expected input).  If the difference is at least two
	 * max size segments, or at least 50% of the maximum possible
	 * window, then want to send a window update to peer.
	 */
	if (win > 0) {
		/*
		 * "adv" is the amount we can increase the window,
		 * taking into account that we are limited by
		 * TCP_MAXWIN << tp->rcv_scale.
		 */
		long adv = min(win, (long)TCP_MAXWIN << tp->rcv_scale) -
			(tp->rcv_adv - tp->rcv_nxt);

		if (adv >= (long) (2 * tp->t_maxseg))
			goto send;
		if (2 * adv >= (long) so->so_rcv.sb_hiwat)
			goto send;
	}

	/*
	 * Send if we owe peer an ACK.
	 */
	if (tp->t_flags & TF_ACKNOW)
		goto send;
	if ((flags & TH_RST) ||
	    ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))
		goto send;
	if (SEQ_GT(tp->snd_up, tp->snd_una))
		goto send;
	/*
	 * If our state indicates that FIN should be sent
	 * and we have not yet done so, or we're retransmitting the FIN,
	 * then we need to send.
	 */
	if (flags & TH_FIN &&
	    ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
		goto send;

	/*
	 * TCP window updates are not reliable, rather a polling protocol
	 * using ``persist'' packets is used to ensure receipt of window
	 * updates.  The three ``states'' for the output side are:
	 *	idle			not doing retransmits or persists
	 *	persisting		to move a small or zero window
	 *	(re)transmitting	and thereby not persisting
	 *
	 * tp->t_timer[TCPT_PERSIST]
	 *	is set when we are in persist state.
	 * tp->t_force
	 *	is set when we are called to send a persist packet.
	 * tp->t_timer[TCPT_REXMT]
	 *	is set when we are retransmitting
	 * The output side is idle when both timers are zero.
	 *
	 * If send window is too small, there is data to transmit, and no
	 * retransmit or persist is pending, then go to persist state.
	 * If nothing happens soon, send when timer expires:
	 * if window is nonzero, transmit what we can,
	 * otherwise force out a byte.
	 */
	if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 &&
	    tp->t_timer[TCPT_PERSIST] == 0) {
		tp->t_rxtshift = 0;
		tcp_setpersist(tp);
	}

	/*
	 * No reason to send a segment, just return.
	 */
	KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
	return (0);

send:
	/*
	 * Before ESTABLISHED, force sending of initial options
	 * unless TCP set not to do any options.
	 * NOTE: we assume that the IP/TCP header plus TCP options
	 * always fit in a single mbuf, leaving room for a maximum
	 * link header, i.e.
	 *	max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MHLEN
	 */
	optlen = 0;
#if INET6
	if (isipv6)
		hdrlen = sizeof (struct tcpip6hdr);
	else
#endif
	hdrlen = sizeof (struct tcpiphdr);
	if (flags & TH_SYN) {
		tp->snd_nxt = tp->iss;
		if ((tp->t_flags & TF_NOOPT) == 0) {
			u_short mss;

			opt[0] = TCPOPT_MAXSEG;
			opt[1] = TCPOLEN_MAXSEG;
			mss = htons((u_short) tcp_mssopt(tp, isipv6));
			(void)memcpy(opt + 2, &mss, sizeof(mss));
			optlen = TCPOLEN_MAXSEG;

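			/*
			 * Offer window scaling if we want it (TF_REQ_SCALE)
			 * and either this SYN carries no ACK (active open) or
			 * the peer offered scaling in its own SYN.
			 */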
			if ((tp->t_flags & TF_REQ_SCALE) &&
			    ((flags & TH_ACK) == 0 ||
			     (tp->t_flags & TF_RCVD_SCALE))) {
				*((u_int32_t *)(opt + optlen)) = htonl(
					TCPOPT_NOP << 24 |
					TCPOPT_WINDOW << 16 |
					TCPOLEN_WINDOW << 8 |
					tp->request_r_scale);
				optlen += 4;
			}
		}
	}

	/*
	 * Send a timestamp and echo-reply if this is a SYN and our side
	 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
	 * and our peer have sent timestamps in our SYN's.
	 */
	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
	    (flags & TH_RST) == 0 &&
	    ((flags & TH_ACK) == 0 ||
	     (tp->t_flags & TF_RCVD_TSTMP))) {
		u_int32_t *lp = (u_int32_t *)(opt + optlen);

		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(tcp_now);
		*lp = htonl(tp->ts_recent);
		optlen += TCPOLEN_TSTAMP_APPA;
	}

	/*
	 * Send `CC-family' options if our side wants to use them (TF_REQ_CC),
	 * options are allowed (!TF_NOOPT) and it's not a RST.
	 */
	if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
	    (flags & TH_RST) == 0) {
		switch (flags & (TH_SYN|TH_ACK)) {
		/*
		 * This is a normal ACK, send CC if we received CC before
		 * from our peer.
		 */
		case TH_ACK:
			if (!(tp->t_flags & TF_RCVD_CC))
				break;
			/*FALLTHROUGH*/

		/*
		 * We can only get here in T/TCP's SYN_SENT* state, when
		 * we're sending a non-SYN segment without waiting for
		 * the ACK of our SYN.  A check above assures that we only
		 * do this if our peer understands T/TCP.
		 */
		case 0:
			opt[optlen++] = TCPOPT_NOP;
			opt[optlen++] = TCPOPT_NOP;
			opt[optlen++] = TCPOPT_CC;
			opt[optlen++] = TCPOLEN_CC;
			*(u_int32_t *)&opt[optlen] = htonl(tp->cc_send);

			optlen += 4;
			break;

		/*
		 * This is our initial SYN, check whether we have to use
		 * CC or CC.new.
		 */
		case TH_SYN:
			opt[optlen++] = TCPOPT_NOP;
			opt[optlen++] = TCPOPT_NOP;
			opt[optlen++] = tp->t_flags & TF_SENDCCNEW ?
						TCPOPT_CCNEW : TCPOPT_CC;
			opt[optlen++] = TCPOLEN_CC;
			*(u_int32_t *)&opt[optlen] = htonl(tp->cc_send);
			optlen += 4;
			break;

		/*
		 * This is a SYN,ACK; send CC and CC.echo if we received
		 * CC from our peer.
		 */
		case (TH_SYN|TH_ACK):
			if (tp->t_flags & TF_RCVD_CC) {
				opt[optlen++] = TCPOPT_NOP;
				opt[optlen++] = TCPOPT_NOP;
				opt[optlen++] = TCPOPT_CC;
				opt[optlen++] = TCPOLEN_CC;
				*(u_int32_t *)&opt[optlen] =
					htonl(tp->cc_send);
				optlen += 4;
				opt[optlen++] = TCPOPT_NOP;
				opt[optlen++] = TCPOPT_NOP;
				opt[optlen++] = TCPOPT_CCECHO;
				opt[optlen++] = TCPOLEN_CC;
				*(u_int32_t *)&opt[optlen] =
					htonl(tp->cc_recv);
				optlen += 4;
			}
			break;
		}
	}

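	/*
	 * TCP options are settled; account for their length in the header
	 * size, then compute the space taken by IP options (and IPsec
	 * headers, if configured) so the payload can be trimmed to fit.
	 */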
	hdrlen += optlen;
#if INET6
	if (isipv6)
		ipoptlen = ip6_optlen(tp->t_inpcb);
	else
#endif
	if (tp->t_inpcb->inp_options) {
		ipoptlen = tp->t_inpcb->inp_options->m_len -
				offsetof(struct ipoption, ipopt_list);
	} else {
		ipoptlen = 0;
	}
#if IPSEC
#if INET6
	ipoptlen += ipsec_hdrsiz_tcp(tp, isipv6);
#else
	ipoptlen += ipsec_hdrsiz_tcp(tp, 0);
#endif
#endif

	/*
	 * Adjust data length if insertion of options will
	 * bump the packet length beyond the t_maxopd length.
	 * Clear the FIN bit because we cut off the tail of
	 * the segment.
	 */
	if (len + optlen + ipoptlen > tp->t_maxopd) {
		/*
		 * If there is still more to send, don't close the connection.
		 */
		flags &= ~TH_FIN;
		len = tp->t_maxopd - optlen - ipoptlen;
		sendalot = 1;
	}

/*#ifdef DIAGNOSTIC*/
	if (max_linkhdr + hdrlen > MHLEN)
		panic("tcphdr too big");
/*#endif*/

	/*
	 * Grab a header mbuf, attaching a copy of data to
	 * be transmitted, and initialize the header from
	 * the template for sends on this connection.
	 */
	if (len) {
		if (tp->t_force && len == 1)
			tcpstat.tcps_sndprobe++;
		else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
			tcpstat.tcps_sndrexmitpack++;
			tcpstat.tcps_sndrexmitbyte += len;
		} else {
			tcpstat.tcps_sndpack++;
			tcpstat.tcps_sndbyte += len;
		}
#ifdef notyet
		if ((m = m_copypack(so->so_snd.sb_mb, off,
		    (int)len, max_linkhdr + hdrlen)) == 0) {
			error = ENOBUFS;
			goto out;
		}
		/*
		 * m_copypack left space for our hdr; use it.
		 */
		m->m_len += hdrlen;
		m->m_data -= hdrlen;
#else
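		/*
		 * Allocate a packet header mbuf for the headers and, when the
		 * payload is small enough, the data itself; larger payloads
		 * are referenced from the send buffer via m_copy() instead of
		 * being copied.
		 */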
		MGETHDR(m, M_DONTWAIT, MT_HEADER);
		if (m == NULL) {
			error = ENOBUFS;
			goto out;
		}
#if INET6
		if (MHLEN < hdrlen + max_linkhdr) {
			MCLGET(m, M_DONTWAIT);
			if ((m->m_flags & M_EXT) == 0) {
				m_freem(m);
				error = ENOBUFS;
				goto out;
			}
		}
#endif
		m->m_data += max_linkhdr;
		m->m_len = hdrlen;
		if (len <= MHLEN - hdrlen - max_linkhdr) {
			m_copydata(so->so_snd.sb_mb, off, (int) len,
			    mtod(m, caddr_t) + hdrlen);
			m->m_len += len;
		} else {
			m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len);
			if (m->m_next == 0) {
				(void) m_free(m);
				error = ENOBUFS;
				goto out;
			}
		}
#endif
		/*
		 * If we're sending everything we've got, set PUSH.
		 * (This will keep happy those implementations which only
		 * give data to the user when a buffer fills or
		 * a PUSH comes in.)
		 */
		if (off + len == so->so_snd.sb_cc)
			flags |= TH_PUSH;
	} else {
		if (tp->t_flags & TF_ACKNOW)
			tcpstat.tcps_sndacks++;
		else if (flags & (TH_SYN|TH_FIN|TH_RST))
			tcpstat.tcps_sndctrl++;
		else if (SEQ_GT(tp->snd_up, tp->snd_una))
			tcpstat.tcps_sndurg++;
		else
			tcpstat.tcps_sndwinup++;

		MGETHDR(m, M_DONTWAIT, MT_HEADER);
		if (m == NULL) {
			error = ENOBUFS;
			goto out;
		}
#if INET6
		if (isipv6) {
			MH_ALIGN(m, hdrlen);
		} else
#endif
		m->m_data += max_linkhdr;
		m->m_len = hdrlen;
	}
	m->m_pkthdr.rcvif = (struct ifnet *)0;
	if (tp->t_template == 0)
		panic("tcp_output");
#if INET6
	if (isipv6) {
		ip6 = mtod(m, struct ip6_hdr *);
		th = (struct tcphdr *)(ip6 + 1);
		bcopy((caddr_t)&tp->t_template->tt_i6, (caddr_t)ip6,
		    sizeof(struct ip6_hdr));
		bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th,
		    sizeof(struct tcphdr));
	} else {
#endif /* INET6 */
	ip = mtod(m, struct ip *);
	ipov = (struct ipovly *)ip;
	th = (struct tcphdr *)(ip + 1);
	bcopy((caddr_t)&tp->t_template->tt_i, (caddr_t)ip, sizeof(struct ip));
	bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th,
	    sizeof(struct tcphdr));
#if INET6
	}
#endif /* INET6 */

	/*
	 * Fill in fields, remembering maximum advertised
	 * window for use in delaying messages about window sizes.
	 * If resending a FIN, be sure not to use a new sequence number.
	 */
	if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
	    tp->snd_nxt == tp->snd_max)
		tp->snd_nxt--;
	/*
	 * If we are doing retransmissions, then snd_nxt will
	 * not reflect the first unsent octet.  For ACK only
	 * packets, we do not want the sequence number of the
	 * retransmitted packet, we want the sequence number
	 * of the next unsent octet.  So, if there is no data
	 * (and no SYN or FIN), use snd_max instead of snd_nxt
	 * when filling in ti_seq.  But if we are in persist
	 * state, snd_max might reflect one byte beyond the
	 * right edge of the window, so use snd_nxt in that
	 * case, since we know we aren't doing a retransmission.
	 * (retransmit and persist are mutually exclusive...)
	 */
	if (len || (flags & (TH_SYN|TH_FIN)) || tp->t_timer[TCPT_PERSIST])
		th->th_seq = htonl(tp->snd_nxt);
	else
		th->th_seq = htonl(tp->snd_max);
	th->th_ack = htonl(tp->rcv_nxt);
	if (optlen) {
		bcopy(opt, th + 1, optlen);
		th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
	}
	th->th_flags = flags;
	/*
	 * Calculate receive window.  Don't shrink window,
	 * but avoid silly window syndrome.
	 */
	if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg)
		win = 0;
	if (win < (long)(tp->rcv_adv - tp->rcv_nxt))
		win = (long)(tp->rcv_adv - tp->rcv_nxt);
	if (win > (long)TCP_MAXWIN << tp->rcv_scale)
		win = (long)TCP_MAXWIN << tp->rcv_scale;
	th->th_win = htons((u_short) (win>>tp->rcv_scale));
	if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
		th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
		th->th_flags |= TH_URG;
	} else
		/*
		 * If no urgent pointer to send, then we pull
		 * the urgent pointer to the left edge of the send window
		 * so that it doesn't drift into the send window on sequence
		 * number wraparound.
		 */
		tp->snd_up = tp->snd_una;		/* drag it along */

	/*
	 * Put TCP length in extended header, and then
	 * checksum extended header and data.
	 */
	m->m_pkthdr.len = hdrlen + len;
#if INET6
	if (isipv6) {
#if 0	/* ip6_plen will be filled in ip6_output. */
		ip6->ip6_plen = htons((u_short)(sizeof(struct tcphdr) +
		    optlen + len));
#endif

		th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
		    sizeof(struct tcphdr) + optlen + len);
	} else
#endif /* INET6 */
	{

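		/*
		 * th_sum was initialized from the connection's header
		 * template and serves as the pseudo-header checksum here:
		 * fold in the option and data length, and let the
		 * checksum-offload path (CSUM_TCP, csum_data = offset of
		 * th_sum) finish the TCP checksum over header and payload.
		 */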
		if (len + optlen)
			ipov->ih_len = htons((u_short)(sizeof (struct tcphdr) +
			    optlen + len));
		m->m_pkthdr.csum_flags = CSUM_TCP;
		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
		if (len + optlen) {
			th->th_sum = in_addword(th->th_sum,
			    htons((u_short)(optlen + len)));
		}

	}

	/*
	 * In transmit state, time the transmission and arrange for
	 * the retransmit.  In persist state, just set snd_max.
	 */
	if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) {
		tcp_seq startseq = tp->snd_nxt;

		/*
		 * Advance snd_nxt over sequence space of this segment.
		 */
		if (flags & (TH_SYN|TH_FIN)) {
			if (flags & TH_SYN)
				tp->snd_nxt++;
			if (flags & TH_FIN) {
				tp->snd_nxt++;
				tp->t_flags |= TF_SENTFIN;
			}
		}
		tp->snd_nxt += len;
		if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
			tp->snd_max = tp->snd_nxt;
			/*
			 * Time this transmission if not a retransmission and
			 * not currently timing anything.
			 */
			if (tp->t_rtt == 0) {
				tp->t_rtt = 1;
				tp->t_rtseq = startseq;
				tcpstat.tcps_segstimed++;
			}
		}

		/*
		 * Set retransmit timer if not currently set,
		 * and not doing an ack or a keep-alive probe.
		 * Initial value for retransmit timer is smoothed
		 * round-trip time + 2 * round-trip time variance.
		 * Initialize shift counter which is used for backoff
		 * of retransmit time.
		 */
		if (tp->t_timer[TCPT_REXMT] == 0 &&
		    tp->snd_nxt != tp->snd_una) {
			tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
			if (tp->t_timer[TCPT_PERSIST]) {
				tp->t_timer[TCPT_PERSIST] = 0;
				tp->t_rxtshift = 0;
			}
		}
	} else
		if (SEQ_GT(tp->snd_nxt + len, tp->snd_max))
			tp->snd_max = tp->snd_nxt + len;

#if TCPDEBUG
	/*
	 * Trace.
	 */
	if (so->so_options & SO_DEBUG) {
#if INET6
		if (isipv6)
			ip6->ip6_vfc = IPV6_VERSION;
		else
			ip->ip_vhl = IP_MAKE_VHL(IPVERSION,
			    IP_VHL_HL(ip->ip_vhl));
#endif /* INET6 */
		tcp_trace(TA_OUTPUT, tp->t_state, tp,
#if INET6
		    isipv6 ? (void *)ip6 :
#endif /* INET6 */
		    ip,
		    th, 0);

	}
#endif /* TCPDEBUG */

	/*
	 * Fill in IP length and desired time to live and
	 * send to IP level.  There should be a better way
	 * to handle ttl and tos; we could keep them in
	 * the template, but need a way to checksum without them.
	 */
#if INET6
	if (isipv6) {
		/*
		 * we separately set hoplimit for every segment, since the
		 * user might want to change the value via setsockopt.
		 * Also, desired default hop limit might be changed via
		 * Neighbor Discovery.
		 */
		ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb,
		    tp->t_inpcb->in6p_route.ro_rt ?
		    tp->t_inpcb->in6p_route.ro_rt->rt_ifp
		    : NULL);

		/* TODO: IPv6 IP6TOS_ECT bit on */
#if IPSEC
		ipsec_setsocket(m, so);
#endif /*IPSEC*/
		error = ip6_output(m,
		    tp->t_inpcb->in6p_outputopts,
		    &tp->t_inpcb->in6p_route,
		    (so->so_options & SO_DONTROUTE) /* | IP6_DONTFRAG */,
		    NULL, NULL);
	} else
#endif /* INET6 */
	{
#if 1
		struct rtentry *rt;
#endif
		ip->ip_len = m->m_pkthdr.len;
#if INET6
		if (INP_CHECK_SOCKAF(so, AF_INET6))
			ip->ip_ttl = in6_selecthlim(tp->t_inpcb,
			    tp->t_inpcb->in6p_route.ro_rt ?
			    tp->t_inpcb->in6p_route.ro_rt->rt_ifp
			    : NULL);
		else
#endif /* INET6 */
		ip->ip_ttl = tp->t_inpcb->inp_ip_ttl;	/* XXX */
		ip->ip_tos = tp->t_inpcb->inp_ip_tos;	/* XXX */

#define thtoti(x) \
	((struct tcpiphdr *)(((char *)(x)) - (sizeof (struct ip))))

		KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
		    (((thtoti(th)->ti_src.s_addr & 0xffff) << 16) | (thtoti(th)->ti_dst.s_addr & 0xffff)),
		    th->th_seq, th->th_ack, th->th_win);


#if 1
		/*
		 * See if we should do MTU discovery.  We do it only if the following
		 * are true:
		 *	1) we have a valid route to the destination
		 *	2) the MTU is not locked (if it is, then discovery has been
		 *	   disabled)
		 */
		if ((rt = tp->t_inpcb->inp_route.ro_rt)
		    && rt->rt_flags & RTF_UP
		    && !(rt->rt_rmx.rmx_locks & RTV_MTU)) {
			ip->ip_off |= IP_DF;
		}
#endif

#if IPSEC
		ipsec_setsocket(m, so);
#endif /*IPSEC*/

		error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route,
		    so->so_options & SO_DONTROUTE, 0);
	}
	if (error) {
out:
		if (error == ENOBUFS) {
			if (!tp->t_timer[TCPT_REXMT] &&
			    !tp->t_timer[TCPT_PERSIST])
				tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
			tcp_quench(tp->t_inpcb, 0);
			KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
			return (0);
		}
#if 1
		if (error == EMSGSIZE) {
			/*
			 * ip_output() will have already fixed the route
			 * for us.  tcp_mtudisc() will, as its last action,
			 * initiate retransmission, so it is important to
			 * not do so here.
			 */
			tcp_mtudisc(tp->t_inpcb, 0);
			KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
			return 0;
		}
#endif
		if ((error == EHOSTUNREACH || error == ENETDOWN)
		    && TCPS_HAVERCVDSYN(tp->t_state)) {
			tp->t_softerror = error;
			KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
			return (0);
		}
		KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
		return (error);
	}
	tcpstat.tcps_sndtotal++;

	/*
	 * Data sent (as far as we can tell).
	 * If this advertises a larger window than any other segment,
	 * then remember the size of the advertised window.
	 * Any pending ACK has now been sent.
	 */
	if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
		tp->rcv_adv = tp->rcv_nxt + win;
	tp->last_ack_sent = tp->rcv_nxt;
	tp->t_flags &= ~(TF_ACKNOW|TF_DELACK);
	if (sendalot)
		goto again;
	KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
	return (0);
}

void
tcp_setpersist(tp)
	register struct tcpcb *tp;
{
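	/*
	 * Derive the persist interval from the smoothed round-trip time
	 * estimate; tcp_backoff[] below applies exponential backoff to it.
	 */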
	register int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;

	if (tp->t_timer[TCPT_REXMT])
		panic("tcp_output REXMT");
	/*
	 * Start/restart persistence timer.
	 */
	TCPT_RANGESET(tp->t_timer[TCPT_PERSIST],
	    t * tcp_backoff[tp->t_rxtshift],
	    TCPTV_PERSMIN, TCPTV_PERSMAX);
	if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
		tp->t_rxtshift++;
}