+ /*
+ * Record the interface on which this segment arrived; this does not
+ * affect normal data output (for non-detached TCP) as it provides a
+ * hint about which route and interface to use for sending in the
+ * absence of a PCB, when scoped routing (and thus source interface
+ * selection) are enabled.
+ */
+ if ((m->m_pkthdr.pkt_flags & PKTF_LOOP) || m->m_pkthdr.rcvif == NULL) {
+ /* Looped-back segments or packets without a receive interface carry no usable scope */
+ ifscope = IFSCOPE_NONE;
+ } else {
+ ifscope = m->m_pkthdr.rcvif->if_index;
+ }
+
+ /*
+ * Convert TCP protocol specific fields from network to host byte
+ * order. On big-endian machines these macros are no-ops, hence
+ * the preprocessor guard.
+ */
+
+#if BYTE_ORDER != BIG_ENDIAN
+ NTOHL(th->th_seq);
+ NTOHL(th->th_ack);
+ NTOHS(th->th_win);
+ NTOHS(th->th_urp);
+#endif
+
+ /*
+ * Locate pcb for segment.
+ */
+findpcb:
+
+ /*
+ * Re-entry point: the 5-tuple revalidation below jumps back here
+ * (at most once) if the PCB found by the lookup no longer matches
+ * this segment by the time the socket lock is taken.
+ */
+ isconnected = FALSE;
+ isdisconnected = FALSE;
+
+ if (isipv6) {
+ inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport,
+ &ip6->ip6_dst, th->th_dport, 1,
+ m->m_pkthdr.rcvif);
+ } else {
+ inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
+ ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif);
+ }
+
+ /*
+ * Use the interface scope information from the PCB for outbound
+ * segments. If the PCB isn't present and if scoped routing is
+ * enabled, tcp_respond will use the scope of the interface where
+ * the segment arrived on.
+ */
+ if (inp != NULL && (inp->inp_flags & INP_BOUND_IF)) {
+ ifscope = inp->inp_boundifp->if_index;
+ }
+
+ /*
+ * If the state is CLOSED (i.e., TCB does not exist) then
+ * all data in the incoming segment is discarded.
+ * If the TCB exists but is in CLOSED state, it is embryonic,
+ * but should either do a listen or a connect soon.
+ */
+ if (inp == NULL) {
+ /*
+ * Optionally log the attempt; verbosity is controlled by the
+ * log_in_vain tunable: 1 logs SYNs only, 2 logs every segment
+ * with its flags, 3/4 log "stealth mode" candidates (SYNs that
+ * are not ACKs, not broadcast/multicast, and not self-addressed).
+ */
+ if (log_in_vain) {
+ char dbuf[MAX_IPv6_STR_LEN], sbuf[MAX_IPv6_STR_LEN];
+
+ if (isipv6) {
+ inet_ntop(AF_INET6, &ip6->ip6_dst, dbuf, sizeof(dbuf));
+ inet_ntop(AF_INET6, &ip6->ip6_src, sbuf, sizeof(sbuf));
+ } else {
+ inet_ntop(AF_INET, &ip->ip_dst, dbuf, sizeof(dbuf));
+ inet_ntop(AF_INET, &ip->ip_src, sbuf, sizeof(sbuf));
+ }
+ switch (log_in_vain) {
+ case 1:
+ if (thflags & TH_SYN) {
+ log(LOG_INFO,
+ "Connection attempt to TCP %s:%d from %s:%d\n",
+ dbuf, ntohs(th->th_dport),
+ sbuf,
+ ntohs(th->th_sport));
+ }
+ break;
+ case 2:
+ log(LOG_INFO,
+ "Connection attempt to TCP %s:%d from %s:%d flags:0x%x\n",
+ dbuf, ntohs(th->th_dport), sbuf,
+ ntohs(th->th_sport), thflags);
+ break;
+ case 3:
+ case 4:
+ if ((thflags & TH_SYN) && !(thflags & TH_ACK) &&
+ !(m->m_flags & (M_BCAST | M_MCAST)) &&
+ ((isipv6 && !IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) ||
+ (!isipv6 && ip->ip_dst.s_addr != ip->ip_src.s_addr))) {
+ log_in_vain_log((LOG_INFO,
+ "Stealth Mode connection attempt to TCP %s:%d from %s:%d\n",
+ dbuf, ntohs(th->th_dport),
+ sbuf,
+ ntohs(th->th_sport)));
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ /*
+ * blackhole tunable: drop the segment silently (no RST) for
+ * non-loopback interfaces. 1 blackholes SYNs only; any other
+ * non-zero value blackholes every segment to a closed port.
+ */
+ if (blackhole) {
+ if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type != IFT_LOOP) {
+ switch (blackhole) {
+ case 1:
+ if (thflags & TH_SYN) {
+ TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "blackhole 1 syn for closed port");
+ goto dropnosock;
+ }
+ break;
+ case 2:
+ TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "blackhole 2 closed port");
+ goto dropnosock;
+ default:
+ TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "blackhole closed port");
+ goto dropnosock;
+ }
+ }
+ }
+ /* Not blackholed: answer the closed port with a RST */
+ IF_TCP_STATINC(ifp, noconnnolist);
+ TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "closed port");
+ goto dropwithresetnosock;
+ }
+ so = inp->inp_socket;
+ if (so == NULL) {
+ /* This case shouldn't happen as the socket shouldn't be null
+ * if inp_state isn't set to INPCB_STATE_DEAD
+ * But just in case, we pretend we didn't find the socket if we hit this case
+ * as this isn't cause for a panic (the socket might be leaked however)...
+ */
+#if TEMPDEBUG
+ /*
+ * Log the offending PCB *before* clearing the pointer (the old
+ * code nulled inp first, so the message always printed NULL),
+ * and use %p -- passing a pointer to %x is undefined behavior.
+ */
+ printf("tcp_input: no more socket for inp=%p. This shouldn't happen\n", (void *)inp);
+#endif
+ inp = NULL;
+ TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "inp_socket NULL");
+ goto dropnosock;
+ }
+
+ /* Take the socket lock, then confirm the PCB is still usable */
+ socket_lock(so, 1);
+ if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
+ socket_unlock(so, 1);
+ inp = NULL; // pretend we didn't find it
+ TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "inp state WNT_STOPUSING");
+ goto dropnosock;
+ }
+
+ /*
+ * The PCB was looked up before the socket lock was held; by the
+ * time the lock was acquired it may have been recycled for a
+ * different connection. For connected PCBs, re-check that the full
+ * 4-tuple still matches this segment; on mismatch, redo the lookup
+ * once (tracked by findpcb_iterated), then give up and drop.
+ */
+ if (!isipv6 && inp->inp_faddr.s_addr != INADDR_ANY) {
+ if (inp->inp_faddr.s_addr != ip->ip_src.s_addr ||
+ inp->inp_laddr.s_addr != ip->ip_dst.s_addr ||
+ inp->inp_fport != th->th_sport ||
+ inp->inp_lport != th->th_dport) {
+ os_log_error(OS_LOG_DEFAULT, "%s 5-tuple does not match: %u:%u %u:%u\n",
+ __func__,
+ ntohs(inp->inp_fport), ntohs(th->th_sport),
+ ntohs(inp->inp_lport), ntohs(th->th_dport));
+ if (findpcb_iterated) {
+ goto drop;
+ }
+ findpcb_iterated = true;
+ socket_unlock(so, 1);
+ inp = NULL;
+ goto findpcb;
+ }
+ } else if (isipv6 && !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
+ /* Same revalidation for a connected IPv6 PCB */
+ if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ip6->ip6_src) ||
+ !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ip6->ip6_dst) ||
+ inp->inp_fport != th->th_sport ||
+ inp->inp_lport != th->th_dport) {
+ os_log_error(OS_LOG_DEFAULT, "%s 5-tuple does not match: %u:%u %u:%u\n",
+ __func__,
+ ntohs(inp->inp_fport), ntohs(th->th_sport),
+ ntohs(inp->inp_lport), ntohs(th->th_dport));
+ if (findpcb_iterated) {
+ goto drop;
+ }
+ findpcb_iterated = true;
+ socket_unlock(so, 1);
+ inp = NULL;
+ goto findpcb;
+ }
+ }
+
+ tp = intotcpcb(inp);
+ if (tp == NULL) {
+ /* PCB without a tcpcb: account it and answer with a RST */
+ IF_TCP_STATINC(ifp, noconnlist);
+ TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "tp is NULL");
+ goto dropwithreset;
+ }
+
+ /* Now that we found the tcpcb, we can adjust the TCP timestamp */
+ if (to.to_flags & TOF_TS) {
+ /* Undo our per-connection offset on the echoed timestamp
+ * (presumably applied on output for randomization -- see
+ * t_ts_offset users elsewhere) */
+ to.to_tsecr -= tp->t_ts_offset;
+ }
+
+ TCP_LOG_TH_FLAGS(TCP_LOG_HDR, th, tp, false, ifp);
+
+ if (tp->t_state == TCPS_CLOSED) {
+ TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "tp state TCPS_CLOSED");
+ goto drop;
+ }
+
+#if NECP
+ if (so->so_state & SS_ISCONNECTED) {
+ // Connected TCP sockets have a fully-bound local and remote,
+ // so the policy check doesn't need to override addresses
+ if (!necp_socket_is_allowed_to_send_recv(inp, ifp, pf_tag, NULL, NULL, NULL, NULL)) {
+ TCP_LOG_DROP_NECP(TCP_LOG_HDR, th, intotcpcb(inp), false);
+ IF_TCP_STATINC(ifp, badformat);
+ goto drop;
+ }
+ } else {
+ /*
+ * If the proc_uuid_policy table has been updated since the last use
+ * of the listening socket (i.e., the proc_uuid_policy_table_gencount
+ * has been updated), the flags in the socket may be out of date.
+ * If INP2_WANT_APP_POLICY is stale, inbound packets may
+ * be dropped by NECP if the socket should now match a per-app
+ * exception policy.
+ * In order to avoid this refresh the proc_uuid_policy state to
+ * potentially recalculate the socket's flags before checking
+ * with NECP.
+ */
+ (void) inp_update_policy(inp);
+
+ if (isipv6) {
+ if (!necp_socket_is_allowed_to_send_recv_v6(inp,
+ th->th_dport, th->th_sport, &ip6->ip6_dst,
+ &ip6->ip6_src, ifp, pf_tag, NULL, NULL, NULL, NULL)) {
+ TCP_LOG_DROP_NECP(TCP_LOG_HDR, th, intotcpcb(inp), false);
+ IF_TCP_STATINC(ifp, badformat);
+ goto drop;
+ }
+ } else {
+ if (!necp_socket_is_allowed_to_send_recv_v4(inp,
+ th->th_dport, th->th_sport, &ip->ip_dst, &ip->ip_src,
+ ifp, pf_tag, NULL, NULL, NULL, NULL)) {
+ TCP_LOG_DROP_NECP(TCP_LOG_HDR, th, intotcpcb(inp), false);
+ IF_TCP_STATINC(ifp, badformat);
+ goto drop;
+ }
+ }
+ }
+#endif /* NECP */
+
+ prev_t_state = tp->t_state;
+
+ /* If none of the FIN|SYN|RST|ACK flag is set, drop (RFC 5961) */
+ if ((thflags & TH_ACCEPT) == 0) {
+ TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "rfc5961 TH_ACCEPT == 0");
+ goto drop;
+ }
+
+ /*
+ * Unscale the window into a 32-bit value. The scale factor is
+ * never applied to segments carrying SYN (RFC 7323).
+ */
+ if ((thflags & TH_SYN) == 0) {
+ tiwin = th->th_win << tp->snd_scale;
+ } else {
+ tiwin = th->th_win;
+ }
+
+ /* Avoid processing packets while closing a listen socket */
+ if (tp->t_state == TCPS_LISTEN &&
+ (so->so_options & SO_ACCEPTCONN) == 0) {
+ TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "closing a listening socket");
+ goto drop;
+ }
+
+ if (so->so_options & (SO_DEBUG | SO_ACCEPTCONN)) {
+#if TCPDEBUG
+ if (so->so_options & SO_DEBUG) {
+ ostate = tp->t_state;
+ if (isipv6) {
+ bcopy((char *)ip6, (char *)tcp_saveipgen,
+ sizeof(*ip6));
+ } else {
+ bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip));
+ }
+ tcp_savetcp = *th;
+ }
+#endif
+ if (so->so_options & SO_ACCEPTCONN) {
+ struct tcpcb *tp0 = tp;
+ struct socket *so2;
+ struct socket *oso;
+ struct sockaddr_storage from;
+ struct sockaddr_storage to2;
+ struct inpcb *oinp = sotoinpcb(so);
+ struct ifnet *head_ifscope;
+ unsigned int head_nocell, head_recvanyif,
+ head_noexpensive, head_awdl_unrestricted,
+ head_intcoproc_allowed, head_external_port,
+ head_noconstrained;
+
+ /* Get listener's bound-to-interface, if any */
+ head_ifscope = (inp->inp_flags & INP_BOUND_IF) ?
+ inp->inp_boundifp : NULL;
+ /* Get listener's no-cellular information, if any */
+ head_nocell = INP_NO_CELLULAR(inp);
+ /* Get listener's recv-any-interface, if any */
+ head_recvanyif = (inp->inp_flags & INP_RECV_ANYIF);
+ /* Get listener's no-expensive information, if any */
+ head_noexpensive = INP_NO_EXPENSIVE(inp);
+ head_noconstrained = INP_NO_CONSTRAINED(inp);
+ head_awdl_unrestricted = INP_AWDL_UNRESTRICTED(inp);
+ head_intcoproc_allowed = INP_INTCOPROC_ALLOWED(inp);
+ head_external_port = (inp->inp_flags2 & INP2_EXTERNAL_PORT);
+
+ /*
+ * If the state is LISTEN then ignore segment if it contains an RST.
+ * If the segment contains an ACK then it is bad and send a RST.
+ * If it does not contain a SYN then it is not interesting; drop it.
+ * If it is from this socket, drop it, it must be forged.
+ */
+ if ((thflags & (TH_RST | TH_ACK | TH_SYN)) != TH_SYN) {
+ IF_TCP_STATINC(ifp, listbadsyn);
+
+ if (thflags & TH_RST) {
+ TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SYN with RST");
+ goto drop;
+ }
+ if (thflags & TH_ACK) {
+ TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SYN with ACK");
+ tp = NULL;
+ tcpstat.tcps_badsyn++;
+ goto dropwithreset;
+ }
+
+ /* We come here if there is no SYN set */
+ tcpstat.tcps_badsyn++;
+ TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad SYN");
+ goto drop;
+ }
+ KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_START, 0, 0, 0, 0, 0);
+ if (th->th_dport == th->th_sport) {
+ if (isipv6) {
+ if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
+ &ip6->ip6_src)) {
+ TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad tuple same port");
+ goto drop;
+ }
+ } else if (ip->ip_dst.s_addr == ip->ip_src.s_addr) {
+ TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad tuple same IPv4 address");
+ goto drop;
+ }
+ }
+ /*
+ * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
+ * in_broadcast() should never return true on a received
+ * packet with M_BCAST not set.
+ *
+ * Packets with a multicast source address should also
+ * be discarded.
+ */
+ if (m->m_flags & (M_BCAST | M_MCAST)) {
+ TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "mbuf M_BCAST | M_MCAST");
+ goto drop;
+ }
+ if (isipv6) {
+ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
+ IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
+ TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "IN6_IS_ADDR_MULTICAST");
+ goto drop;
+ }
+ } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
+ IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
+ ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
+ in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) {
+ TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "multicast or broadcast address");
+ goto drop;
+ }
+
+
+ /*
+ * If deprecated address is forbidden,
+ * we do not accept SYN to deprecated interface
+ * address to prevent any new inbound connection from
+ * getting established.
+ * When we do not accept SYN, we send a TCP RST,
+ * with deprecated source address (instead of dropping
+ * it). We compromise it as it is much better for peer
+ * to send a RST, and RST will be the final packet
+ * for the exchange.
+ *
+ * If we do not forbid deprecated addresses, we accept
+ * the SYN packet. RFC 4862 forbids dropping SYN in
+ * this case.
+ */
+ if (isipv6 && !ip6_use_deprecated) {
+ uint32_t ia6_flags;
+
+ if (ip6_getdstifaddr_info(m, NULL,
+ &ia6_flags) == 0) {
+ if (ia6_flags & IN6_IFF_DEPRECATED) {
+ tp = NULL;
+ IF_TCP_STATINC(ifp, deprecate6);
+ TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "deprecated IPv6 address");
+ goto dropwithreset;
+ }
+ }
+ }
+ if (so->so_filt || check_cfil) {
+ if (isipv6) {
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6*)&from;
+
+ sin6->sin6_len = sizeof(*sin6);
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_port = th->th_sport;
+ sin6->sin6_flowinfo = 0;
+ sin6->sin6_addr = ip6->ip6_src;
+ sin6->sin6_scope_id = 0;
+
+ sin6 = (struct sockaddr_in6*)&to2;
+
+ sin6->sin6_len = sizeof(struct sockaddr_in6);
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_port = th->th_dport;
+ sin6->sin6_flowinfo = 0;
+ sin6->sin6_addr = ip6->ip6_dst;
+ sin6->sin6_scope_id = 0;
+ } else {
+ struct sockaddr_in *sin = (struct sockaddr_in*)&from;
+
+ sin->sin_len = sizeof(*sin);
+ sin->sin_family = AF_INET;
+ sin->sin_port = th->th_sport;
+ sin->sin_addr = ip->ip_src;
+
+ sin = (struct sockaddr_in*)&to2;
+
+ sin->sin_len = sizeof(struct sockaddr_in);
+ sin->sin_family = AF_INET;
+ sin->sin_port = th->th_dport;
+ sin->sin_addr = ip->ip_dst;
+ }
+ }
+
+ /*
+ * Create the child socket for the incoming connection. The peer
+ * address is passed only when a socket filter is attached, since
+ * only the filter consumes it at this point.
+ */
+ if (so->so_filt) {
+ so2 = sonewconn(so, 0, (struct sockaddr*)&from);
+ } else {
+ so2 = sonewconn(so, 0, NULL);
+ }
+ if (so2 == NULL) {
+ tcpstat.tcps_listendrop++;
+ /*
+ * Listen queue is full: try to reclaim a droppable embryonic
+ * connection and retry the creation once.
+ */
+ if (tcp_dropdropablreq(so)) {
+ if (so->so_filt) {
+ so2 = sonewconn(so, 0, (struct sockaddr*)&from);
+ } else {
+ so2 = sonewconn(so, 0, NULL);
+ }
+ }
+ if (so2 == NULL) {
+ TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " listen drop");
+ goto drop;
+ }
+ }
+
+ /* Point "inp" and "tp" in tandem to new socket */
+ inp = (struct inpcb *)so2->so_pcb;
+ tp = intotcpcb(inp);
+
+ oso = so;
+ socket_unlock(so, 0); /* Unlock but keep a reference on listener for now */
+
+ so = so2;
+ socket_lock(so, 1);
+ /*
+ * Mark socket as temporary until we're
+ * committed to keeping it. The code at
+ * ``drop'' and ``dropwithreset'' check the
+ * flag dropsocket to see if the temporary
+ * socket created here should be discarded.
+ * We mark the socket as discardable until
+ * we're committed to it below in TCPS_LISTEN.
+ * There are some error conditions in which we
+ * have to drop the temporary socket.
+ */
+ dropsocket++;
+ /*
+ * Inherit INP_BOUND_IF from listener; testing if
+ * head_ifscope is non-NULL is sufficient, since it
+ * can only be set to a non-zero value earlier if
+ * the listener has such a flag set.
+ */
+ if (head_ifscope != NULL) {
+ inp->inp_flags |= INP_BOUND_IF;
+ inp->inp_boundifp = head_ifscope;
+ } else {
+ inp->inp_flags &= ~INP_BOUND_IF;
+ }
+ /*
+ * Inherit restrictions from listener.
+ */
+ if (head_nocell) {
+ inp_set_nocellular(inp);
+ }
+ if (head_noexpensive) {
+ inp_set_noexpensive(inp);
+ }
+ if (head_noconstrained) {
+ inp_set_noconstrained(inp);
+ }
+ if (head_awdl_unrestricted) {
+ inp_set_awdl_unrestricted(inp);
+ }
+ if (head_intcoproc_allowed) {
+ inp_set_intcoproc_allowed(inp);
+ }
+ /*
+ * Inherit {IN,IN6}_RECV_ANYIF from listener.
+ */
+ if (head_recvanyif) {
+ inp->inp_flags |= INP_RECV_ANYIF;
+ } else {
+ inp->inp_flags &= ~INP_RECV_ANYIF;
+ }
+
+ if (head_external_port) {
+ inp->inp_flags2 |= INP2_EXTERNAL_PORT;
+ }
+ if (isipv6) {
+ inp->in6p_laddr = ip6->ip6_dst;
+ } else {
+ inp->inp_vflag &= ~INP_IPV6;
+ inp->inp_vflag |= INP_IPV4;
+ inp->inp_laddr = ip->ip_dst;
+ }
+ inp->inp_lport = th->th_dport;
+ if (in_pcbinshash(inp, 0) != 0) {
+ /*
+ * Undo the assignments above if we failed to
+ * put the PCB on the hash lists.
+ */
+ if (isipv6) {
+ inp->in6p_laddr = in6addr_any;
+ } else {
+ inp->inp_laddr.s_addr = INADDR_ANY;
+ }
+ inp->inp_lport = 0;
+ socket_lock(oso, 0); /* release ref on parent */
+ socket_unlock(oso, 1);
+ TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " in_pcbinshash failed");
+ goto drop;
+ }
+ socket_lock(oso, 0);
+ if (isipv6) {
+ /*
+ * Inherit socket options from the listening
+ * socket.
+ * Note that in6p_inputopts are not (even
+ * should not be) copied, since it stores
+ * previously received options and is used to
+ * detect if each new option is different than
+ * the previous one and hence should be passed
+ * to a user.
+ * If we copied in6p_inputopts, a user would
+ * not be able to receive options just after
+ * calling the accept system call.
+ */
+ inp->inp_flags |=
+ oinp->inp_flags & INP_CONTROLOPTS;
+ if (oinp->in6p_outputopts) {
+ inp->in6p_outputopts =
+ ip6_copypktopts(oinp->in6p_outputopts,
+ M_NOWAIT);
+ }
+ } else {
+ inp->inp_options = ip_srcroute();
+ inp->inp_ip_tos = oinp->inp_ip_tos;
+ }
+#if IPSEC
+ /* copy old policy into new socket's */
+ if (sotoinpcb(oso)->inp_sp) {
+ int error = 0;
+ /* Is it a security hole here to silently fail to copy the policy? */
+ if (inp->inp_sp != NULL) {
+ error = ipsec_init_policy(so, &inp->inp_sp);
+ }
+ if (error != 0 || ipsec_copy_policy(sotoinpcb(oso)->inp_sp, inp->inp_sp)) {
+ printf("tcp_input: could not copy policy\n");
+ }
+ }
+#endif
+ /* inherit states from the listener */
+ DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
+ struct tcpcb *, tp, int32_t, TCPS_LISTEN);
+ tp->t_state = TCPS_LISTEN;
+ tp->t_flags |= tp0->t_flags & (TF_NOPUSH | TF_NOOPT | TF_NODELAY);
+ tp->t_flagsext |= (tp0->t_flagsext & (TF_RXTFINDROP | TF_NOTIMEWAIT | TF_FASTOPEN));
+ tp->t_keepinit = tp0->t_keepinit;
+ tp->t_keepcnt = tp0->t_keepcnt;
+ tp->t_keepintvl = tp0->t_keepintvl;
+ tp->t_adaptive_wtimo = tp0->t_adaptive_wtimo;
+ tp->t_adaptive_rtimo = tp0->t_adaptive_rtimo;
+ tp->t_inpcb->inp_ip_ttl = tp0->t_inpcb->inp_ip_ttl;
+ if ((so->so_flags & SOF_NOTSENT_LOWAT) != 0) {
+ tp->t_notsent_lowat = tp0->t_notsent_lowat;
+ }
+ tp->t_inpcb->inp_flags2 |=
+ tp0->t_inpcb->inp_flags2 & INP2_KEEPALIVE_OFFLOAD;
+
+ /* now drop the reference on the listener */
+ socket_unlock(oso, 1);
+
+ tcp_set_max_rwinscale(tp, so);
+
+#if CONTENT_FILTER
+ if (check_cfil) {
+ int error = cfil_sock_attach(so2, (struct sockaddr*)&to2, (struct sockaddr*)&from,
+ CFS_CONNECTION_DIR_IN);
+ if (error != 0) {
+ TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " cfil_sock_attach failed");
+ goto drop;
+ }
+ }
+#endif /* CONTENT_FILTER */
+
+ KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_END, 0, 0, 0, 0, 0);
+ }
+ }
+ socket_lock_assert_owned(so);
+
+ if (net_mpklog_enabled && (m->m_pkthdr.rcvif->if_xflags & IFXF_MPK_LOG)) {
+ MPKL_TCP_INPUT(tcp_mpkl_log_object,
+ ntohs(tp->t_inpcb->inp_lport), ntohs(tp->t_inpcb->inp_fport),
+ th->th_seq, th->th_ack, tlen, thflags,
+ so->last_pid, so->so_log_seqn++);
+ }
+
+ if (tp->t_state == TCPS_ESTABLISHED && tlen > 0) {
+ /*
+ * Evaluate the rate of arrival of packets to see if the
+ * receiver can reduce the ack traffic. The algorithm to
+ * stretch acks will be enabled if the connection meets
+ * certain criteria defined in tcp_stretch_ack_enable function.
+ */
+ if ((tp->t_flagsext & TF_RCVUNACK_WAITSS) != 0) {
+ /* Still waiting out slow-start: count arriving segments */
+ TCP_INC_VAR(tp->rcv_waitforss, segment_count);
+ }
+ if (tcp_stretch_ack_enable(tp, thflags)) {
+ tp->t_flags |= TF_STRETCHACK;
+ tp->t_flagsext &= ~(TF_RCVUNACK_WAITSS);
+ tp->rcv_waitforss = 0;
+ } else {
+ tp->t_flags &= ~(TF_STRETCHACK);
+ }
+ /*
+ * Byte accounting over a rolling window of tcp_rcvunackwin
+ * ticks: while still in the first half of the current window,
+ * accumulate into both counters; otherwise start a new window
+ * seeded with the previous half-window's count. (NOTE(review):
+ * consumers of rcv_by_unackwin are outside this view -- confirm.)
+ */
+ if (TSTMP_GT(tp->rcv_unackwin - (tcp_rcvunackwin >> 1), tcp_now)) {
+ tp->rcv_by_unackhalfwin += (tlen + off);
+ tp->rcv_by_unackwin += (tlen + off);
+ } else {
+ tp->rcv_unackwin = tcp_now + tcp_rcvunackwin;
+ tp->rcv_by_unackwin = tp->rcv_by_unackhalfwin + tlen + off;
+ tp->rcv_by_unackhalfwin = tlen + off;
+ }
+ }
+
+ /*
+ * Clear TE_SENDECE if TH_CWR is set. This is harmless, so we don't
+ * bother doing extensive checks for state and whatnot.
+ */
+ if (thflags & TH_CWR) {
+ tp->ecn_flags &= ~TE_SENDECE;
+ tp->t_ecn_recv_cwr++;
+ }
+
+ /*
+ * Explicit Congestion Notification - Flag that we need to send ECT if
+ * + The IP Congestion experienced flag was set.
+ * + Socket is in established state
+ * + We negotiated ECN in the TCP setup
+ * + This isn't a pure ack (tlen > 0)
+ * + The data is in the valid window
+ *
+ * TE_SENDECE will be cleared when we receive a packet with TH_CWR set.
+ */
+ if (ip_ecn == IPTOS_ECN_CE && tp->t_state == TCPS_ESTABLISHED &&
+ TCP_ECN_ENABLED(tp) && tlen > 0 &&
+ SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
+ SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
+ tp->t_ecn_recv_ce++;
+ tcpstat.tcps_ecn_recv_ce++;
+ INP_INC_IFNET_STAT(inp, ecn_recv_ce);
+ /* Mark this connection as it received CE from network */
+ tp->ecn_flags |= TE_RECV_ECN_CE;
+ tp->ecn_flags |= TE_SENDECE;
+ }
+
+ /*
+ * If we received an explicit notification of congestion in
+ * ip tos ecn bits or by the CWR bit in TCP header flags, reset
+ * the ack-stretching state. We need to handle ECN notification if
+ * an ECN setup SYN was sent even once.
+ */
+ if (tp->t_state == TCPS_ESTABLISHED &&
+ (tp->ecn_flags & TE_SETUPSENT) &&
+ (ip_ecn == IPTOS_ECN_CE || (thflags & TH_CWR))) {
+ tcp_reset_stretch_ack(tp);
+ tp->t_forced_acks = TCP_FORCED_ACKS_COUNT;
+ CLEAR_IAJ_STATE(tp);
+ }
+
+ if (ip_ecn == IPTOS_ECN_CE && tp->t_state == TCPS_ESTABLISHED &&
+ !TCP_ECN_ENABLED(tp) && !(tp->ecn_flags & TE_CEHEURI_SET)) {
+ tcpstat.tcps_ecn_fallback_ce++;
+ tcp_heuristic_ecn_aggressive(tp);
+ tp->ecn_flags |= TE_CEHEURI_SET;
+ }
+
+ if (tp->t_state == TCPS_ESTABLISHED && TCP_ECN_ENABLED(tp) &&
+ ip_ecn == IPTOS_ECN_CE && !(tp->ecn_flags & TE_CEHEURI_SET)) {
+ if (inp->inp_stat->rxpackets < ECN_MIN_CE_PROBES) {
+ tp->t_ecn_recv_ce_pkt++;
+ } else if (tp->t_ecn_recv_ce_pkt > ECN_MAX_CE_RATIO) {
+ tcpstat.tcps_ecn_fallback_ce++;
+ tcp_heuristic_ecn_aggressive(tp);
+ tp->ecn_flags |= TE_CEHEURI_SET;
+ INP_INC_IFNET_STAT(inp, ecn_fallback_ce);
+ } else {
+ /* We tracked the first ECN_MIN_CE_PROBES segments, we
+ * now know that the path is good.
+ */
+ tp->ecn_flags |= TE_CEHEURI_SET;
+ }
+ }
+
+ /* Update rcvtime as a new segment was received on the connection */
+ tp->t_rcvtime = tcp_now;
+
+ /*
+ * Segment received on connection.
+ * Reset idle time and keep-alive timer.
+ */
+ if (TCPS_HAVEESTABLISHED(tp->t_state)) {
+ tcp_keepalive_reset(tp);
+
+ if (tp->t_mpsub) {
+ mptcp_reset_keepalive(tp);
+ }
+ }
+
+ /*
+ * Process options if not in LISTEN state,
+ * else do it below (after getting remote address).
+ */
+ if (tp->t_state != TCPS_LISTEN && optp) {
+ tcp_dooptions(tp, optp, optlen, th, &to);
+ }
+#if MPTCP
+ if (tp->t_state != TCPS_LISTEN && (so->so_flags & SOF_MP_SUBFLOW) &&
+ mptcp_input_preproc(tp, m, th, drop_hdrlen) != 0) {
+ tp->t_flags |= TF_ACKNOW;
+ (void) tcp_output(tp);
+ tcp_check_timer_state(tp);
+ socket_unlock(so, 1);
+ return;
+ }
+#endif /* MPTCP */
+ if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
+ if (!(thflags & TH_ACK) ||
+ (SEQ_GT(th->th_ack, tp->iss) &&
+ SEQ_LEQ(th->th_ack, tp->snd_max))) {
+ tcp_finalize_options(tp, &to, ifscope);
+ }
+ }
+
+#if TRAFFIC_MGT
+ /*
+ * Compute inter-packet arrival jitter. According to RFC 3550,
+ * inter-packet arrival jitter is defined as the difference in
+ * packet spacing at the receiver compared to the sender for a
+ * pair of packets. When two packets of maximum segment size come
+ * one after the other with consecutive sequence numbers, we
+ * consider them as packets sent together at the sender and use
+ * them as a pair to compute inter-packet arrival jitter. This
+ * metric indicates the delay induced by the network components due
+ * to queuing in edge/access routers.
+ */
+ if (tp->t_state == TCPS_ESTABLISHED &&
+ (thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK | TH_ECE | TH_PUSH)) == TH_ACK &&
+ ((tp->t_flags & TF_NEEDFIN) == 0) &&
+ ((to.to_flags & TOF_TS) == 0 ||
+ TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
+ th->th_seq == tp->rcv_nxt && LIST_EMPTY(&tp->t_segq)) {
+ int seg_size = tlen;
+ if (tp->iaj_pktcnt <= IAJ_IGNORE_PKTCNT) {
+ TCP_INC_VAR(tp->iaj_pktcnt, segment_count);
+ }
+
+ if (tp->iaj_size == 0 || seg_size > tp->iaj_size ||
+ (seg_size == tp->iaj_size && tp->iaj_rcv_ts == 0)) {
+ /*
+ * State related to inter-arrival jitter is
+ * uninitialized or we are trying to find a good
+ * first packet to start computing the metric
+ */
+ update_iaj_state(tp, seg_size, 0);
+ } else {
+ if (seg_size == tp->iaj_size) {
+ /*
+ * Compute inter-arrival jitter taking
+ * this packet as the second packet
+ */
+ compute_iaj(tp);
+ }
+ if (seg_size < tp->iaj_size) {
+ /*
+ * There is a smaller packet in the stream.
+ * Some times the maximum size supported
+ * on a path can change if there is a new
+ * link with smaller MTU. The receiver will
+ * not know about this change. If there
+ * are too many packets smaller than
+ * iaj_size, we try to learn the iaj_size
+ * again.
+ */
+ TCP_INC_VAR(tp->iaj_small_pkt, segment_count);
+ if (tp->iaj_small_pkt > RESET_IAJ_SIZE_THRESH) {
+ update_iaj_state(tp, seg_size, 1);
+ } else {
+ CLEAR_IAJ_STATE(tp);
+ }
+ } else {
+ update_iaj_state(tp, seg_size, 0);
+ }
+ }
+ } else {
+ CLEAR_IAJ_STATE(tp);
+ }
+#endif /* TRAFFIC_MGT */
+
+ /*
+ * Header prediction: check for the two common cases
+ * of a uni-directional data xfer. If the packet has
+ * no control flags, is in-sequence, the window didn't
+ * change and we're not retransmitting, it's a
+ * candidate. If the length is zero and the ack moved
+ * forward, we're the sender side of the xfer. Just
+ * free the data acked & wake any higher level process
+ * that was blocked waiting for space. If the length
+ * is non-zero and the ack didn't move, we're the
+ * receiver side. If we're getting packets in-order
+ * (the reassembly queue is empty), add the data to
+ * the socket buffer and note that we need a delayed ack.
+ * Make sure that the hidden state-flags are also off.
+ * Since we check for TCPS_ESTABLISHED above, it can only
+ * be TH_NEEDSYN.
+ */
+ if (tp->t_state == TCPS_ESTABLISHED &&
+ !(so->so_state & SS_CANTRCVMORE) &&
+ (thflags & TH_FLAGS) == TH_ACK &&
+ ((tp->t_flags & TF_NEEDFIN) == 0) &&
+ ((to.to_flags & TOF_TS) == 0 ||
+ TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
+ th->th_seq == tp->rcv_nxt &&
+ tiwin && tiwin == tp->snd_wnd &&
+ tp->snd_nxt == tp->snd_max) {
+ /*
+ * If last ACK falls within this segment's sequence numbers,
+ * record the timestamp.
+ * NOTE that the test is modified according to the latest
+ * proposal of the tcplw@cray.com list (Braden 1993/04/26).
+ */
+ if ((to.to_flags & TOF_TS) != 0 &&
+ SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
+ tp->ts_recent_age = tcp_now;
+ tp->ts_recent = to.to_tsval;
+ }
+
+ if (tlen == 0) {
+ if (SEQ_GT(th->th_ack, tp->snd_una) &&
+ SEQ_LEQ(th->th_ack, tp->snd_max) &&
+ tp->snd_cwnd >= tp->snd_ssthresh &&
+ (!IN_FASTRECOVERY(tp) &&
+ ((!(SACK_ENABLED(tp)) &&
+ tp->t_dupacks < tp->t_rexmtthresh) ||
+ (SACK_ENABLED(tp) && to.to_nsacks == 0 &&
+ TAILQ_EMPTY(&tp->snd_holes))))) {
+ /*
+ * this is a pure ack for outstanding data.
+ */
+ ++tcpstat.tcps_predack;
+
+ tcp_bad_rexmt_check(tp, th, &to);
+
+ /* Recalculate the RTT */
+ tcp_compute_rtt(tp, &to, th);
+
+ VERIFY(SEQ_GEQ(th->th_ack, tp->snd_una));
+ acked = BYTES_ACKED(th, tp);
+ tcpstat.tcps_rcvackpack++;
+ tcpstat.tcps_rcvackbyte += acked;
+
+ /*
+ * Handle an ack that is in sequence during
+ * congestion avoidance phase. The
+ * calculations in this function
+ * assume that snd_una is not updated yet.
+ */
+ if (CC_ALGO(tp)->congestion_avd != NULL) {
+ CC_ALGO(tp)->congestion_avd(tp, th);
+ }
+ tcp_ccdbg_trace(tp, th, TCP_CC_INSEQ_ACK_RCVD);
+ sbdrop(&so->so_snd, acked);
+ tcp_sbsnd_trim(&so->so_snd);
+
+ if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
+ SEQ_LEQ(th->th_ack, tp->snd_recover)) {
+ tp->snd_recover = th->th_ack - 1;
+ }
+
+ tcp_update_snd_una(tp, th->th_ack);
+
+ TCP_RESET_REXMT_STATE(tp);
+
+ /*
+ * pull snd_wl2 up to prevent seq wrap relative
+ * to th_ack.
+ */
+ tp->snd_wl2 = th->th_ack;
+
+ if (tp->t_dupacks > 0) {
+ tp->t_dupacks = 0;
+ tp->t_rexmtthresh = tcprexmtthresh;
+ tp->t_new_dupacks = 0;
+ }
+
+ tp->sackhint.sack_bytes_acked = 0;
+
+ /*
+ * If all outstanding data are acked, stop
+ * retransmit timer, otherwise restart timer
+ * using current (possibly backed-off) value.
+ * If process is waiting for space,
+ * wakeup/selwakeup/signal. If data
+ * are ready to send, let tcp_output
+ * decide between more output or persist.
+ */
+ if (tp->snd_una == tp->snd_max) {
+ tp->t_timer[TCPT_REXMT] = 0;
+ tp->t_timer[TCPT_PTO] = 0;
+ } else if (tp->t_timer[TCPT_PERSIST] == 0) {
+ tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);
+ }
+ if (!SLIST_EMPTY(&tp->t_rxt_segments) &&
+ !TCP_DSACK_SEQ_IN_WINDOW(tp,
+ tp->t_dsack_lastuna, tp->snd_una)) {
+ tcp_rxtseg_clean(tp);
+ }
+
+ if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 &&
+ tp->t_bwmeas != NULL) {
+ tcp_bwmeas_check(tp);
+ }
+
+ write_wakeup = 1;
+ if (!SLIST_EMPTY(&tp->t_notify_ack)) {
+ tcp_notify_acknowledgement(tp, so);
+ }
+
+ if ((so->so_snd.sb_cc) || (tp->t_flags & TF_ACKNOW)) {
+ (void) tcp_output(tp);
+ }
+
+ tcp_tfo_rcv_ack(tp, th);
+
+ m_freem(m);
+
+ tcp_check_timer_state(tp);
+
+ tcp_handle_wakeup(so, read_wakeup, write_wakeup);
+
+ socket_unlock(so, 1);
+ KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
+ return;
+ }
+ } else if (th->th_ack == tp->snd_una && LIST_EMPTY(&tp->t_segq) &&
+ tlen <= tcp_sbspace(tp)) {
+ /*
+ * this is a pure, in-sequence data packet
+ * with nothing on the reassembly queue and
+ * we have enough buffer space to take it.
+ */
+
+ /* Clean receiver SACK report if present */
+ if (SACK_ENABLED(tp) && tp->rcv_numsacks) {
+ tcp_clean_sackreport(tp);
+ }
+ ++tcpstat.tcps_preddat;
+ tp->rcv_nxt += tlen;
+ /*
+ * Pull snd_wl1 up to prevent seq wrap relative to
+ * th_seq.
+ */
+ tp->snd_wl1 = th->th_seq;
+ /*
+ * Pull rcv_up up to prevent seq wrap relative to
+ * rcv_nxt.
+ */
+ tp->rcv_up = tp->rcv_nxt;
+ TCP_INC_VAR(tcpstat.tcps_rcvpack, segment_count);
+ tcpstat.tcps_rcvbyte += tlen;
+ if (nstat_collect) {
+ INP_ADD_STAT(inp, cell, wifi, wired,
+ rxpackets, 1);
+ INP_ADD_STAT(inp, cell, wifi, wired, rxbytes,
+ tlen);
+ inp_set_activity_bitmap(inp);
+ }
+
+ /*
+ * Calculate the RTT on the receiver only if the
+ * connection is in streaming mode and the last
+ * packet was not an end-of-write
+ */
+ if (tp->t_flags & TF_STREAMING_ON) {
+ tcp_compute_rtt(tp, &to, th);
+ }
+
+ tcp_sbrcv_grow(tp, &so->so_rcv, &to, tlen);
+
+ /*
+ * Add data to socket buffer.
+ */
+ so_recv_data_stat(so, m, 0);
+ m_adj(m, drop_hdrlen); /* delayed header drop */
+
+ if (isipv6) {
+ memcpy(&saved_hdr, ip6, sizeof(struct ip6_hdr));
+ ip6 = (struct ip6_hdr *)&saved_hdr[0];
+ } else {
+ memcpy(&saved_hdr, ip, ip->ip_hl << 2);
+ ip = (struct ip *)&saved_hdr[0];
+ }
+ memcpy(&saved_tcphdr, th, sizeof(struct tcphdr));
+
+ if (th->th_flags & TH_PUSH) {
+ tp->t_flagsext |= TF_LAST_IS_PSH;
+ } else {
+ tp->t_flagsext &= ~TF_LAST_IS_PSH;
+ }
+
+ if (sbappendstream_rcvdemux(so, m)) {
+ mptcp_handle_input(so);
+ read_wakeup = 1;
+ }
+ th = &saved_tcphdr;
+
+ if (isipv6) {
+ KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
+ (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
+ th->th_seq, th->th_ack, th->th_win);
+ } else {
+ KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
+ (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
+ th->th_seq, th->th_ack, th->th_win);
+ }
+ TCP_INC_VAR(tp->t_unacksegs, segment_count);
+ if (DELAY_ACK(tp, th)) {
+ if ((tp->t_flags & TF_DELACK) == 0) {
+ tp->t_flags |= TF_DELACK;
+ tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack);
+ }
+ } else {
+ tp->t_flags |= TF_ACKNOW;
+ tcp_output(tp);
+ }
+
+ tcp_adaptive_rwtimo_check(tp, tlen);
+
+ if (tlen > 0) {
+ tcp_tfo_rcv_data(tp);
+ }
+
+ tcp_check_timer_state(tp);
+
+ tcp_handle_wakeup(so, read_wakeup, write_wakeup);
+
+ socket_unlock(so, 1);
+ KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
+ return;
+ }
+ }
+
+ /*
+ * Calculate amount of space in receive window,
+ * and then do TCP input processing.
+ * Receive window is amount of space in rcv queue,
+ * but not less than advertised window.
+ */
+ socket_lock_assert_owned(so);
+ win = tcp_sbspace(tp);
+ if (win < 0) {
+ win = 0;
+ } else { /* clip rcv window to 4K for modems */
+ if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0) {
+ win = min(win, slowlink_wsize);
+ }
+ }
+ tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
+#if MPTCP
+ /*
+ * Ensure that the subflow receive window isn't greater
+ * than the connection level receive window.
+ */
+ if ((tp->t_mpflags & TMPF_MPTCP_TRUE) && (mp_tp = tptomptp(tp))) {
+ socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
+ int64_t recwin_conn = (int64_t)(mp_tp->mpt_rcvadv - mp_tp->mpt_rcvnxt);
+
+ VERIFY(recwin_conn < INT32_MAX && recwin_conn > INT32_MIN);
+ if (recwin_conn > 0 && tp->rcv_wnd > (uint32_t)recwin_conn) {
+ tp->rcv_wnd = (uint32_t)recwin_conn;
+ tcpstat.tcps_mp_reducedwin++;
+ }
+ }
+#endif /* MPTCP */
+
+ switch (tp->t_state) {
+ /*
+ * Initialize tp->rcv_nxt, and tp->irs, select an initial
+ * tp->iss, and send a segment:
+ * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
+ * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
+ * Fill in remote peer address fields if not previously specified.
+ * Enter SYN_RECEIVED state, and process any other fields of this
+ * segment in this state.
+ */
+ case TCPS_LISTEN: {
+ struct sockaddr_in *sin;
+ struct sockaddr_in6 *sin6;
+
+ socket_lock_assert_owned(so);
+
+ /* Clear the logging flags inherited from the listening socket */
+ tp->t_log_flags = 0;
+ tp->t_flagsext &= ~TF_LOGGED_CONN_SUMMARY;
+
+ if (isipv6) {
+ MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6,
+ M_SONAME, M_NOWAIT);
+ if (sin6 == NULL) {
+ TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "LISTEN malloc M_SONAME failed");
+ goto drop;
+ }
+ bzero(sin6, sizeof(*sin6));
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_len = sizeof(*sin6);
+ sin6->sin6_addr = ip6->ip6_src;
+ sin6->sin6_port = th->th_sport;
+ laddr6 = inp->in6p_laddr;
+ if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
+ inp->in6p_laddr = ip6->ip6_dst;
+ }
+ if (in6_pcbconnect(inp, (struct sockaddr *)sin6,
+ kernel_proc)) {
+ inp->in6p_laddr = laddr6;
+ FREE(sin6, M_SONAME);
+ TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " LISTEN in6_pcbconnect failed");
+ goto drop;
+ }
+ FREE(sin6, M_SONAME);
+ } else {
+ socket_lock_assert_owned(so);
+ MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME,
+ M_NOWAIT);
+ if (sin == NULL) {
+ TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "LISTEN malloc M_SONAME failed");
+ goto drop;
+ }
+ sin->sin_family = AF_INET;
+ sin->sin_len = sizeof(*sin);
+ sin->sin_addr = ip->ip_src;
+ sin->sin_port = th->th_sport;
+ bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero));
+ laddr = inp->inp_laddr;
+ if (inp->inp_laddr.s_addr == INADDR_ANY) {
+ inp->inp_laddr = ip->ip_dst;
+ }
+ if (in_pcbconnect(inp, (struct sockaddr *)sin, kernel_proc,
+ IFSCOPE_NONE, NULL)) {
+ inp->inp_laddr = laddr;
+ FREE(sin, M_SONAME);
+ TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " LISTEN in_pcbconnect failed");
+ goto drop;
+ }
+ FREE(sin, M_SONAME);
+ }
+
+ tcp_dooptions(tp, optp, optlen, th, &to);
+ tcp_finalize_options(tp, &to, ifscope);
+
+ if (tfo_enabled(tp) && tcp_tfo_syn(tp, &to)) {
+ isconnected = TRUE;
+ }
+
+ if (iss) {
+ tp->iss = iss;
+ } else {
+ tp->iss = tcp_new_isn(tp);
+ }
+ tp->irs = th->th_seq;
+ tcp_sendseqinit(tp);
+ tcp_rcvseqinit(tp);
+ tp->snd_recover = tp->snd_una;
+ /*
+ * Initialization of the tcpcb for transaction;
+ * set SND.WND = SEG.WND,
+ * initialize CCsend and CCrecv.
+ */
+ tp->snd_wnd = tiwin; /* initial send-window */
+ tp->max_sndwnd = tp->snd_wnd;
+ tp->t_flags |= TF_ACKNOW;
+ tp->t_unacksegs = 0;
+ DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
+ struct tcpcb *, tp, int32_t, TCPS_SYN_RECEIVED);
+ tp->t_state = TCPS_SYN_RECEIVED;
+ tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
+ TCP_CONN_KEEPINIT(tp));
+ tp->t_connect_time = tcp_now;
+ dropsocket = 0; /* committed to socket */
+
+ if (inp->inp_flowhash == 0) {
+ inp->inp_flowhash = inp_calc_flowhash(inp);
+ }
+ /* update flowinfo - RFC 6437 */
+ if (inp->inp_flow == 0 &&
+ inp->in6p_flags & IN6P_AUTOFLOWLABEL) {
+ inp->inp_flow &= ~IPV6_FLOWLABEL_MASK;
+ inp->inp_flow |=
+ (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
+ }
+
+ /* reset the incomp processing flag */
+ so->so_flags &= ~(SOF_INCOMP_INPROGRESS);
+ tcpstat.tcps_accepts++;
+ if ((thflags & (TH_ECE | TH_CWR)) == (TH_ECE | TH_CWR)) {
+ /* ECN-setup SYN */
+ tp->ecn_flags |= (TE_SETUPRECEIVED | TE_SENDIPECT);
+ }
+
+ /*
+ * The address and connection state are finalized
+ */
+ TCP_LOG_CONNECT(tp, false, 0);
+
+ tcp_add_fsw_flow(tp, ifp);
+
+ goto trimthenstep6;
+ }
+
+ /*
+ * If the state is SYN_RECEIVED and the seg contains an ACK,
+ * but not for our SYN/ACK, send a RST.
+ */
+ case TCPS_SYN_RECEIVED:
+ if ((thflags & TH_ACK) &&
+ (SEQ_LEQ(th->th_ack, tp->snd_una) ||
+ SEQ_GT(th->th_ack, tp->snd_max))) {
+ IF_TCP_STATINC(ifp, ooopacket);
+ TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SYN_RECEIVED bad ACK");
+ goto dropwithreset;
+ }
+
+ /*
+ * In SYN_RECEIVED state, if we recv some SYNS with
+ * window scale and others without, window scaling should
+ * be disabled. Otherwise the window advertised will be
+ * lower if we assume scaling and the other end does not.
+ */
+ if ((thflags & TH_SYN) &&
+ (tp->irs == th->th_seq) &&
+ !(to.to_flags & TOF_SCALE)) {
+ tp->t_flags &= ~TF_RCVD_SCALE;
+ }
+ break;
+
+ /*
+ * If the state is SYN_SENT:
+ * if seg contains an ACK, but not for our SYN, drop the input.
+ * if seg contains a RST, then drop the connection.
+ * if seg does not contain SYN, then drop it.
+ * Otherwise this is an acceptable SYN segment
+ * initialize tp->rcv_nxt and tp->irs
+ * if seg contains ack then advance tp->snd_una
+ * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
+ * arrange for segment to be acked (eventually)
+ * continue processing rest of data/controls, beginning with URG
+ */
+ case TCPS_SYN_SENT:
+ if ((thflags & TH_ACK) &&
+ (SEQ_LEQ(th->th_ack, tp->iss) ||
+ SEQ_GT(th->th_ack, tp->snd_max))) {
+ IF_TCP_STATINC(ifp, ooopacket);
+ TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SYN_SENT bad ACK");
+ goto dropwithreset;
+ }
+ if (thflags & TH_RST) {
+ if ((thflags & TH_ACK) != 0) {
+ if (tfo_enabled(tp) &&
+ !(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE)) {
+ tcp_heuristic_tfo_rst(tp);
+ }
+ if ((tp->ecn_flags & (TE_SETUPSENT | TE_RCVD_SYN_RST)) == TE_SETUPSENT) {
+ /*
+ * On local connections, send
+ * non-ECN syn one time before
+ * dropping the connection
+ */
+ if (tp->t_flags & TF_LOCAL) {
+ tp->ecn_flags |= TE_RCVD_SYN_RST;
+ goto drop;
+ } else {
+ tcp_heuristic_ecn_synrst(tp);
+ }
+ }
+ soevent(so,
+ (SO_FILT_HINT_LOCKED |
+ SO_FILT_HINT_CONNRESET));
+ tp = tcp_drop(tp, ECONNREFUSED);
+ }
+ TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SYN_SENT got RST");
+ goto drop;
+ }
+ if ((thflags & TH_SYN) == 0) {
+ TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SYN_SENT no SYN");
+ goto drop;
+ }
+ tp->snd_wnd = th->th_win; /* initial send window */
+ tp->max_sndwnd = tp->snd_wnd;
+
+ tp->irs = th->th_seq;
+ tcp_rcvseqinit(tp);
+ if (thflags & TH_ACK) {
+ tcpstat.tcps_connects++;
+
+ if ((thflags & (TH_ECE | TH_CWR)) == (TH_ECE)) {
+ /* ECN-setup SYN-ACK */
+ tp->ecn_flags |= TE_SETUPRECEIVED;
+ if (TCP_ECN_ENABLED(tp)) {
+ tcp_heuristic_ecn_success(tp);
+ tcpstat.tcps_ecn_client_success++;
+ }
+ } else {
+ if (tp->ecn_flags & TE_SETUPSENT &&
+ tp->t_rxtshift == 0) {
+ tcp_heuristic_ecn_success(tp);
+ tcpstat.tcps_ecn_not_supported++;
+ }
+ if (tp->ecn_flags & TE_SETUPSENT &&
+ tp->t_rxtshift > 0) {
+ tcp_heuristic_ecn_loss(tp);
+ }
+
+ /* non-ECN-setup SYN-ACK */
+ tp->ecn_flags &= ~TE_SENDIPECT;
+ }
+
+ /* Do window scaling on this connection? */
+ if (TCP_WINDOW_SCALE_ENABLED(tp)) {
+ tp->snd_scale = tp->requested_s_scale;
+ tp->rcv_scale = tp->request_r_scale;
+ }
+
+ tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN << tp->rcv_scale);
+ tp->snd_una++; /* SYN is acked */
+ if (SEQ_LT(tp->snd_nxt, tp->snd_una)) {
+ tp->snd_nxt = tp->snd_una;
+ }
+
+ /*
+ * We have sent more in the SYN than what is being
+ * acked. (e.g., TFO)
+ * We should restart the sending from what the receiver
+ * has acknowledged immediately.
+ */
+ if (SEQ_GT(tp->snd_nxt, th->th_ack)) {
+ /*
+ * rdar://problem/33214601
+ * There is a middlebox that acks all but one
+ * byte and still drops the data.
+ */
+ if (!(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE) &&
+ (tp->t_tfo_stats & TFO_S_SYN_DATA_SENT) &&
+ tp->snd_max == th->th_ack + 1 &&
+ tp->snd_max > tp->snd_una + 1) {
+ tcp_heuristic_tfo_middlebox(tp);
+
+ so->so_error = ENODATA;
+ soevent(so,
+ (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MP_SUB_ERROR));
+
+ tp->t_tfo_stats |= TFO_S_ONE_BYTE_PROXY;
+ }
+
+ tp->snd_max = tp->snd_nxt = th->th_ack;
+ }
+
+ /*
+ * If there's data, delay ACK; if there's also a FIN
+ * ACKNOW will be turned on later.
+ */
+ TCP_INC_VAR(tp->t_unacksegs, segment_count);
+ if (DELAY_ACK(tp, th) && tlen != 0) {
+ if ((tp->t_flags & TF_DELACK) == 0) {
+ tp->t_flags |= TF_DELACK;
+ tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack);
+ }
+ } else {
+ tp->t_flags |= TF_ACKNOW;
+ }
+ /*
+ * Received <SYN,ACK> in SYN_SENT[*] state.
+ * Transitions:
+ * SYN_SENT --> ESTABLISHED
+ * SYN_SENT* --> FIN_WAIT_1
+ */
+ tp->t_starttime = tcp_now;
+ tcp_sbrcv_tstmp_check(tp);
+ if (tp->t_flags & TF_NEEDFIN) {
+ DTRACE_TCP4(state__change, void, NULL,
+ struct inpcb *, inp,
+ struct tcpcb *, tp, int32_t,
+ TCPS_FIN_WAIT_1);
+ tp->t_state = TCPS_FIN_WAIT_1;
+ tp->t_flags &= ~TF_NEEDFIN;
+ thflags &= ~TH_SYN;
+
+ TCP_LOG_CONNECTION_SUMMARY(tp);
+ } else {
+ DTRACE_TCP4(state__change, void, NULL,
+ struct inpcb *, inp, struct tcpcb *,
+ tp, int32_t, TCPS_ESTABLISHED);
+ tp->t_state = TCPS_ESTABLISHED;
+ tp->t_timer[TCPT_KEEP] =
+ OFFSET_FROM_START(tp,
+ TCP_CONN_KEEPIDLE(tp));
+ if (nstat_collect) {
+ nstat_route_connect_success(
+ inp->inp_route.ro_rt);
+ }
+ /*
+ * The SYN is acknowledged but una is not
+ * updated yet. So pass the value of
+ * ack to compute sndbytes correctly
+ */
+ inp_count_sndbytes(inp, th->th_ack);
+ }
+ tp->t_forced_acks = TCP_FORCED_ACKS_COUNT;
+#if MPTCP
+ /*
+ * Do not send the connect notification for additional
+ * subflows until ACK for 3-way handshake arrives.
+ */
+ if ((!(tp->t_mpflags & TMPF_MPTCP_TRUE)) &&
+ (tp->t_mpflags & TMPF_SENT_JOIN)) {
+ isconnected = FALSE;
+ } else
+#endif /* MPTCP */
+ isconnected = TRUE;
+
+ if ((tp->t_tfo_flags & (TFO_F_COOKIE_REQ | TFO_F_COOKIE_SENT)) ||
+ (tp->t_tfo_stats & TFO_S_SYN_DATA_SENT)) {
+ tcp_tfo_synack(tp, &to);
+
+ if ((tp->t_tfo_stats & TFO_S_SYN_DATA_SENT) &&
+ SEQ_LT(tp->snd_una, th->th_ack)) {
+ tp->t_tfo_stats |= TFO_S_SYN_DATA_ACKED;
+ tcpstat.tcps_tfo_syn_data_acked++;
+#if MPTCP
+ if (so->so_flags & SOF_MP_SUBFLOW) {
+ so->so_flags1 |= SOF1_TFO_REWIND;
+ }
+#endif
+ tcp_tfo_rcv_probe(tp, tlen);
+ }
+ }
+ } else {
+ /*
+ * Received initial SYN in SYN-SENT[*] state => simul-
+ * taneous open.
+ * Do 3-way handshake:
+ * SYN-SENT -> SYN-RECEIVED
+ * SYN-SENT* -> SYN-RECEIVED*
+ */
+ tp->t_flags |= TF_ACKNOW;
+ tp->t_timer[TCPT_REXMT] = 0;
+ DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
+ struct tcpcb *, tp, int32_t, TCPS_SYN_RECEIVED);
+ tp->t_state = TCPS_SYN_RECEIVED;
+
+ /*
+ * During simultaneous open, TFO should not be used.
+ * So, we disable it here, to prevent data from being
+ * sent on the SYN/ACK.
+ */
+ tcp_disable_tfo(tp);
+ }
+
+trimthenstep6:
+ /*
+ * Advance th->th_seq to correspond to first data byte.
+ * If data, trim to stay within window,
+ * dropping FIN if necessary.
+ */
+ th->th_seq++;
+ if (tlen > tp->rcv_wnd) {
+ todrop = tlen - tp->rcv_wnd;
+ m_adj(m, -todrop);
+ tlen = tp->rcv_wnd;
+ thflags &= ~TH_FIN;
+ tcpstat.tcps_rcvpackafterwin++;
+ tcpstat.tcps_rcvbyteafterwin += todrop;
+ }
+ tp->snd_wl1 = th->th_seq - 1;
+ tp->rcv_up = th->th_seq;
+ /*
+ * Client side of transaction: already sent SYN and data.
+ * If the remote host used T/TCP to validate the SYN,
+ * our data will be ACK'd; if so, enter normal data segment
+ * processing in the middle of step 5, ack processing.
+ * Otherwise, goto step 6.
+ */
+ if (thflags & TH_ACK) {
+ goto process_ACK;
+ }
+ goto step6;
+ /*
+ * If the state is LAST_ACK or CLOSING or TIME_WAIT:
+ * do normal processing.
+ *
+ * NB: Leftover from RFC1644 T/TCP. Cases to be reused later.
+ */
+ case TCPS_LAST_ACK:
+ case TCPS_CLOSING:
+ case TCPS_TIME_WAIT:
+ break; /* continue normal processing */
+
+ /* Received a SYN while connection is already established.
+ * This is a "half open connection and other anomalies" described
+ * in RFC793 page 34, send an ACK so the remote end resets the connection
+ * or recovers by adjusting its sequence numbering. Sending an ACK is
+ * in accordance with RFC 5961 Section 4.2
+ */
+ case TCPS_ESTABLISHED:
+ if (thflags & TH_SYN && tlen <= 0) {
+ /* Drop the packet silently if we have reached the limit */
+ if (tcp_is_ack_ratelimited(tp)) {
+ TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "ESTABLISHED rfc5961 rate limited");
+ goto drop;
+ } else {
+ /* Send challenge ACK */
+ tcpstat.tcps_synchallenge++;
+ TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "ESTABLISHED rfc5961 challenge ACK");
+ goto dropafterack;
+ }
+ }
+ break;
+ }
+
+ /*
+ * States other than LISTEN or SYN_SENT.
+ * First check the RST flag and sequence number since reset segments
+ * are exempt from the timestamp and connection count tests. This
+ * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
+ * below which allowed reset segments in half the sequence space
+ * to fall through and be processed (which gives forged reset
+ * segments with a random sequence number a 50 percent chance of
+ * killing a connection).
+ * Then check timestamp, if present.
+ * Then check the connection count, if present.
+ * Then check that at least some bytes of segment are within
+ * receive window. If segment begins before rcv_nxt,
+ * drop leading data (and SYN); if nothing left, just ack.
+ *
+ *
+ * If the RST bit is set, check the sequence number to see
+ * if this is a valid reset segment.
+ * RFC 793 page 37:
+ * In all states except SYN-SENT, all reset (RST) segments
+ * are validated by checking their SEQ-fields. A reset is
+ * valid if its sequence number is in the window.
+ * Note: this does not take into account delayed ACKs, so
+ * we should test against last_ack_sent instead of rcv_nxt.
+ * The sequence number in the reset segment is normally an
+ * echo of our outgoing acknowledgement numbers, but some hosts
+ * send a reset with the sequence number at the rightmost edge
+ * of our receive window, and we have to handle this case.
+ * Note 2: Paul Watson's paper "Slipping in the Window" has shown
+ * that brute force RST attacks are possible. To combat this,
+ * we use a much stricter check while in the ESTABLISHED state,
+ * only accepting RSTs where the sequence number is equal to
+ * last_ack_sent. In all other states (the states in which a
+ * RST is more likely), the more permissive check is used.
+ * RFC 5961 Section 3.2: if the RST bit is set, sequence # is
+ * within the receive window and last_ack_sent == seq,
+ * then reset the connection. Otherwise if the seq doesn't
+ * match last_ack_sent, TCP must send challenge ACK. Perform
+ * rate limitation when sending the challenge ACK.
+ * If we have multiple segments in flight, the initial reset
+ * segment sequence numbers will be to the left of last_ack_sent,
+ * but they will eventually catch up.
+ * In any case, it never made sense to trim reset segments to
+ * fit the receive window since RFC 1122 says:
+ * 4.2.2.12 RST Segment: RFC-793 Section 3.4
+ *
+ * A TCP SHOULD allow a received RST segment to include data.
+ *
+ * DISCUSSION
+ * It has been suggested that a RST segment could contain
+ * ASCII text that encoded and explained the cause of the
+ * RST. No standard has yet been established for such
+ * data.
+ *
+ * If the reset segment passes the sequence number test examine
+ * the state:
+ * SYN_RECEIVED STATE:
+ * If passive open, return to LISTEN state.
+ * If active open, inform user that connection was refused.
+ * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES:
+ * Inform user that connection was reset, and close tcb.
+ * CLOSING, LAST_ACK STATES:
+ * Close the tcb.
+ * TIME_WAIT STATE:
+ * Drop the segment - see Stevens, vol. 2, p. 964 and
+ * RFC 1337.
+ *
+ * Radar 4803931: Allows for the case where we ACKed the FIN but
+ * there is already a RST in flight from the peer.
+ * In that case, accept the RST for non-established
+ * state if it's one off from last_ack_sent.
+ *
+ */
+ if (thflags & TH_RST) {
+ if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
+ SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
+ (tp->rcv_wnd == 0 &&
+ ((tp->last_ack_sent == th->th_seq) ||
+ ((tp->last_ack_sent - 1) == th->th_seq)))) {
+ if (tp->last_ack_sent == th->th_seq) {
+ switch (tp->t_state) {
+ case TCPS_SYN_RECEIVED:
+ IF_TCP_STATINC(ifp, rstinsynrcv);
+ so->so_error = ECONNREFUSED;
+ goto close;
+
+ case TCPS_ESTABLISHED:
+ if (TCP_ECN_ENABLED(tp) &&
+ tp->snd_una == tp->iss + 1 &&
+ SEQ_GT(tp->snd_max, tp->snd_una)) {
+ /*
+ * If the first data packet on an
+ * ECN connection, receives a RST
+ * increment the heuristic
+ */
+ tcp_heuristic_ecn_droprst(tp);
+ }
+ OS_FALLTHROUGH;
+ case TCPS_FIN_WAIT_1:
+ case TCPS_CLOSE_WAIT:
+ case TCPS_FIN_WAIT_2:
+ so->so_error = ECONNRESET;
+close:
+ soevent(so,
+ (SO_FILT_HINT_LOCKED |
+ SO_FILT_HINT_CONNRESET));
+
+ tcpstat.tcps_drops++;
+ tp = tcp_close(tp);
+ break;
+
+ case TCPS_CLOSING:
+ case TCPS_LAST_ACK:
+ tp = tcp_close(tp);
+ break;
+
+ case TCPS_TIME_WAIT:
+ break;
+ }
+ } else {
+ tcpstat.tcps_badrst++;
+ /* Drop if we have reached the ACK limit */
+ if (tcp_is_ack_ratelimited(tp)) {
+ TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "ESTABLISHED rfc5961 rate limited");
+ goto drop;
+ } else {
+ /* Send challenge ACK */
+ tcpstat.tcps_rstchallenge++;
+ TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "ESTABLISHED rfc5961 challenge ACK");
+ goto dropafterack;
+ }
+ }
+ }
+ goto drop;