2 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
4 * @APPLE_LICENSE_HEADER_START@
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
20 * @APPLE_LICENSE_HEADER_END@
23 * Copyright (c) 1982, 1986, 1988, 1993
24 * The Regents of the University of California. All rights reserved.
26 * Redistribution and use in source and binary forms, with or without
27 * modification, are permitted provided that the following conditions
29 * 1. Redistributions of source code must retain the above copyright
30 * notice, this list of conditions and the following disclaimer.
31 * 2. Redistributions in binary form must reproduce the above copyright
32 * notice, this list of conditions and the following disclaimer in the
33 * documentation and/or other materials provided with the distribution.
34 * 3. All advertising materials mentioning features or use of this software
35 * must display the following acknowledgement:
36 * This product includes software developed by the University of
37 * California, Berkeley and its contributors.
38 * 4. Neither the name of the University nor the names of its contributors
39 * may be used to endorse or promote products derived from this software
40 * without specific prior written permission.
42 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
54 * $FreeBSD: src/sys/netinet/ip_divert.c,v 1.98 2004/08/17 22:05:54 andre Exp $
58 #error "IPDIVERT requires INET."
61 #include <sys/param.h>
62 #include <sys/kernel.h>
63 #include <sys/malloc.h>
65 #include <sys/socket.h>
66 #include <sys/domain.h>
67 #include <sys/protosw.h>
68 #include <sys/socketvar.h>
69 #include <sys/sysctl.h>
70 #include <sys/systm.h>
75 #include <net/route.h>
77 #include <netinet/in.h>
78 #include <netinet/in_systm.h>
79 #include <netinet/ip.h>
80 #include <netinet/in_pcb.h>
81 #include <netinet/in_var.h>
82 #include <netinet/ip_var.h>
83 #include <netinet/ip_fw.h>
84 #include <netinet/ip_divert.h>
86 #include <kern/zalloc.h>
93 * Allocate enough space to hold a full IP packet
95 #define DIVSNDQ (65536 + 100)
96 #define DIVRCVQ (65536 + 100)
99 * Divert sockets work in conjunction with ipfw, see the divert(4)
100 * manpage for features.
101 * Internally, packets selected by ipfw in ip_input() or ip_output(),
102 * and never diverted before, are passed to the input queue of the
103 * divert socket with a given 'divert_port' number (as specified in
104 * the matching ipfw rule), and they are tagged with a 16 bit cookie
105 * (representing the rule number of the matching ipfw rule), which
106 * is passed to the process reading from the socket.
108 * Packets written to the divert socket are again tagged with a cookie
109 * (usually the same as above) and a destination address.
110 * If the destination address is INADDR_ANY then the packet is
111 * treated as outgoing and sent to ip_output(), otherwise it is
112 * treated as incoming and sent to ip_input().
113 * In both cases, the packet is tagged with the cookie.
115 * On reinjection, processing in ip_input() and ip_output()
116 * will be exactly the same as for the original packet, except that
117 * ipfw processing will start at the rule number after the one
118 * written in the cookie (so, tagging a packet with a cookie of 0
119 * will cause it to be effectively considered as a standard packet).
122 /* Internal variables */
/* Head of the list of divert PCBs; divert_packet() walks it to find the
 * socket bound to the target port. */
123 static struct inpcbhead divcb
;
/* PCB bookkeeping for divert sockets: list head, one-entry hash tables,
 * allocation zone and the lock group/attributes/rw-lock set up at init. */
124 static struct inpcbinfo divcbinfo
;
/* Send buffer size handed to soreserve() when a divert socket is attached. */
126 static u_long div_sendspace
= DIVSNDQ
; /* XXX sysctl ? */
/* Receive buffer size handed to soreserve() when a divert socket is attached. */
127 static u_long div_recvspace
= DIVRCVQ
; /* XXX sysctl ? */
129 /* Optimization: have this preinitialized */
/* Source address appended to every diverted packet. The size and AF_INET
 * are preset here; divert_packet() fills in sin_port (matching rule),
 * sin_addr and sin_zero (interface name) per packet. */
130 static struct sockaddr_in divsrc
= { sizeof(divsrc
), AF_INET
, };
132 /* Internal functions */
133 static int div_output(struct socket
*so
,
134 struct mbuf
*m
, struct sockaddr
*addr
, struct mbuf
*control
);
136 extern int load_ipfw(void);
138 * Initialize divert connection block queue.
143 struct inpcbinfo
*pcbinfo
;
145 divcbinfo
.listhead
= &divcb
;
147 * XXX We don't use the hash list for divert IP, but it's easier
148 * to allocate a one entry hash list than it is to check all
149 * over the place for hashbase == NULL.
151 divcbinfo
.hashbase
= hashinit(1, M_PCB
, &divcbinfo
.hashmask
);
152 divcbinfo
.porthashbase
= hashinit(1, M_PCB
, &divcbinfo
.porthashmask
);
153 divcbinfo
.ipi_zone
= (void *) zinit(sizeof(struct inpcb
),(maxsockets
* sizeof(struct inpcb
)),
155 pcbinfo
= &divcbinfo
;
157 * allocate lock group attribute and group for divert pcb mutexes
159 pcbinfo
->mtx_grp_attr
= lck_grp_attr_alloc_init();
161 pcbinfo
->mtx_grp
= lck_grp_alloc_init("divcb", pcbinfo
->mtx_grp_attr
);
164 * allocate the lock attribute for divert pcb mutexes
166 pcbinfo
->mtx_attr
= lck_attr_alloc_init();
167 lck_attr_setdefault(pcbinfo
->mtx_attr
);
169 if ((pcbinfo
->mtx
= lck_rw_alloc_init(pcbinfo
->mtx_grp
, pcbinfo
->mtx_attr
)) == NULL
)
170 return; /* pretty much dead if this fails... */
178 * IPPROTO_DIVERT is not a real IP protocol; don't allow any packets
179 * with that protocol number to enter the system from the outside.
182 div_input(struct mbuf
*m
, __unused
int off
)
184 ipstat
.ips_noproto
++;
189 * Divert a packet by passing it up to the divert socket at port 'port'.
191 * Setup generic address and protocol structures for div_input routine,
192 * then pass them along with mbuf chain.
193 * ###LOCK called in ip_mutex from ip_output/ip_input
196 divert_packet(struct mbuf
*m
, int incoming
, int port
, int rule
)
204 KASSERT(port
!= 0, ("%s: port=0", __FUNCTION__
));
206 divsrc
.sin_port
= rule
; /* record matching rule */
209 if (m
->m_len
< sizeof(struct ip
) &&
210 (m
= m_pullup(m
, sizeof(struct ip
))) == 0) {
213 ip
= mtod(m
, struct ip
*);
216 * Record receive interface address, if any.
217 * But only for incoming packets.
219 divsrc
.sin_addr
.s_addr
= 0;
224 KASSERT((m
->m_flags
& M_PKTHDR
), ("%s: !PKTHDR", __FUNCTION__
));
226 /* Find IP address for receive interface */
227 ifnet_lock_shared(m
->m_pkthdr
.rcvif
);
228 TAILQ_FOREACH(ifa
, &m
->m_pkthdr
.rcvif
->if_addrhead
, ifa_link
) {
229 if (ifa
->ifa_addr
== NULL
)
231 if (ifa
->ifa_addr
->sa_family
!= AF_INET
)
234 ((struct sockaddr_in
*) ifa
->ifa_addr
)->sin_addr
;
237 ifnet_lock_done(m
->m_pkthdr
.rcvif
);
240 * Record the incoming interface name whenever we have one.
242 bzero(&divsrc
.sin_zero
, sizeof(divsrc
.sin_zero
));
243 if (m
->m_pkthdr
.rcvif
) {
245 * Hide the actual interface name in there in the
246 * sin_zero array. XXX This needs to be moved to a
247 * different sockaddr type for divert, e.g.
248 * sockaddr_div with multiple fields like
249 * sockaddr_dl. Presently we have only 7 bytes
250 * but that will do for now as most interfaces
251 * are 4 or less + 2 or less bytes for unit.
252 * There is probably a faster way of doing this,
253 * possibly taking it from the sockaddr_dl on the iface.
254 * This solves the problem of a P2P link and a LAN interface
255 * having the same address, which can result in the wrong
256 * interface being assigned to the packet when fed back
257 * into the divert socket. Theoretically if the daemon saves
258 * and re-uses the sockaddr_in as suggested in the man pages,
259 * this iface name will come along for the ride.
260 * (see div_output for the other half of this.)
262 snprintf(divsrc
.sin_zero
, sizeof(divsrc
.sin_zero
),
263 "%s%d", m
->m_pkthdr
.rcvif
->if_name
,
264 m
->m_pkthdr
.rcvif
->if_unit
);
267 /* Put packet on socket queue, if any */
269 nport
= htons((u_int16_t
)port
);
270 lck_rw_lock_shared(divcbinfo
.mtx
);
271 LIST_FOREACH(inp
, &divcb
, inp_list
) {
272 if (inp
->inp_lport
== nport
)
273 sa
= inp
->inp_socket
;
279 if (sbappendaddr(&sa
->so_rcv
, (struct sockaddr
*)&divsrc
,
280 m
, (struct mbuf
*)0, &error
) != 0)
282 socket_unlock(sa
, 1);
285 ipstat
.ips_noproto
++;
286 ipstat
.ips_delivered
--;
288 lck_rw_done(divcbinfo
.mtx
);
292 * Deliver packet back into the IP processing machinery.
294 * If no address specified, or address is 0.0.0.0, send to ip_output();
295 * otherwise, send to ip_input() and mark as having been received on
296 * the interface with that address.
297 * ###LOCK called in inet_proto mutex when from div_send.
300 div_output(so
, m
, addr
, control
)
302 register struct mbuf
*m
;
303 struct sockaddr
*addr
;
304 struct mbuf
*control
;
306 register struct inpcb
*const inp
= sotoinpcb(so
);
307 register struct ip
*const ip
= mtod(m
, struct ip
*);
308 struct sockaddr_in
*sin
= (struct sockaddr_in
*)addr
;
312 m_freem(control
); /* XXX */
314 /* Loopback avoidance and state recovery */
317 struct divert_tag
*dt
;
319 char *c
= sin
->sin_zero
;
321 mtag
= m_tag_alloc(KERNEL_MODULE_TAG_ID
, KERNEL_TAG_TYPE_DIVERT
,
322 sizeof(struct divert_tag
), M_NOWAIT
);
327 dt
= (struct divert_tag
*)(mtag
+1);
329 dt
->cookie
= sin
->sin_port
;
330 m_tag_prepend(m
, mtag
);
333 * Find receive interface with the given name or IP address.
334 * The name is user supplied data so don't trust its size or
335 * that it is zero terminated. The name has priority.
336 * We are presently assuming that the sockaddr_in
337 * has not been replaced by a sockaddr_div, so we limit it
338 * to 16 bytes in total. The name is stuffed (if it exists)
339 * in the sin_zero[] field.
341 while (*c
++ && (len
++ < sizeof(sin
->sin_zero
)));
342 if ((len
> 0) && (len
< sizeof(sin
->sin_zero
)))
343 m
->m_pkthdr
.rcvif
= ifunit(sin
->sin_zero
);
346 /* Reinject packet into the system as incoming or outgoing */
347 if (!sin
|| sin
->sin_addr
.s_addr
== 0) {
349 * Don't allow both user specified and setsockopt options,
350 * and don't allow packet length sizes that will crash
352 if (((ip
->ip_hl
!= (sizeof (*ip
) >> 2)) && inp
->inp_options
) ||
353 ((u_short
)ntohs(ip
->ip_len
) > m
->m_pkthdr
.len
)) {
358 /* Convert fields to host order for ip_output() */
362 /* Send packet to output processing */
363 ipstat
.ips_rawout
++; /* XXX */
364 socket_unlock(so
, 0);
366 inp
->inp_options
, &inp
->inp_route
,
367 (so
->so_options
& SO_DONTROUTE
) |
368 IP_ALLOWBROADCAST
| IP_RAWOUTPUT
,
374 /* If no luck with the name above, check by IP address. */
375 if (m
->m_pkthdr
.rcvif
== NULL
) {
377 * Make sure there are no distractions
378 * for ifa_ifwithaddr. Clear the port and the ifname.
379 * Maybe zap all 8 bytes at once using a 64bit write?
381 bzero(sin
->sin_zero
, sizeof(sin
->sin_zero
));
382 /* *((u_int64_t *)sin->sin_zero) = 0; */ /* XXX ?? */
384 if (!(ifa
= ifa_ifwithaddr((struct sockaddr
*) sin
))) {
385 error
= EADDRNOTAVAIL
;
388 m
->m_pkthdr
.rcvif
= ifa
->ifa_ifp
;
392 if ((~IF_HWASSIST_CSUM_FLAGS(m
->m_pkthdr
.rcvif
->if_hwassist
) &
393 m
->m_pkthdr
.csum_flags
) == 0) {
394 if (m
->m_pkthdr
.csum_flags
& CSUM_DELAY_DATA
) {
395 m
->m_pkthdr
.csum_flags
&= ~CSUM_DELAY_DATA
;
397 m
->m_pkthdr
.csum_flags
|=
398 CSUM_DATA_VALID
| CSUM_PSEUDO_HDR
|
399 CSUM_IP_CHECKED
| CSUM_IP_VALID
;
400 m
->m_pkthdr
.csum_data
= 0xffff;
402 else if (m
->m_pkthdr
.csum_flags
& CSUM_DELAY_DATA
) {
406 hlen
= IP_VHL_HL(ip
->ip_vhl
) << 2;
408 hlen
= ip
->ip_hl
<< 2;
411 m
->m_pkthdr
.csum_flags
&= ~CSUM_DELAY_DATA
;
412 ip
->ip_sum
= in_cksum(m
, hlen
);
415 /* Send packet to input processing */
416 proto_inject(PF_INET
, m
);
427 div_attach(struct socket
*so
, int proto
, struct proc
*p
)
436 if (p
&& (error
= proc_suser(p
)) != 0)
439 error
= soreserve(so
, div_sendspace
, div_recvspace
);
442 error
= in_pcballoc(so
, &divcbinfo
, p
);
445 inp
= (struct inpcb
*)so
->so_pcb
;
446 inp
->inp_ip_p
= proto
;
447 inp
->inp_vflag
|= INP_IPV4
;
448 inp
->inp_flags
|= INP_HDRINCL
;
449 /* The socket is always "connected" because
450 we always know "where" to send the packet */
451 so
->so_state
|= SS_ISCONNECTED
;
453 #ifdef MORE_DICVLOCK_DEBUG
454 printf("div_attach: so=%x sopcb=%x lock=%x ref=%x\n",
455 so
, so
->so_pcb
, ((struct inpcb
*)so
->so_pcb
)->inpcb_mtx
, so
->so_usecount
);
461 div_detach(struct socket
*so
)
465 #ifdef MORE_DICVLOCK_DEBUG
466 printf("div_detach: so=%x sopcb=%x lock=%x ref=%x\n",
467 so
, so
->so_pcb
, ((struct inpcb
*)so
->so_pcb
)->inpcb_mtx
, so
->so_usecount
);
471 panic("div_detach: so=%x null inp\n", so
);
473 inp
->inp_state
= INPCB_STATE_DEAD
;
478 div_abort(struct socket
*so
)
480 soisdisconnected(so
);
481 return div_detach(so
);
485 div_disconnect(struct socket
*so
)
487 if ((so
->so_state
& SS_ISCONNECTED
) == 0)
489 return div_abort(so
);
493 div_bind(struct socket
*so
, struct sockaddr
*nam
, struct proc
*p
)
499 /* in_pcbbind assumes that the address is a sockaddr_in
500 * and in_pcbbind requires a valid address. Since divert
501 * sockets don't, we need to make sure the address is
502 * filled in properly.
503 * XXX -- divert should not be abusing in_pcbbind
504 * and should probably have its own family.
506 if (nam
->sa_family
!= AF_INET
) {
507 error
= EAFNOSUPPORT
;
509 ((struct sockaddr_in
*)nam
)->sin_addr
.s_addr
= INADDR_ANY
;
510 error
= in_pcbbind(inp
, nam
, p
);
516 div_shutdown(struct socket
*so
)
523 div_send(struct socket
*so
, __unused
int flags
, struct mbuf
*m
, struct sockaddr
*nam
,
524 struct mbuf
*control
, __unused
struct proc
*p
)
526 /* Packet must have a header (but that's about it) */
527 if (m
->m_len
< sizeof (struct ip
) &&
528 (m
= m_pullup(m
, sizeof (struct ip
))) == 0) {
529 ipstat
.ips_toosmall
++;
535 return div_output(so
, m
, nam
, control
);
539 div_pcblist SYSCTL_HANDLER_ARGS
542 struct inpcb
*inp
, **inp_list
;
547 * The process of preparing the PCB list is too time-consuming and
548 * resource-intensive to repeat twice on every request.
550 lck_rw_lock_exclusive(divcbinfo
.mtx
);
551 if (req
->oldptr
== USER_ADDR_NULL
) {
552 n
= divcbinfo
.ipi_count
;
553 req
->oldidx
= 2 * (sizeof xig
)
554 + (n
+ n
/8) * sizeof(struct xinpcb
);
555 lck_rw_done(divcbinfo
.mtx
);
559 if (req
->newptr
!= USER_ADDR_NULL
) {
560 lck_rw_done(divcbinfo
.mtx
);
565 * OK, now we're committed to doing something.
567 gencnt
= divcbinfo
.ipi_gencnt
;
568 n
= divcbinfo
.ipi_count
;
570 xig
.xig_len
= sizeof xig
;
572 xig
.xig_gen
= gencnt
;
573 xig
.xig_sogen
= so_gencnt
;
574 error
= SYSCTL_OUT(req
, &xig
, sizeof xig
);
576 lck_rw_done(divcbinfo
.mtx
);
580 inp_list
= _MALLOC(n
* sizeof *inp_list
, M_TEMP
, M_WAITOK
);
582 lck_rw_done(divcbinfo
.mtx
);
586 for (inp
= LIST_FIRST(divcbinfo
.listhead
), i
= 0; inp
&& i
< n
;
587 inp
= LIST_NEXT(inp
, inp_list
)) {
589 if (inp
->inp_gencnt
<= gencnt
&& inp
->inp_state
!= INPCB_STATE_DEAD
)
591 if (inp
->inp_gencnt
<= gencnt
&& !prison_xinpcb(req
->p
, inp
))
598 for (i
= 0; i
< n
; i
++) {
600 if (inp
->inp_gencnt
<= gencnt
&& inp
->inp_state
!= INPCB_STATE_DEAD
) {
602 xi
.xi_len
= sizeof xi
;
603 /* XXX should avoid extra copy */
604 inpcb_to_compat(inp
, &xi
.xi_inp
);
606 sotoxsocket(inp
->inp_socket
, &xi
.xi_socket
);
607 error
= SYSCTL_OUT(req
, &xi
, sizeof xi
);
612 * Give the user an updated idea of our state.
613 * If the generation differs from what we told
614 * her before, she knows that something happened
615 * while we were processing this request, and it
616 * might be necessary to retry.
618 xig
.xig_gen
= divcbinfo
.ipi_gencnt
;
619 xig
.xig_sogen
= so_gencnt
;
620 xig
.xig_count
= divcbinfo
.ipi_count
;
621 error
= SYSCTL_OUT(req
, &xig
, sizeof xig
);
623 FREE(inp_list
, M_TEMP
);
624 lck_rw_done(divcbinfo
.mtx
);
628 __private_extern__
int
629 div_lock(struct socket
*so
, int refcount
, int lr
)
634 __asm__
volatile("mflr %0" : "=r" (lr_saved
));
639 #ifdef MORE_DICVLOCK_DEBUG
640 printf("div_lock: so=%x sopcb=%x lock=%x ref=%x lr=%x\n",
643 so
->so_pcb
? ((struct inpcb
*)so
->so_pcb
)->inpcb_mtx
: 0,
648 lck_mtx_lock(((struct inpcb
*)so
->so_pcb
)->inpcb_mtx
);
650 panic("div_lock: so=%x NO PCB! lr=%x\n", so
, lr_saved
);
651 lck_mtx_lock(so
->so_proto
->pr_domain
->dom_mtx
);
654 if (so
->so_usecount
< 0)
655 panic("div_lock: so=%x so_pcb=%x lr=%x ref=%x\n",
656 so
, so
->so_pcb
, lr_saved
, so
->so_usecount
);
660 so
->reserved3
= (void *)lr_saved
;
665 __private_extern__
int
666 div_unlock(struct socket
*so
, int refcount
, int lr
)
669 lck_mtx_t
* mutex_held
;
670 struct inpcb
*inp
= sotoinpcb(so
);
673 __asm__
volatile("mflr %0" : "=r" (lr_saved
));
678 #ifdef MORE_DICVLOCK_DEBUG
679 printf("div_unlock: so=%x sopcb=%x lock=%x ref=%x lr=%x\n",
682 so
->so_pcb
? ((struct inpcb
*)so
->so_pcb
)->inpcb_mtx
: 0,
689 if (so
->so_usecount
< 0)
690 panic("div_unlock: so=%x usecount=%x\n", so
, so
->so_usecount
);
691 if (so
->so_pcb
== NULL
) {
692 panic("div_unlock: so=%x NO PCB usecount=%x lr=%x\n", so
, so
->so_usecount
, lr_saved
);
693 mutex_held
= so
->so_proto
->pr_domain
->dom_mtx
;
695 mutex_held
= ((struct inpcb
*)so
->so_pcb
)->inpcb_mtx
;
698 if (so
->so_usecount
== 0 && (inp
->inp_wantcnt
== WNT_STOPUSING
)) {
699 lck_rw_lock_exclusive(divcbinfo
.mtx
);
701 lck_rw_done(divcbinfo
.mtx
);
704 lck_mtx_assert(mutex_held
, LCK_MTX_ASSERT_OWNED
);
705 lck_mtx_unlock(mutex_held
);
706 so
->reserved4
= (void *)lr_saved
;
710 __private_extern__ lck_mtx_t
*
711 div_getlock(struct socket
*so
, __unused
int locktype
)
713 struct inpcb
*inpcb
= (struct inpcb
*)so
->so_pcb
;
716 if (so
->so_usecount
< 0)
717 panic("div_getlock: so=%x usecount=%x\n", so
, so
->so_usecount
);
718 return(inpcb
->inpcb_mtx
);
720 panic("div_getlock: so=%x NULL so_pcb\n", so
);
721 return (so
->so_proto
->pr_domain
->dom_mtx
);
726 struct pr_usrreqs div_usrreqs
= {
727 div_abort
, pru_accept_notsupp
, div_attach
, div_bind
,
728 pru_connect_notsupp
, pru_connect2_notsupp
, in_control
, div_detach
,
729 div_disconnect
, pru_listen_notsupp
, in_setpeeraddr
, pru_rcvd_notsupp
,
730 pru_rcvoob_notsupp
, div_send
, pru_sense_null
, div_shutdown
,
731 in_setsockaddr
, sosend
, soreceive
, pru_sopoll_notsupp